voidaccess 1.4.7__tar.gz → 1.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (210) hide show
  1. {voidaccess-1.4.7/voidaccess.egg-info → voidaccess-1.6.0}/PKG-INFO +108 -7
  2. {voidaccess-1.4.7 → voidaccess-1.6.0}/README.md +107 -6
  3. {voidaccess-1.4.7 → voidaccess-1.6.0}/api/main.py +155 -11
  4. voidaccess-1.6.0/api/routes/actors.py +735 -0
  5. {voidaccess-1.4.7 → voidaccess-1.6.0}/api/routes/admin.py +124 -1
  6. {voidaccess-1.4.7 → voidaccess-1.6.0}/api/routes/export.py +235 -6
  7. {voidaccess-1.4.7 → voidaccess-1.6.0}/api/routes/investigations.py +1176 -271
  8. {voidaccess-1.4.7 → voidaccess-1.6.0}/api/routes/settings.py +34 -0
  9. {voidaccess-1.4.7 → voidaccess-1.6.0}/config.py +23 -2
  10. voidaccess-1.6.0/db/migrations/versions/0021_add_search_engine_stats.py +34 -0
  11. voidaccess-1.6.0/db/migrations/versions/0022_add_actor_profiles.py +158 -0
  12. voidaccess-1.6.0/db/migrations/versions/0023_add_investigation_metadata.py +60 -0
  13. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/models.py +183 -0
  14. voidaccess-1.6.0/db/search_engine_stats.py +315 -0
  15. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/session.py +23 -4
  16. voidaccess-1.6.0/export/__init__.py +58 -0
  17. voidaccess-1.6.0/export/ioc_package.py +1055 -0
  18. voidaccess-1.6.0/export/snort_export.py +551 -0
  19. voidaccess-1.6.0/export/yara_export.py +664 -0
  20. {voidaccess-1.4.7 → voidaccess-1.6.0}/extractor/llm_extract.py +50 -38
  21. voidaccess-1.6.0/extractor/normalizer.py +1245 -0
  22. {voidaccess-1.4.7 → voidaccess-1.6.0}/extractor/pipeline.py +200 -6
  23. voidaccess-1.6.0/extractor/regex_patterns.py +2326 -0
  24. {voidaccess-1.4.7 → voidaccess-1.6.0}/fingerprint/profiler.py +26 -0
  25. {voidaccess-1.4.7 → voidaccess-1.6.0}/graph/builder.py +262 -22
  26. {voidaccess-1.4.7 → voidaccess-1.6.0}/graph/model.py +34 -0
  27. {voidaccess-1.4.7 → voidaccess-1.6.0}/monitor/jobs.py +9 -15
  28. {voidaccess-1.4.7 → voidaccess-1.6.0}/pyproject.toml +1 -1
  29. {voidaccess-1.4.7 → voidaccess-1.6.0}/scraper/scrape.py +153 -2
  30. {voidaccess-1.4.7 → voidaccess-1.6.0}/search/__init__.py +118 -24
  31. voidaccess-1.6.0/search/circuit_breaker.py +79 -0
  32. voidaccess-1.6.0/search/query_builder.py +48 -0
  33. {voidaccess-1.4.7 → voidaccess-1.6.0}/search/search.py +119 -26
  34. voidaccess-1.6.0/sources/actor_profiles.py +1684 -0
  35. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/dns_enrichment.py +98 -3
  36. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/domain_reputation.py +68 -4
  37. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/email_reputation.py +60 -3
  38. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/hash_reputation.py +110 -5
  39. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/ip_reputation.py +57 -3
  40. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/paste_scraper.py +23 -30
  41. voidaccess-1.6.0/sources/proxy_client.py +622 -0
  42. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/rss_scraper.py +72 -23
  43. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/seed_manager.py +203 -8
  44. voidaccess-1.6.0/tests/test_cli_proxy_config.py +697 -0
  45. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_db.py +2 -1
  46. voidaccess-1.6.0/tests/test_filter_e2e_manual.py +50 -0
  47. voidaccess-1.6.0/tests/test_filter_parser_manual.py +60 -0
  48. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_pagination.py +7 -6
  49. voidaccess-1.6.0/tests/test_paste_scraper.py +569 -0
  50. voidaccess-1.6.0/tests/test_proxy_client.py +1135 -0
  51. voidaccess-1.6.0/tests/test_regex_patterns.py +2341 -0
  52. voidaccess-1.6.0/tests/test_rss_scraper.py +675 -0
  53. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_settings.py +3 -4
  54. voidaccess-1.6.0/tests/test_snort_export.py +431 -0
  55. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_vector.py +3 -1
  56. voidaccess-1.6.0/tests/test_yara_export.py +353 -0
  57. {voidaccess-1.4.7 → voidaccess-1.6.0}/utils/content_safety.py +59 -2
  58. voidaccess-1.6.0/utils/enrichment_cache.py +676 -0
  59. {voidaccess-1.4.7 → voidaccess-1.6.0}/utils/user_keys.py +10 -7
  60. {voidaccess-1.4.7 → voidaccess-1.6.0}/vector/embedder.py +5 -1
  61. voidaccess-1.6.0/vector/model_singleton.py +87 -0
  62. {voidaccess-1.4.7 → voidaccess-1.6.0}/vector/store.py +1 -1
  63. voidaccess-1.6.0/voidaccess/config.py +14 -0
  64. {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess/llm.py +222 -33
  65. {voidaccess-1.4.7 → voidaccess-1.6.0/voidaccess.egg-info}/PKG-INFO +108 -7
  66. {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess.egg-info/SOURCES.txt +21 -0
  67. {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/__init__.py +1 -1
  68. voidaccess-1.6.0/voidaccess_cli/adapters/sqlite.py +860 -0
  69. {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/browser.py +203 -71
  70. voidaccess-1.6.0/voidaccess_cli/commands/actors.py +479 -0
  71. voidaccess-1.6.0/voidaccess_cli/commands/configure.py +408 -0
  72. voidaccess-1.6.0/voidaccess_cli/commands/export.py +352 -0
  73. {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/commands/investigate.py +420 -15
  74. {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/commands/show.py +98 -0
  75. {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/config.py +70 -0
  76. voidaccess-1.6.0/voidaccess_cli/main.py +488 -0
  77. voidaccess-1.4.7/export/__init__.py +0 -34
  78. voidaccess-1.4.7/extractor/normalizer.py +0 -638
  79. voidaccess-1.4.7/extractor/regex_patterns.py +0 -325
  80. voidaccess-1.4.7/search/circuit_breaker.py +0 -247
  81. voidaccess-1.4.7/tests/test_paste_scraper.py +0 -245
  82. voidaccess-1.4.7/tests/test_rss_scraper.py +0 -359
  83. voidaccess-1.4.7/vector/model_singleton.py +0 -49
  84. voidaccess-1.4.7/voidaccess_cli/adapters/sqlite.py +0 -329
  85. voidaccess-1.4.7/voidaccess_cli/commands/configure.py +0 -182
  86. voidaccess-1.4.7/voidaccess_cli/commands/export.py +0 -162
  87. voidaccess-1.4.7/voidaccess_cli/main.py +0 -191
  88. {voidaccess-1.4.7 → voidaccess-1.6.0}/LICENSE +0 -0
  89. {voidaccess-1.4.7 → voidaccess-1.6.0}/analysis/__init__.py +0 -0
  90. {voidaccess-1.4.7 → voidaccess-1.6.0}/analysis/opsec.py +0 -0
  91. {voidaccess-1.4.7 → voidaccess-1.6.0}/analysis/patterns.py +0 -0
  92. {voidaccess-1.4.7 → voidaccess-1.6.0}/analysis/temporal.py +0 -0
  93. {voidaccess-1.4.7 → voidaccess-1.6.0}/api/__init__.py +0 -0
  94. {voidaccess-1.4.7 → voidaccess-1.6.0}/api/auth.py +0 -0
  95. {voidaccess-1.4.7 → voidaccess-1.6.0}/api/routes/__init__.py +0 -0
  96. {voidaccess-1.4.7 → voidaccess-1.6.0}/api/routes/auth.py +0 -0
  97. {voidaccess-1.4.7 → voidaccess-1.6.0}/api/routes/entities.py +0 -0
  98. {voidaccess-1.4.7 → voidaccess-1.6.0}/api/routes/monitors.py +0 -0
  99. {voidaccess-1.4.7 → voidaccess-1.6.0}/api/routes/search.py +0 -0
  100. {voidaccess-1.4.7 → voidaccess-1.6.0}/auth/__init__.py +0 -0
  101. {voidaccess-1.4.7 → voidaccess-1.6.0}/auth/token_blacklist.py +0 -0
  102. {voidaccess-1.4.7 → voidaccess-1.6.0}/crawler/__init__.py +0 -0
  103. {voidaccess-1.4.7 → voidaccess-1.6.0}/crawler/dedup.py +0 -0
  104. {voidaccess-1.4.7 → voidaccess-1.6.0}/crawler/frontier.py +0 -0
  105. {voidaccess-1.4.7 → voidaccess-1.6.0}/crawler/spider.py +0 -0
  106. {voidaccess-1.4.7 → voidaccess-1.6.0}/crawler/utils.py +0 -0
  107. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/__init__.py +0 -0
  108. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/__init__.py +0 -0
  109. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/env.py +0 -0
  110. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0001_initial_schema.py +0 -0
  111. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0002_add_investigation_status_column.py +0 -0
  112. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0002_add_missing_tables.py +0 -0
  113. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0003_add_canonical_value_and_entity_links.py +0 -0
  114. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0004_add_page_posted_at.py +0 -0
  115. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0005_add_extraction_method.py +0 -0
  116. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0006_add_monitor_alerts.py +0 -0
  117. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0007_add_actor_style_profiles.py +0 -0
  118. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0008_add_users_table.py +0 -0
  119. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0009_add_investigation_id_to_relationships.py +0 -0
  120. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0010_add_composite_index_entity_relationships.py +0 -0
  121. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0011_add_page_extraction_cache.py +0 -0
  122. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0013_add_graph_status.py +0 -0
  123. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0015_add_progress_fields.py +0 -0
  124. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0016_backfill_graph_status.py +0 -0
  125. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0017_add_user_api_keys.py +0 -0
  126. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0018_add_user_id_to_investigations.py +0 -0
  127. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0019_add_content_safety_log.py +0 -0
  128. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/migrations/versions/0020_add_entity_source_tracking.py +0 -0
  129. {voidaccess-1.4.7 → voidaccess-1.6.0}/db/queries.py +0 -0
  130. {voidaccess-1.4.7 → voidaccess-1.6.0}/export/misp.py +0 -0
  131. {voidaccess-1.4.7 → voidaccess-1.6.0}/export/sigma.py +0 -0
  132. {voidaccess-1.4.7 → voidaccess-1.6.0}/export/stix.py +0 -0
  133. {voidaccess-1.4.7 → voidaccess-1.6.0}/extractor/__init__.py +0 -0
  134. {voidaccess-1.4.7 → voidaccess-1.6.0}/extractor/ner.py +0 -0
  135. {voidaccess-1.4.7 → voidaccess-1.6.0}/fingerprint/__init__.py +0 -0
  136. {voidaccess-1.4.7 → voidaccess-1.6.0}/fingerprint/stylometry.py +0 -0
  137. {voidaccess-1.4.7 → voidaccess-1.6.0}/graph/__init__.py +0 -0
  138. {voidaccess-1.4.7 → voidaccess-1.6.0}/graph/export.py +0 -0
  139. {voidaccess-1.4.7 → voidaccess-1.6.0}/graph/queries.py +0 -0
  140. {voidaccess-1.4.7 → voidaccess-1.6.0}/graph/visualize.py +0 -0
  141. {voidaccess-1.4.7 → voidaccess-1.6.0}/i18n/__init__.py +0 -0
  142. {voidaccess-1.4.7 → voidaccess-1.6.0}/i18n/detect.py +0 -0
  143. {voidaccess-1.4.7 → voidaccess-1.6.0}/i18n/query_expand.py +0 -0
  144. {voidaccess-1.4.7 → voidaccess-1.6.0}/i18n/translate.py +0 -0
  145. {voidaccess-1.4.7 → voidaccess-1.6.0}/monitor/__init__.py +0 -0
  146. {voidaccess-1.4.7 → voidaccess-1.6.0}/monitor/_db.py +0 -0
  147. {voidaccess-1.4.7 → voidaccess-1.6.0}/monitor/alerts.py +0 -0
  148. {voidaccess-1.4.7 → voidaccess-1.6.0}/monitor/config.py +0 -0
  149. {voidaccess-1.4.7 → voidaccess-1.6.0}/monitor/diff.py +0 -0
  150. {voidaccess-1.4.7 → voidaccess-1.6.0}/monitor/scheduler.py +0 -0
  151. {voidaccess-1.4.7 → voidaccess-1.6.0}/scraper/__init__.py +0 -0
  152. {voidaccess-1.4.7 → voidaccess-1.6.0}/scraper/scrape_js.py +0 -0
  153. {voidaccess-1.4.7 → voidaccess-1.6.0}/setup.cfg +0 -0
  154. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/__init__.py +0 -0
  155. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/blockchain.py +0 -0
  156. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/cache.py +0 -0
  157. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/cisa.py +0 -0
  158. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/engines.py +0 -0
  159. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/enrichment.py +0 -0
  160. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/github_scraper.py +0 -0
  161. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/gitlab_scraper.py +0 -0
  162. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/historical_intel.py +0 -0
  163. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/pastes.py +0 -0
  164. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/seeds.py +0 -0
  165. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/shodan.py +0 -0
  166. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/telegram.py +0 -0
  167. {voidaccess-1.4.7 → voidaccess-1.6.0}/sources/virustotal.py +0 -0
  168. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_analysis_opsec.py +0 -0
  169. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_analysis_stylometry.py +0 -0
  170. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_analysis_temporal.py +0 -0
  171. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_api.py +0 -0
  172. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_api_monitors.py +0 -0
  173. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_blockchain.py +0 -0
  174. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_config.py +0 -0
  175. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_crawler.py +0 -0
  176. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_dns_enrichment.py +0 -0
  177. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_domain_reputation.py +0 -0
  178. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_email_reputation.py +0 -0
  179. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_fingerprint.py +0 -0
  180. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_github_scraper.py +0 -0
  181. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_gitlab_scraper.py +0 -0
  182. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_graph.py +0 -0
  183. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_hash_reputation.py +0 -0
  184. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_i18n.py +0 -0
  185. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_ip_reputation.py +0 -0
  186. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_llm.py +0 -0
  187. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_llm_utils.py +0 -0
  188. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_model_singleton.py +0 -0
  189. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_monitor.py +0 -0
  190. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_scrape_js.py +0 -0
  191. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_sources.py +0 -0
  192. {voidaccess-1.4.7 → voidaccess-1.6.0}/tests/test_sources_enrichment_new.py +0 -0
  193. {voidaccess-1.4.7 → voidaccess-1.6.0}/utils/__init__.py +0 -0
  194. {voidaccess-1.4.7 → voidaccess-1.6.0}/utils/async_utils.py +0 -0
  195. {voidaccess-1.4.7 → voidaccess-1.6.0}/utils/defang.py +0 -0
  196. {voidaccess-1.4.7 → voidaccess-1.6.0}/utils/encryption.py +0 -0
  197. {voidaccess-1.4.7 → voidaccess-1.6.0}/utils/ioc_freshness.py +0 -0
  198. {voidaccess-1.4.7 → voidaccess-1.6.0}/vector/__init__.py +0 -0
  199. {voidaccess-1.4.7 → voidaccess-1.6.0}/vector/search.py +0 -0
  200. {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess/__init__.py +0 -0
  201. {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess/llm_utils.py +0 -0
  202. {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess.egg-info/dependency_links.txt +0 -0
  203. {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess.egg-info/entry_points.txt +0 -0
  204. {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess.egg-info/requires.txt +0 -0
  205. {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess.egg-info/top_level.txt +0 -0
  206. {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/adapters/__init__.py +0 -0
  207. {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/commands/__init__.py +0 -0
  208. {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/commands/enrich.py +0 -0
  209. {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/display.py +0 -0
  210. {voidaccess-1.4.7 → voidaccess-1.6.0}/voidaccess_cli/tor_detect.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: voidaccess
3
- Version: 1.4.7
3
+ Version: 1.6.0
4
4
  Summary: Dark web OSINT CLI — automated threat intelligence from query to report
5
5
  Author: VoidAccess
6
6
  License-Expression: MIT
@@ -65,6 +65,35 @@ Commercial threat intelligence platforms often charge prohibitive annual fees fo
65
65
 
66
66
  ---
67
67
 
68
+ ## What's New in v1.6.0
69
+
70
+ - **Optional clearnet scraping proxy (ScrapingAnt)** — paste sites and RSS feeds can now be routed through ScrapingAnt. Affects clearnet scraping only; Tor, `.onion`, GitHub, and GitLab traffic are never affected.
71
+ - **Two mutually exclusive transports** — pick one, not both:
72
+ - **REST API transport** — `VOIDACCESS_USE_PROXIES=true` (legacy v1.5.0 toggle) routes requests through ScrapingAnt's Web Scraping API.
73
+ - **Proxy Mode transport** — `VOIDACCESS_USE_PROXY=true` routes requests through ScrapingAnt's HTTP CONNECT endpoint at `proxy.scrapingant.com:8080`.
74
+ - Per [ScrapingAnt docs](https://docs.scrapingant.com/proxy-mode): "The proxy mode is a light front-end for the scraping API and has all the same functionality and performance as sending requests to the API endpoint" — so the two are alternate transports to the same backend, never chained.
75
+ - **`SCRAPINGANT_PROXY_TYPE`** — `residential` (default) or `datacenter`; per docs this is passed as a `proxy_type=` parameter in the Proxy Mode username string (which is built at connection time as `"scrapingant&browser=false&proxy_type=..."`). NOT a separate hostname.
76
+ - **Single credential** — `SCRAPINGANT_API_KEY` is the only real credential; the Proxy Mode username is a literal constant per docs. No per-customer username field, no second key.
77
+ - **New CLI surfaces** — `voidaccess configure proxy` now prompts for key + type in one uninterrupted block, plus `--enable-proxy / --disable-proxy` for non-interactive Proxy Mode toggling and `--show` for masked state inspection.
78
+ - **55 → 74 proxy-config tests** — covers both transports, the `apply_env()` independent-toggle guarantee, the single-transport selection logic, and the masked `--show` output.
79
+
80
+ ---
81
+
82
+ ## What's New in v1.5.0
83
+
84
+ - 37 new entity types across crypto, credentials, messaging, and network/forensic indicators.
85
+ - YARA, Snort, Suricata, and IOC package ZIP exports.
86
+ - Persistent actor profiles with aliases, infrastructure, notes, and timelines.
87
+ - Cross-alias resolution using shared infrastructure, shared PGP, string similarity, temporal co-activity, and co-investigation.
88
+ - Backend graph community detection and path-between-nodes queries.
89
+ - CLI graph browser path finder and frontend Find Path highlighting.
90
+ - Per-phase pipeline timeouts for enrichment, graph, summary, finalize, and parallel sources.
91
+ - `sources_used` and `infrastructure_clusters` persist in investigation metadata.
92
+ - Cross-run enrichment cache with Redis, SQLite, and memory backends.
93
+ - Auto-discovery and weekly validation of `.onion` seeds.
94
+
95
+ ---
96
+
68
97
  ## Quick Start
69
98
 
70
99
  ### Option A - CLI (no Docker, 30 seconds)
@@ -102,11 +131,28 @@ The Docker stack includes PostgreSQL, Tor, FastAPI, and Next.js.
102
131
  |---|---|
103
132
  | `voidaccess investigate` | Run an investigation |
104
133
  | `voidaccess show` | Interactive entity browser |
105
- | `voidaccess export` | Export STIX/MISP/Sigma/CSV/MD |
134
+ | `voidaccess export` | Export STIX/MISP/Sigma/YARA/Snort/Suricata/package/CSV/MD/JSON |
135
+ | `voidaccess package <file>` | Export an IOC ZIP bundle |
106
136
  | `voidaccess enrich` | Re-enrich saved results |
107
137
  | `voidaccess list` | List saved investigations |
108
- | `voidaccess status` | Config and API key status |
138
+ | `voidaccess status` | Config, API key, cache, engine, and seed status |
139
+ | `voidaccess actors` | List persistent actor profiles |
140
+ | `voidaccess actor <handle>` | Show an actor profile with aliases, infrastructure, notes, and history |
141
+ | `voidaccess actor <handle> --timeline` | Show an actor activity timeline |
142
+ | `voidaccess actor <handle> --note "text"` | Append an analyst note to an actor profile |
143
+ | `voidaccess timeline <handle>` | Shortcut for `voidaccess actor <handle> --timeline` |
109
144
  | `voidaccess configure` | Setup wizard |
145
+ | `voidaccess configure proxy` | ScrapingAnt key, pool type, and transport toggles. Flags: `--enable / --disable` (REST API transport), `--enable-proxy / --disable-proxy` (Proxy Mode transport), `--show` (masked state) |
146
+
147
+ Export examples:
148
+
149
+ ```bash
150
+ voidaccess package investigation.json
151
+ voidaccess export investigation.json --format yara
152
+ voidaccess export investigation.json --format snort
153
+ voidaccess export investigation.json --format suricata
154
+ voidaccess status --seeds
155
+ ```
110
156
 
111
157
  ### CLI vs Docker
112
158
 
@@ -167,14 +213,17 @@ VoidAccess handles the complexity of dark web research through a rigorous sequen
167
213
 
168
214
  ## What It Extracts
169
215
 
170
- The extraction pipeline identifies these entity types:
216
+ The extraction pipeline identifies 55+ entity types:
171
217
 
172
218
  | Category | Examples |
173
219
  |---|---|
174
- | **Cryptocurrency** | Bitcoin, Ethereum, Monero wallet addresses |
220
+ | **Cryptocurrency** | Bitcoin, Ethereum, Monero, Litecoin, Zcash, Dogecoin, XRP, Solana, Tron, Bitcoin Cash, Dash, ENS |
175
221
  | **Network Indicators** | IPv4 addresses, .onion URLs, domains, email addresses, PGP keys |
176
222
  | **File Indicators** | MD5, SHA1, SHA256 hashes |
177
- | **Vulnerabilities** | CVE numbers, MITRE ATT&CK techniques |
223
+ | **Credentials** | AWS keys, GitHub tokens, Slack tokens, Discord tokens, JWTs, Google API keys, Stripe keys, generic API keys, stealer log entries |
224
+ | **Messaging Handles** | Telegram, Discord, XMPP, Tox, Session, Matrix, Wire, ICQ, Wickr |
225
+ | **Network/Forensic** | IPv6, MAC addresses, IPFS CIDs, combo-list entries, YARA rules, MITRE tactics, Exploit-DB IDs, Nuclei templates, seed phrases |
226
+ | **Vulnerabilities** | CVE numbers, MITRE ATT&CK techniques and tactics |
178
227
  | **Threat Actors** | Actor handles, malware families, ransomware group names |
179
228
  | **Paste Sites** | Pastebin, Ghostbin, Rentry, and similar links |
180
229
  | **People/Orgs** | Named persons, organization names, locations |
@@ -214,7 +263,19 @@ Export formats:
214
263
  - **STIX 2.1** — bundles with indicators, threat actors, malware objects
215
264
  - **MISP JSON** — events with galaxies for direct import
216
265
  - **Sigma rules** — auto-generated detection rules from extracted IOCs
217
- - **CSV** flat entity dumps for spreadsheet analysis
266
+ - **YARA rules** - generated rules for malware, credentials, infrastructure, and IOC strings
267
+ - **Snort rules** - network detection rules for IPs, domains, URLs, and selected IOC content
268
+ - **Suricata rules** - Suricata-compatible network rules with the same IOC coverage as Snort
269
+ - **IOC package ZIP** - 21-file bundle containing text IOC lists, STIX, MISP, Sigma, YARA, Snort, Suricata, summary, and CSV
270
+ - **CSV** - flat entity dumps for spreadsheet analysis
271
+
272
+ ---
273
+
274
+ ## Actor Intelligence
275
+
276
+ Since v1.5.0, VoidAccess persists actor profiles across investigations in `actor_profiles`, with linked aliases and infrastructure in `actor_aliases` and `actor_infrastructure`. Profiles are populated from threat actor, ransomware group, and handle entities, then enriched with co-occurring infrastructure and timeline events.
277
+
278
+ Cross-alias resolution scores five signals: shared infrastructure, shared PGP, string similarity, temporal co-activity, and co-investigation. Use `voidaccess actors` to list profiles, `voidaccess actor <handle>` for the full profile, `voidaccess actor <handle> --timeline` for chronology, and `voidaccess actor <handle> --note "text"` for analyst notes.
218
279
 
219
280
  ---
220
281
 
@@ -253,6 +314,46 @@ All enrichment sources that require a key degrade gracefully when the key is abs
253
314
  | `GITLAB_TOKEN` | Raises GitLab scraping from 15 to 60 req/min | Free | [gitlab.com/profile/personal_access_tokens](https://gitlab.com/-/profile/personal_access_tokens) |
254
315
  | `BLOCKCYPHER_TOKEN` | BTC/ETH wallet balance and transaction graph | Yes | [blockcypher.com](https://www.blockcypher.com) |
255
316
  | `ETHERSCAN_API_KEY` | ETH wallet lookups | Yes | [etherscan.io/apis](https://etherscan.io/apis) |
317
+ | `SCRAPINGANT_API_KEY` + `VOIDACCESS_USE_PROXIES=true` (REST API transport) or `VOIDACCESS_USE_PROXY=true` (Proxy Mode transport) | Optional clearnet proxy for paste sites + RSS feeds (see below) | Yes (free tier) | [scrapingant.com](https://scrapingant.com/?ref=mzliyzh) |
318
+
319
+ ### Optional: Clearnet Scraping Proxy (ScrapingAnt)
320
+
321
+ When a third-party clearnet site rate-limits or blocks VoidAccess's outbound IP, every paste site fetch and every RSS feed fetch in the same investigation can fail. The optional **ScrapingAnt** integration routes those specific requests through ScrapingAnt — either its Web Scraping API or its Proxy Mode HTTP CONNECT endpoint at `proxy.scrapingant.com:8080`.
322
+
323
+ **What it covers** — paste sites (Pastebin, dpaste, paste.ee, Rentry) and the 20 curated RSS security feeds (Krebs on Security, BleepingComputer, Talos, Mandiant, CrowdStrike, Unit 42, CISA, and others). Nothing else.
324
+
325
+ **What it does not cover** — Tor traffic, dark web scraping, and `.onion` fetches are **completely unaffected** by this setting regardless of how it is configured. The proxy only sees the two clearnet sources named above.
326
+
327
+ **GitHub and GitLab scraping are also unaffected** — and intentionally so. Both of those scrapers carry authentication tokens (`GITHUB_TOKEN`, `GITLAB_TOKEN`) in their requests. Forwarding those tokens through a third-party proxy would expose them to that third party, which is unacceptable from a security standpoint. Both scrapers always go direct to the GitHub/GitLab API regardless of the proxy setting. This is a permanent design constraint, not something the proxy toggle can override.
328
+
329
+ **It's entirely optional.** VoidAccess behaves identically without it — paste sites and RSS feeds are simply fetched directly, exactly as they were in every prior release. Add the key only if you see upstream rate-limiting or blocks affecting those two sources.
330
+
331
+ #### Two mutually exclusive transports (v1.6.0)
332
+
333
+ Per the [ScrapingAnt docs](https://docs.scrapingant.com/proxy-mode): *"The proxy mode is a light front-end for the scraping API and has all the same functionality and performance as sending requests to the API endpoint."* Therefore the two transports below are **alternate transports to the same backend service** — pick ONE per request, never both:
334
+
335
+ | Transport | Env var | Required config | What it does |
336
+ |---|---|---|---|
337
+ | **REST API** | `VOIDACCESS_USE_PROXIES=true` | `SCRAPINGANT_API_KEY` | POSTs the target URL to `api.scrapingant.com/v2/general` and returns the response body. Legacy v1.5.0 toggle. |
338
+ | **Proxy Mode** | `VOIDACCESS_USE_PROXY=true` | `SCRAPINGANT_API_KEY` (only) | Routes the request as HTTP CONNECT through `proxy.scrapingant.com:8080` with username string built at connection time per docs: `scrapingant&browser=false&proxy_type=residential\|datacenter`. |
339
+
340
+ The Proxy Mode transport also reads `SCRAPINGANT_PROXY_TYPE` to pick the pool: `residential` (default; harder to detect, slightly higher latency) or `datacenter` (faster, cheaper, easier to fingerprint).
341
+
342
+ **Missing credentials leave both transports inactive.** Setting either transport env var to `true` without `SCRAPINGANT_API_KEY` is a no-op for that transport. No errors, no surprises.
343
+
344
+ **If both transport env vars are set, Proxy Mode wins** with a one-shot info log at runtime — there is no chained mode (Proxy Mode is documented as "the same functionality" as the REST API, so stacking them would double-charge without adding capability).
345
+
346
+ **How to turn it on** — all five surfaces, covering either transport:
347
+
348
+ | Surface | How |
349
+ |---|---|
350
+ | CLI configure wizard | `voidaccess configure` then `voidaccess configure keys` — paste sites and RSS feeds will be flagged with their honest "never Tor" description before any field is asked for. The interactive prompt covers the key, pool type, and asks about each transport separately. |
351
+ | `voidaccess configure proxy` (subcommand) | Interactive prompt for key + pool type. Non-interactive flags: `--enable / --disable` (REST API transport), `--enable-proxy / --disable-proxy` (Proxy Mode transport), `--show` (prints masked key `abcd…5678`, pool type, and both transport states). |
352
+ | `setup.sh` during Docker install | Group F in the Enrichment Keys step; prompts for key + pool type, asks about each transport toggle separately. |
353
+ | `--use-proxies` flag (single run) | `voidaccess investigate "query" --use-proxies` — sets `VOIDACCESS_USE_PROXIES=true` (REST API transport) for the current process only, leaves the on-disk config untouched. |
354
+ | Docker / web settings page | Settings → API Keys → ScrapingAnt. Stored encrypted at rest via the existing per-user `UserApiKey` mechanism (Fernet: AES-128-CBC with HMAC-SHA256 authentication). |
355
+
356
+ **Referral signup:** [https://scrapingant.com/?ref=mzliyzh](https://scrapingant.com/?ref=mzliyzh) (referral bonus applied on first paid plan; a free tier is available for low-volume use).
256
357
 
257
358
  ---
258
359
 
@@ -19,6 +19,35 @@ Commercial threat intelligence platforms often charge prohibitive annual fees fo
19
19
 
20
20
  ---
21
21
 
22
+ ## What's New in v1.6.0
23
+
24
+ - **Optional clearnet scraping proxy (ScrapingAnt)** — paste sites and RSS feeds can now be routed through ScrapingAnt. Affects clearnet scraping only; Tor, `.onion`, GitHub, and GitLab traffic are never affected.
25
+ - **Two mutually exclusive transports** — pick one, not both:
26
+ - **REST API transport** — `VOIDACCESS_USE_PROXIES=true` (legacy v1.5.0 toggle) routes requests through ScrapingAnt's Web Scraping API.
27
+ - **Proxy Mode transport** — `VOIDACCESS_USE_PROXY=true` routes requests through ScrapingAnt's HTTP CONNECT endpoint at `proxy.scrapingant.com:8080`.
28
+ - Per [ScrapingAnt docs](https://docs.scrapingant.com/proxy-mode): "The proxy mode is a light front-end for the scraping API and has all the same functionality and performance as sending requests to the API endpoint" — so the two are alternate transports to the same backend, never chained.
29
+ - **`SCRAPINGANT_PROXY_TYPE`** — `residential` (default) or `datacenter`; per docs this is passed as a `proxy_type=` parameter in the Proxy Mode username string (which is built at connection time as `"scrapingant&browser=false&proxy_type=..."`). NOT a separate hostname.
30
+ - **Single credential** — `SCRAPINGANT_API_KEY` is the only real credential; the Proxy Mode username is a literal constant per docs. No per-customer username field, no second key.
31
+ - **New CLI surfaces** — `voidaccess configure proxy` now prompts for key + type in one uninterrupted block, plus `--enable-proxy / --disable-proxy` for non-interactive Proxy Mode toggling and `--show` for masked state inspection.
32
+ - **55 → 74 proxy-config tests** — covers both transports, the `apply_env()` independent-toggle guarantee, the single-transport selection logic, and the masked `--show` output.
33
+
34
+ ---
35
+
36
+ ## What's New in v1.5.0
37
+
38
+ - 37 new entity types across crypto, credentials, messaging, and network/forensic indicators.
39
+ - YARA, Snort, Suricata, and IOC package ZIP exports.
40
+ - Persistent actor profiles with aliases, infrastructure, notes, and timelines.
41
+ - Cross-alias resolution using shared infrastructure, shared PGP, string similarity, temporal co-activity, and co-investigation.
42
+ - Backend graph community detection and path-between-nodes queries.
43
+ - CLI graph browser path finder and frontend Find Path highlighting.
44
+ - Per-phase pipeline timeouts for enrichment, graph, summary, finalize, and parallel sources.
45
+ - `sources_used` and `infrastructure_clusters` persist in investigation metadata.
46
+ - Cross-run enrichment cache with Redis, SQLite, and memory backends.
47
+ - Auto-discovery and weekly validation of `.onion` seeds.
48
+
49
+ ---
50
+
22
51
  ## Quick Start
23
52
 
24
53
  ### Option A - CLI (no Docker, 30 seconds)
@@ -56,11 +85,28 @@ The Docker stack includes PostgreSQL, Tor, FastAPI, and Next.js.
56
85
  |---|---|
57
86
  | `voidaccess investigate` | Run an investigation |
58
87
  | `voidaccess show` | Interactive entity browser |
59
- | `voidaccess export` | Export STIX/MISP/Sigma/CSV/MD |
88
+ | `voidaccess export` | Export STIX/MISP/Sigma/YARA/Snort/Suricata/package/CSV/MD/JSON |
89
+ | `voidaccess package <file>` | Export an IOC ZIP bundle |
60
90
  | `voidaccess enrich` | Re-enrich saved results |
61
91
  | `voidaccess list` | List saved investigations |
62
- | `voidaccess status` | Config and API key status |
92
+ | `voidaccess status` | Config, API key, cache, engine, and seed status |
93
+ | `voidaccess actors` | List persistent actor profiles |
94
+ | `voidaccess actor <handle>` | Show an actor profile with aliases, infrastructure, notes, and history |
95
+ | `voidaccess actor <handle> --timeline` | Show an actor activity timeline |
96
+ | `voidaccess actor <handle> --note "text"` | Append an analyst note to an actor profile |
97
+ | `voidaccess timeline <handle>` | Shortcut for `voidaccess actor <handle> --timeline` |
63
98
  | `voidaccess configure` | Setup wizard |
99
+ | `voidaccess configure proxy` | ScrapingAnt key, pool type, and transport toggles. Flags: `--enable / --disable` (REST API transport), `--enable-proxy / --disable-proxy` (Proxy Mode transport), `--show` (masked state) |
100
+
101
+ Export examples:
102
+
103
+ ```bash
104
+ voidaccess package investigation.json
105
+ voidaccess export investigation.json --format yara
106
+ voidaccess export investigation.json --format snort
107
+ voidaccess export investigation.json --format suricata
108
+ voidaccess status --seeds
109
+ ```
64
110
 
65
111
  ### CLI vs Docker
66
112
 
@@ -121,14 +167,17 @@ VoidAccess handles the complexity of dark web research through a rigorous sequen
121
167
 
122
168
  ## What It Extracts
123
169
 
124
- The extraction pipeline identifies these entity types:
170
+ The extraction pipeline identifies 55+ entity types:
125
171
 
126
172
  | Category | Examples |
127
173
  |---|---|
128
- | **Cryptocurrency** | Bitcoin, Ethereum, Monero wallet addresses |
174
+ | **Cryptocurrency** | Bitcoin, Ethereum, Monero, Litecoin, Zcash, Dogecoin, XRP, Solana, Tron, Bitcoin Cash, Dash, ENS |
129
175
  | **Network Indicators** | IPv4 addresses, .onion URLs, domains, email addresses, PGP keys |
130
176
  | **File Indicators** | MD5, SHA1, SHA256 hashes |
131
- | **Vulnerabilities** | CVE numbers, MITRE ATT&CK techniques |
177
+ | **Credentials** | AWS keys, GitHub tokens, Slack tokens, Discord tokens, JWTs, Google API keys, Stripe keys, generic API keys, stealer log entries |
178
+ | **Messaging Handles** | Telegram, Discord, XMPP, Tox, Session, Matrix, Wire, ICQ, Wickr |
179
+ | **Network/Forensic** | IPv6, MAC addresses, IPFS CIDs, combo-list entries, YARA rules, MITRE tactics, Exploit-DB IDs, Nuclei templates, seed phrases |
180
+ | **Vulnerabilities** | CVE numbers, MITRE ATT&CK techniques and tactics |
132
181
  | **Threat Actors** | Actor handles, malware families, ransomware group names |
133
182
  | **Paste Sites** | Pastebin, Ghostbin, Rentry, and similar links |
134
183
  | **People/Orgs** | Named persons, organization names, locations |
@@ -168,7 +217,19 @@ Export formats:
168
217
  - **STIX 2.1** — bundles with indicators, threat actors, malware objects
169
218
  - **MISP JSON** — events with galaxies for direct import
170
219
  - **Sigma rules** — auto-generated detection rules from extracted IOCs
171
- - **CSV** flat entity dumps for spreadsheet analysis
220
+ - **YARA rules** - generated rules for malware, credentials, infrastructure, and IOC strings
221
+ - **Snort rules** - network detection rules for IPs, domains, URLs, and selected IOC content
222
+ - **Suricata rules** - Suricata-compatible network rules with the same IOC coverage as Snort
223
+ - **IOC package ZIP** - 21-file bundle containing text IOC lists, STIX, MISP, Sigma, YARA, Snort, Suricata, summary, and CSV
224
+ - **CSV** - flat entity dumps for spreadsheet analysis
225
+
226
+ ---
227
+
228
+ ## Actor Intelligence
229
+
230
+ Since v1.5.0, VoidAccess persists actor profiles across investigations in `actor_profiles`, with linked aliases and infrastructure in `actor_aliases` and `actor_infrastructure`. Profiles are populated from threat actor, ransomware group, and handle entities, then enriched with co-occurring infrastructure and timeline events.
231
+
232
+ Cross-alias resolution scores five signals: shared infrastructure, shared PGP, string similarity, temporal co-activity, and co-investigation. Use `voidaccess actors` to list profiles, `voidaccess actor <handle>` for the full profile, `voidaccess actor <handle> --timeline` for chronology, and `voidaccess actor <handle> --note "text"` for analyst notes.
172
233
 
173
234
  ---
174
235
 
@@ -207,6 +268,46 @@ All enrichment sources that require a key degrade gracefully when the key is abs
207
268
  | `GITLAB_TOKEN` | Raises GitLab scraping from 15 to 60 req/min | Free | [gitlab.com/profile/personal_access_tokens](https://gitlab.com/-/profile/personal_access_tokens) |
208
269
  | `BLOCKCYPHER_TOKEN` | BTC/ETH wallet balance and transaction graph | Yes | [blockcypher.com](https://www.blockcypher.com) |
209
270
  | `ETHERSCAN_API_KEY` | ETH wallet lookups | Yes | [etherscan.io/apis](https://etherscan.io/apis) |
271
+ | `SCRAPINGANT_API_KEY` + `VOIDACCESS_USE_PROXIES=true` (REST API) or `VOIDACCESS_USE_PROXY=true` (Proxy Mode) | Optional clearnet proxy for paste sites + RSS feeds (see below) | Yes (free tier) | [scrapingant.com](https://scrapingant.com/?ref=mzliyzh) |
272
+
273
+ ### Optional: Clearnet Scraping Proxy (ScrapingAnt)
274
+
275
+ When a third-party clearnet site rate-limits or blocks VoidAccess's outbound IP, every paste site fetch and every RSS feed fetch in the same investigation can fail. The optional **ScrapingAnt** integration routes those specific requests through ScrapingAnt — either its Web Scraping API or its Proxy Mode HTTP CONNECT endpoint at `proxy.scrapingant.com:8080`.
276
+
277
+ **What it covers** — paste sites (Pastebin, dpaste, paste.ee, Rentry) and the 20 curated RSS security feeds (Krebs on Security, BleepingComputer, Talos, Mandiant, CrowdStrike, Unit 42, CISA, and others). Nothing else.
278
+
279
+ **What it does not cover** — Tor traffic, dark web scraping, and `.onion` fetches are **completely unaffected** by this setting regardless of how it is configured. The proxy only sees the two clearnet sources named above.
280
+
281
+ **GitHub and GitLab scraping are also unaffected** — and intentionally so. Both of those scrapers carry authentication tokens (`GITHUB_TOKEN`, `GITLAB_TOKEN`) in their requests. Forwarding those tokens through a third-party proxy would expose them to that third party, which is unacceptable from a security standpoint. Both scrapers always go direct to the GitHub/GitLab API regardless of the proxy setting. This is a permanent design constraint, not something the proxy toggle can override.
282
+
283
+ **It's entirely optional.** VoidAccess behaves identically without it — paste sites and RSS feeds are simply fetched directly, exactly as they were in every prior release. Add the key only if you see upstream rate-limiting or blocks affecting those two sources.
284
+
285
+ #### Two mutually exclusive transports (v1.6.0)
286
+
287
+ Per the [ScrapingAnt docs](https://docs.scrapingant.com/proxy-mode): *"The proxy mode is a light front-end for the scraping API and has all the same functionality and performance as sending requests to the API endpoint."* Therefore the two transports below are **alternate transports to the same backend service** — pick ONE per request, never both:
288
+
289
+ | Transport | Env var | Required config | What it does |
290
+ |---|---|---|---|
291
+ | **REST API** | `VOIDACCESS_USE_PROXIES=true` | `SCRAPINGANT_API_KEY` | POSTs the target URL to `api.scrapingant.com/v2/general` and returns the response body. Legacy v1.5.0 toggle. |
292
+ | **Proxy Mode** | `VOIDACCESS_USE_PROXY=true` | `SCRAPINGANT_API_KEY` (only) | Routes the request as HTTP CONNECT through `proxy.scrapingant.com:8080` with username string built at connection time per docs: `scrapingant&browser=false&proxy_type=residential\|datacenter`. |
293
+
294
+ The Proxy Mode transport also reads `SCRAPINGANT_PROXY_TYPE` to pick the pool: `residential` (default; harder to detect, slightly higher latency) or `datacenter` (faster, cheaper, easier to fingerprint).
295
+
296
+ **Missing credentials leave both transports inactive.** Setting either transport env var to `true` without `SCRAPINGANT_API_KEY` is a no-op for that transport. No errors, no surprises.
297
+
298
+ **If both transport env vars are set, Proxy Mode wins** with a one-shot info log at runtime — there is no chained mode (Proxy Mode is documented as "the same functionality" as the REST API, so stacking them would double-charge without adding capability).
299
+
300
+ **How to turn it on** — five surfaces, covering either transport:
301
+
302
+ | Surface | How |
303
+ |---|---|
304
+ | CLI configure wizard | `voidaccess configure` then `voidaccess configure keys` — paste sites and RSS feeds will be flagged with their honest "never Tor" description before any field is asked for. The interactive prompt covers the key, pool type, and asks about each transport separately. |
305
+ | `voidaccess configure proxy` (subcommand) | Interactive prompt for key + pool type. Non-interactive flags: `--enable / --disable` (REST API transport), `--enable-proxy / --disable-proxy` (Proxy Mode transport), `--show` (prints masked key `abcd…5678`, pool type, and both transport states). |
306
+ | `setup.sh` during Docker install | Group F in the Enrichment Keys step; prompts for key + pool type, asks about each transport toggle separately. |
307
+ | `--use-proxies` flag (single run) | `voidaccess investigate "query" --use-proxies` — sets `VOIDACCESS_USE_PROXIES=true` (REST API transport) for the current process only, leaves the on-disk config untouched. |
308
+ | Docker / web settings page | Settings → API Keys → ScrapingAnt. Stored encrypted at rest via the existing per-user `UserApiKey` mechanism (Fernet AES-128). |
309
+
310
+ **Referral signup:** [https://scrapingant.com/?ref=mzliyzh](https://scrapingant.com/?ref=mzliyzh) (referral bonus applied on first paid plan; a free tier is available for low-volume use).
210
311
 
211
312
  ---
212
313
 
@@ -14,7 +14,8 @@ import asyncio
14
14
  import logging
15
15
  import os
16
16
  from contextlib import asynccontextmanager
17
- from typing import Callable
17
+ from datetime import datetime, timedelta, timezone
18
+ from typing import Callable, Optional
18
19
 
19
20
  from fastapi import FastAPI, Depends, Request
20
21
  from fastapi.exceptions import RequestValidationError
@@ -24,7 +25,7 @@ from slowapi import Limiter
24
25
  from slowapi.errors import RateLimitExceeded
25
26
  from slowapi.util import get_remote_address
26
27
 
27
- from api.routes import entities, export, investigations, monitors, search, auth, admin, settings
28
+ from api.routes import entities, export, investigations, monitors, search, auth, admin, settings, actors
28
29
  from api.auth import get_current_user
29
30
  from monitor.scheduler import start_scheduler
30
31
 
@@ -118,18 +119,19 @@ async def lifespan(app: FastAPI):
118
119
  except Exception as e:
119
120
  logger.warning(f"Seed database load failed (non-fatal): {e}")
120
121
 
121
- # Recover stranded processing investigations
122
+ # Recover stranded processing investigations (Phase 6.3 startup sweep)
123
+ # On startup: every investigation left in 'processing' by a previous
124
+ # process is marked failed — the pipeline tasks that owned them are
125
+ # gone. A periodic sweep (every 5 min) handles investigations that
126
+ # get stuck while the server is alive.
122
127
  try:
123
128
  if os.getenv("DATABASE_URL"):
124
- from db.session import get_session
125
- from db.models import Investigation
126
- with get_session() as session:
127
- stranded_count = session.query(Investigation).filter(Investigation.status == "processing").update(
128
- {"status": "failed", "summary": "Investigation interrupted due to server restart."}
129
+ swept = await _sweep_stuck_investigations(cutoff_minutes=None)
130
+ if swept:
131
+ logger.warning(
132
+ "Recovered %d stranded investigations (marked as failed).",
133
+ swept,
129
134
  )
130
- if stranded_count > 0:
131
- session.commit()
132
- logger.warning(f"Recovered {stranded_count} stranded investigations (marked as failed).")
133
135
  except Exception as e:
134
136
  logger.warning(f"Failed to recover stranded investigations: {e}")
135
137
 
@@ -145,9 +147,31 @@ async def lifespan(app: FastAPI):
145
147
  logger.error(f"APScheduler failed to start: {e}")
146
148
  scheduler = None
147
149
 
150
+ # Start periodic stuck-investigation sweeper (Phase 6.3). Runs every
151
+ # 5 minutes and marks investigations stuck in 'processing' for more
152
+ # than INVESTIGATION_HARD_TIMEOUT_MINUTES as 'failed'. Cancelled on
153
+ # shutdown below.
154
+ _periodic_sweep_task: Optional[asyncio.Task] = None
155
+ if os.getenv("DATABASE_URL"):
156
+ try:
157
+ _periodic_sweep_task = asyncio.create_task(
158
+ _periodic_stuck_sweep(),
159
+ name="voidaccess-stuck-investigation-sweeper",
160
+ )
161
+ logger.info("Periodic stuck-investigation sweeper started (every 5 min).")
162
+ except Exception as e:
163
+ logger.warning(f"Failed to start periodic sweeper: {e}")
164
+
148
165
  yield
149
166
 
150
167
  # --- Shutdown ---
168
+ if _periodic_sweep_task is not None and not _periodic_sweep_task.done():
169
+ _periodic_sweep_task.cancel()
170
+ try:
171
+ await _periodic_sweep_task
172
+ except (asyncio.CancelledError, Exception):
173
+ pass
174
+
151
175
  if scheduler and scheduler.running:
152
176
  scheduler.shutdown(wait=False)
153
177
  logger.warning("APScheduler stopped")
@@ -170,6 +194,120 @@ async def lifespan(app: FastAPI):
170
194
  pass
171
195
 
172
196
 
197
+ # ---------------------------------------------------------------------------
198
+ # Stuck-investigation sweeper (Phase 6.3)
199
+ # ---------------------------------------------------------------------------
200
+ # FastAPI BackgroundTasks runs in the same process as the HTTP handler.
201
+ # If the worker crashes mid-investigation, the row stays at status='processing'
202
+ # forever. This sweeper marks them 'failed' on two schedules:
203
+ #
204
+ # 1. Startup — cutoff_minutes=None → every 'processing' row is swept
205
+ # (the prior process is gone, no legitimate owner).
206
+ # 2. Periodic — every 5 minutes, cutoff = INVESTIGATION_HARD_TIMEOUT_MINUTES
207
+ # (configurable via env). Defends against in-process hangs.
208
+ #
209
+ # The sweep only ever UPDATES status; it never deletes rows.
210
+
211
def _env_positive_int(name: str, default: int) -> int:
    """Read a strictly positive integer setting from the environment.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset, empty, not an
            integer, or not greater than zero.

    Returns:
        The parsed value, or ``default`` for any unusable input. Never raises,
        so a typo in the environment cannot crash the module import.
    """
    raw = (os.getenv(name) or "").strip()
    if not raw:
        return default
    try:
        value = int(raw)
    except ValueError:
        logging.getLogger(__name__).warning(
            "Ignoring non-integer %s=%r; using default %d", name, raw, default
        )
        return default
    if value <= 0:
        # Zero or negative would either spin the sweeper without sleeping or
        # make every in-flight investigation look "timed out".
        logging.getLogger(__name__).warning(
            "Ignoring non-positive %s=%r; using default %d", name, raw, default
        )
        return default
    return value


# Hard timeout after which an investigation is considered permanently stuck.
# Default 30 min — generous enough to cover the slowest legitimate run
# (parallel_sources 300s + enrichment 120s + graph 60s + summary 90s + finalize
# 30s ≈ 10 min on a healthy host; 30 min is 3x that to absorb transient
# network slowness without false positives).
INVESTIGATION_HARD_TIMEOUT_MINUTES = _env_positive_int(
    "VOIDACCESS_INVESTIGATION_HARD_TIMEOUT_MINUTES", 30
)
# Periodic sweep interval. 5 min is a good default — catches stuck rows
# quickly without flooding the DB.
SWEEP_INTERVAL_SECONDS = _env_positive_int(
    "VOIDACCESS_SWEEP_INTERVAL_SECONDS", 300
)
224
+
225
+
226
async def _sweep_stuck_investigations(cutoff_minutes: Optional[int] = 30) -> int:
    """Mark investigations stuck in 'processing' as 'failed'.

    Args:
        cutoff_minutes: Only sweep rows older than this many minutes.
            ``None`` → startup mode: sweep *all* processing rows (the prior
            process is gone, no legitimate owner remains).
            ``int`` → periodic mode: sweep only rows whose ``created_at`` is
            older than the cutoff.

    Returns:
        The number of rows actually flipped to 'failed'. Returns 0 when
        ``DATABASE_URL`` is unset, when no rows match, or when anything goes
        wrong (the error is logged) — never raises.
    """
    if not os.getenv("DATABASE_URL"):
        return 0
    try:
        from sqlalchemy import update

        from db.models import Investigation
        from db.session import get_session

        startup_mode = cutoff_minutes is None

        # NOTE(review): these are synchronous DB calls made from a coroutine,
        # so they run on the event loop thread for their duration.

        # Pass 1 (short-lived read session): collect candidate ids only; the
        # full rows are not needed to perform the sweep.
        with get_session() as session:
            query = session.query(Investigation.id).filter(
                Investigation.status == "processing"
            )
            if not startup_mode:
                # NOTE(review): compares created_at against an aware UTC
                # timestamp — confirm the column is stored as UTC.
                cutoff_dt = datetime.now(timezone.utc) - timedelta(
                    minutes=cutoff_minutes
                )
                query = query.filter(Investigation.created_at < cutoff_dt)
            candidate_ids = [row[0] for row in query.all()]

        if not candidate_ids:
            return 0

        sweep_reason = (
            "Server restarted mid-investigation"
            if startup_mode
            else f"Investigation timed out after {cutoff_minutes} min — "
            "server may have restarted or pipeline may be hung"
        )

        # Pass 2 (separate write session). The status guard is repeated here
        # because a candidate may have finished between the read and this
        # UPDATE; without it a completed investigation would be overwritten
        # as 'failed'.
        with get_session() as session:
            result = session.execute(
                update(Investigation)
                .where(Investigation.id.in_(candidate_ids))
                .where(Investigation.status == "processing")
                .values(
                    status="failed",
                    summary=sweep_reason,
                )
            )
            session.commit()
            affected = getattr(result, "rowcount", None)

        # Some drivers report -1/None for rowcount; fall back to the
        # candidate count in that case.
        swept = (
            affected
            if isinstance(affected, int) and affected >= 0
            else len(candidate_ids)
        )

        for inv_id in candidate_ids:
            logger.warning("Swept stuck investigation: %s", inv_id)
        if swept != len(candidate_ids):
            logger.info(
                "%d of %d candidate investigations left 'processing' before the sweep and were not modified",
                len(candidate_ids) - swept,
                len(candidate_ids),
            )
        logger.info("Swept %d stuck investigations (cutoff=%s)", swept, cutoff_minutes)
        return swept
    except Exception as exc:
        logger.warning("Swept-investigation sweep failed: %s", exc)
        return 0
287
+
288
+
289
async def _periodic_stuck_sweep() -> None:
    """Run the stuck-investigation sweep repeatedly until cancelled.

    Each cycle sleeps ``SWEEP_INTERVAL_SECONDS`` and then sweeps with
    ``INVESTIGATION_HARD_TIMEOUT_MINUTES`` as the cutoff. A failing pass is
    logged and the loop carries on. Cancellation is the only exit: it is
    caught here, logged, and the coroutine then returns normally.
    """

    async def _sweep_once() -> None:
        # One guarded pass: cancellation propagates to the outer loop,
        # any other error is logged and swallowed.
        try:
            await _sweep_stuck_investigations(
                cutoff_minutes=INVESTIGATION_HARD_TIMEOUT_MINUTES,
            )
        except asyncio.CancelledError:
            raise
        except Exception as sweep_error:
            logger.warning("Periodic stuck-investigation sweep iteration failed: %s", sweep_error)

    try:
        while True:
            await asyncio.sleep(SWEEP_INTERVAL_SECONDS)
            await _sweep_once()
    except asyncio.CancelledError:
        logger.info("Periodic stuck-investigation sweep cancelled.")
309
+
310
+
173
311
  # ---------------------------------------------------------------------------
174
312
  # App setup
175
313
  # ---------------------------------------------------------------------------
@@ -294,6 +432,12 @@ app.include_router(
294
432
  tags=["monitors"],
295
433
  dependencies=[Depends(get_current_user)],
296
434
  )
435
+ app.include_router(
436
+ actors.router,
437
+ prefix="/actors",
438
+ tags=["actors"],
439
+ dependencies=[Depends(get_current_user)],
440
+ )
297
441
  app.include_router(
298
442
  admin.router,
299
443
  prefix="/admin",