voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
graph/visualize.py ADDED
@@ -0,0 +1,178 @@
1
+ """
2
+ graph/visualize.py — Interactive graph visualisation using pyvis.
3
+
4
+ Converts a NetworkX MultiDiGraph to a pyvis Network and exports it as a
5
+ self-contained HTML file suitable for embedding in Streamlit via st.components.
6
+
7
+ If pyvis is not installed, all functions log a warning and return None / empty
8
+ string. They never raise on a missing dependency.
9
+
10
+ Public interface
11
+ ----------------
12
+ build_pyvis_network(graph, max_nodes, highlight_node_id) → Network | None
13
+ export_html(network, filepath) → None
14
+ get_html_string(network) → str
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import logging
20
+ from typing import TYPE_CHECKING, Optional
21
+
22
+ import networkx as nx
23
+
24
+ from graph.model import NODE_TYPES
25
+
26
+ if TYPE_CHECKING:
27
+ pass # pyvis.network.Network imported conditionally at runtime
28
+
29
+ logger = logging.getLogger(__name__)
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Node colour palette (hex strings)
33
+ # ---------------------------------------------------------------------------
34
+
35
# Node type -> fill colour. Entries are listed per hue so that node types
# sharing a colour sit next to each other.
_NODE_COLORS: dict[str, str] = {
    node_type: hex_code
    for hex_code, node_types in (
        ("#e74c3c", (NODE_TYPES.THREAT_ACTOR,)),  # red
        ("#f39c12", (NODE_TYPES.CRYPTO_WALLET,)),  # gold
        ("#9b59b6", (NODE_TYPES.MALWARE_FAMILY, NODE_TYPES.RANSOMWARE_GROUP)),  # purple
        ("#3498db", (NODE_TYPES.ONION_URL, NODE_TYPES.FORUM)),  # blue
        ("#e67e22", (NODE_TYPES.CVE,)),  # orange
        ("#2ecc71", (NODE_TYPES.EMAIL_ADDRESS, NODE_TYPES.PGP_KEY)),  # green
        ("#95a5a6", (NODE_TYPES.PASTE,)),  # grey
    )
    for node_type in node_types
}

# Fill colour for node types that have no entry in _NODE_COLORS (light grey).
_DEFAULT_COLOR = "#bdc3c7"
# Border colour applied to the highlighted node (yellow).
_HIGHLIGHT_BORDER = "#f1c40f"
50
+
51
+ # Edge width thresholds mapped to pyvis ``width`` values
52
+ _EDGE_WIDTH_THIN = 1.0 # confidence < 0.4
53
+ _EDGE_WIDTH_MEDIUM = 3.0 # 0.4 <= confidence < 0.7
54
+ _EDGE_WIDTH_THICK = 5.0 # confidence >= 0.7
55
+
56
+
57
+ def _confidence_to_width(confidence: float) -> float:
58
+ if confidence < 0.4:
59
+ return _EDGE_WIDTH_THIN
60
+ if confidence < 0.7:
61
+ return _EDGE_WIDTH_MEDIUM
62
+ return _EDGE_WIDTH_THICK
63
+
64
+
65
+ # ---------------------------------------------------------------------------
66
+ # Public functions
67
+ # ---------------------------------------------------------------------------
68
+
69
+
70
def build_pyvis_network(
    graph: nx.MultiDiGraph,
    max_nodes: int = 200,
    highlight_node_id: Optional[str] = None,
) -> "Optional[object]":
    """
    Build a pyvis Network from *graph*.

    When *graph* holds more than *max_nodes* nodes, only the *max_nodes*
    nodes with the highest degree (and the edges between them) are drawn.

    The node whose id equals *highlight_node_id* (if any) is drawn with a
    yellow border of width 3.

    Returns the pyvis Network, or None (after logging a warning) when pyvis
    cannot be imported.
    """
    try:
        from pyvis.network import Network  # noqa: PLC0415
    except ImportError:
        logger.warning(
            "pyvis is not installed — graph visualisation is unavailable. "
            "Install it with: pip install pyvis"
        )
        return None

    # Restrict the view to the best-connected nodes when the graph is too big.
    view = graph
    if graph.number_of_nodes() > max_nodes:
        ranked = sorted(graph.nodes(), key=graph.degree, reverse=True)
        view = graph.subgraph(ranked[:max_nodes])

    network = Network(height="750px", width="100%", directed=True, notebook=False)
    network.force_atlas_2based()

    for node_id, attrs in view.nodes(data=True):
        node_type = attrs.get("node_type", "")
        fill = _NODE_COLORS.get(node_type, _DEFAULT_COLOR)
        title = f"Type: {node_type}\nID: {node_id}"

        if node_id == highlight_node_id:
            # Highlighted node: keep its fill colour, add the yellow border.
            network.add_node(
                node_id,
                label=node_id,
                color={"background": fill, "border": _HIGHLIGHT_BORDER},
                title=title,
                borderWidth=3,
            )
        else:
            network.add_node(node_id, label=node_id, color=fill, title=title)

    for source, target, attrs in view.edges(data=True):
        confidence = attrs.get("confidence", 0.5)
        edge_type = attrs.get("edge_type", "")
        network.add_edge(
            source,
            target,
            title=f"{edge_type} (confidence={confidence:.2f})",
            width=_confidence_to_width(confidence),
        )

    return network
141
+
142
+
143
def export_html(network: "object", filepath: str) -> None:
    """
    Write the pyvis Network to *filepath* via its ``save_graph`` method.

    A None *network* (what build_pyvis_network returns without pyvis) is
    skipped with a warning. Any exception raised while saving is logged as a
    warning and not propagated.
    """
    if network is not None:
        try:
            network.save_graph(filepath)  # type: ignore[attr-defined]
        except Exception as exc:
            logger.warning("export_html failed: %s", exc)
        return

    logger.warning("export_html called with None network — skipping.")
156
+
157
+
158
def get_html_string(network: "object") -> str:
    """
    Render the pyvis Network to an HTML string.

    Intended for embedding in Streamlit via ``st.components.v1.html()``.
    ``generate_html()`` is tried first; if that raises AttributeError the
    older ``get_network_html()`` is used instead. Returns an empty string
    when *network* is None or when rendering fails (failure is logged as a
    warning).
    """
    if network is None:
        return ""

    try:
        try:
            html = network.generate_html()  # type: ignore[attr-defined]
        except AttributeError:
            # Older pyvis versions use get_network_html
            html = network.get_network_html()  # type: ignore[attr-defined]
    except Exception as exc:
        logger.warning("get_html_string failed: %s", exc)
        return ""
    return html
i18n/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ """
2
+ i18n — Multilingual intelligence: language detection, translation pipeline,
3
+ and query expansion for broader dark web coverage.
4
+
5
+ Public interface
6
+ ---------------
7
+ from i18n.detect import detect_language, detect_language_batch, is_non_english
8
+ from i18n.translate import translate_to_english, translate_batch
9
+ from i18n.query_expand import expand_query, get_multilingual_search_terms
10
+ """
11
+
12
+ from i18n.detect import detect_language, detect_language_batch, is_non_english
13
+ from i18n.query_expand import expand_query, get_multilingual_search_terms
14
+ from i18n.translate import translate_batch, translate_to_english
15
+
16
+ __all__ = [
17
+ "detect_language",
18
+ "detect_language_batch",
19
+ "is_non_english",
20
+ "translate_to_english",
21
+ "translate_batch",
22
+ "expand_query",
23
+ "get_multilingual_search_terms",
24
+ ]
i18n/detect.py ADDED
@@ -0,0 +1,76 @@
1
+ """
2
+ i18n/detect.py — Language detection for scraped content.
3
+
4
+ Uses the langdetect library for per-text language identification.
5
+ Returns langdetect language codes: mostly ISO 639-1 ("en", "ru", "ar", "es", "pt", etc.), but Chinese is reported as "zh-cn" / "zh-tw".
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+ from typing import Optional
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def detect_language(text: str) -> Optional[str]:
    """
    Return the language code langdetect assigns to *text*.

    Codes are mostly ISO 639-1 ("en", "ru", ...); langdetect reports Chinese
    as "zh-cn" / "zh-tw".

    Returns None for empty or very short text (<50 chars), undetectable
    text, or when langdetect is not installed. Never raises.
    """
    try:
        # Kept inside the try so that a non-string argument (len() failing)
        # yields None instead of an exception.
        if not text or len(text) < 50:
            return None

        # Imported lazily so the module works without the optional dependency.
        from langdetect import detect as ld_detect  # type: ignore

        return ld_detect(text)
    except ImportError:
        logger.debug("detect_language: langdetect not installed")
        return None
    except Exception:
        # langdetect raises on text it cannot classify; treat as "unknown".
        return None
39
+
40
+
41
def detect_language_batch(texts: list[str]) -> list[Optional[str]]:
    """
    Detect the language of every entry in *texts*.

    langdetect is imported once for the whole batch. The result has the same
    length as *texts*; entries that are empty, shorter than 50 characters, or
    undetectable map to None. If langdetect is missing, or anything else
    fails outside the per-text detection, every entry is None.
    """
    try:
        from langdetect import detect as ld_detect  # type: ignore

        detected: list[Optional[str]] = []
        for candidate in texts:
            code: Optional[str] = None
            if candidate and len(candidate) >= 50:
                try:
                    code = ld_detect(candidate)
                except Exception:
                    code = None
            detected.append(code)
        return detected

    except ImportError:
        logger.debug("detect_language_batch: langdetect not installed")
        return [None] * len(texts)
    except Exception:
        return [None] * len(texts)
65
+
66
+
67
def is_non_english(text: str) -> bool:
    """
    Tell whether *text* should be treated as non-English.

    True when detect_language reports anything other than "en", including
    the case where it returns None (detection failed or text too short).
    Used as a quick gate before running translation.
    """
    # None != "en" is True, so a failed detection counts as non-English.
    return detect_language(text) != "en"
i18n/query_expand.py ADDED
@@ -0,0 +1,72 @@
1
+ """
2
+ i18n/query_expand.py — Expands a search query into multiple languages for
3
+ broader dark web coverage.
4
+
5
+ Russian, Chinese, and Arabic dark web communities contain high-value
6
+ intelligence that is almost entirely missed by English-only tools.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import logging
12
+ from typing import Optional
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+ _DEFAULT_TARGET_LANGUAGES = ["ru", "zh", "ar", "es", "de", "fr", "pt"]
17
+
18
+
19
def expand_query(
    query: str,
    target_languages: Optional[list[str]] = None,
) -> dict[str, str]:
    """
    Translate *query* into several languages.

    The result always contains {"en": query}. Each other language is added
    only if its translation succeeded and differs from *query*, so the dict
    never holds None values.

    Args:
        query: Original English query
        target_languages: List of ISO 639-1 codes. If None, I18N_LANGUAGES
            from config.py is used when importable and non-empty; otherwise
            the default languages (ru, zh, ar, es, de, fr, pt) apply.
    """
    from i18n.translate import _translate_from_english

    languages = target_languages
    if languages is None:
        try:
            from config import I18N_LANGUAGES as configured
        except ImportError:
            configured = None
        # An empty/unset configuration falls through to the defaults below.
        languages = configured or None
    if not languages:
        languages = ["en", *_DEFAULT_TARGET_LANGUAGES]

    expanded: dict[str, str] = {"en": query}

    for code in (lang for lang in languages if lang != "en"):
        try:
            candidate = _translate_from_english(query, code)
            if candidate is None or candidate == query:
                continue
            expanded[code] = candidate
        except Exception as exc:
            logger.debug("expand_query: skipping lang=%s (%s)", code, exc)

    return expanded
60
+
61
+
62
def get_multilingual_search_terms(
    query: str,
    target_languages: Optional[list[str]] = None,
) -> list[str]:
    """
    Return a flat list of the distinct query translations.

    The original query comes first, followed by the translations in the
    order expand_query produced them. Identical strings returned for more
    than one language are listed once, so a caller fanning out searches
    does not issue the same search twice.
    """
    translations = expand_query(query, target_languages)
    # dict.fromkeys drops repeats while keeping first-seen order.
    return list(dict.fromkeys(translations.values()))
i18n/translate.py ADDED
@@ -0,0 +1,210 @@
1
+ """
2
+ i18n/translate.py — Translation pipeline.
3
+
4
+ Strategy (tried in order, falls back on failure):
5
+ 1. DeepL API if DEEPL_API_KEY is set
6
+ 2. Helsinki-NLP/opus-mt local model if transformers is available
7
+ 3. Returns None if both are unavailable
8
+
9
+ Text longer than 2000 chars is split into sentences, each translated,
10
+ then rejoined.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ import os
17
+ import re
18
+ from typing import Optional
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # Module-level cache: {(src_lang, tgt_lang): (tokenizer, model)}
23
+ _model_cache: dict[tuple[str, str], tuple] = {}
24
+
25
+ _SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")
26
+
27
+
28
+ def _split_sentences(text: str) -> list[str]:
29
+ parts = _SENTENCE_SPLIT_RE.split(text.strip())
30
+ return [p for p in parts if p.strip()]
31
+
32
+
33
+ def _deepl_translate(
34
+ text: str, target_lang: str, source_lang: Optional[str] = None
35
+ ) -> Optional[str]:
36
+ """Call the DeepL API to translate *text* to *target_lang*."""
37
+ api_key = os.getenv("DEEPL_API_KEY", "")
38
+ if not api_key:
39
+ return None
40
+
41
+ try:
42
+ import requests # type: ignore
43
+
44
+ params: dict = {
45
+ "auth_key": api_key,
46
+ "text": text,
47
+ "target_lang": target_lang.upper(),
48
+ }
49
+ if source_lang:
50
+ params["source_lang"] = source_lang.upper()
51
+
52
+ resp = requests.post(
53
+ "https://api-free.deepl.com/v2/translate",
54
+ data=params,
55
+ timeout=10,
56
+ )
57
+ resp.raise_for_status()
58
+ return resp.json()["translations"][0]["text"]
59
+ except Exception as exc:
60
+ logger.debug("DeepL translate failed: %s", exc)
61
+ return None
62
+
63
+
64
def _helsinki_translate(
    text: str, src_lang: str, tgt_lang: str = "en"
) -> Optional[str]:
    """
    Translate *text* using a Helsinki-NLP/opus-mt local model.

    The model for the language pair is loaded on first use and kept in the
    module-level ``_model_cache``. Region subtags are stripped from both
    language codes before the model name is built, so a detector result such
    as "zh-cn" selects ``opus-mt-zh-en`` rather than a non-existent
    ``opus-mt-zh-cn-en``.

    Input longer than the tokenizer's limit is truncated
    (``truncation=True``). Returns None on any failure, including
    transformers not being installed or the model being unavailable
    (logged at debug level).
    """
    try:
        from transformers import MarianMTModel, MarianTokenizer  # type: ignore

        src = src_lang.partition("-")[0]
        tgt = tgt_lang.partition("-")[0]

        cache_key = (src, tgt)
        if cache_key not in _model_cache:
            model_name = f"Helsinki-NLP/opus-mt-{src}-{tgt}"
            tokenizer = MarianTokenizer.from_pretrained(model_name)
            model = MarianMTModel.from_pretrained(model_name)
            _model_cache[cache_key] = (tokenizer, model)

        tokenizer, model = _model_cache[cache_key]
        inputs = tokenizer([text], return_tensors="pt", padding=True, truncation=True)
        translated = model.generate(**inputs)
        return tokenizer.decode(translated[0], skip_special_tokens=True)
    except Exception as exc:
        logger.debug("Helsinki-NLP translate failed: %s", exc)
        return None
85
+
86
+
87
def _translate_long_text(
    text: str,
    translate_fn,
    *args,
    **kwargs,
) -> Optional[str]:
    """
    Translate *text* sentence by sentence and join the results with spaces.

    *translate_fn* is called as ``translate_fn(sentence, *args, **kwargs)``.
    If the text yields no sentences it is passed to *translate_fn* whole.
    If any sentence translates to None, the whole result is None.
    """
    chunks = _split_sentences(text)
    if not chunks:
        return translate_fn(text, *args, **kwargs)

    rendered: list[str] = []
    for chunk in chunks:
        piece = translate_fn(chunk, *args, **kwargs)
        if piece is None:
            # One failed sentence invalidates the whole translation.
            return None
        rendered.append(piece)
    return " ".join(rendered)
105
+
106
+
107
def translate_to_english(
    text: str,
    source_lang: Optional[str] = None,
) -> Optional[str]:
    """
    Translate *text* to English.

    Strategies are tried in order: DeepL (only when DEEPL_API_KEY is set),
    then a local Helsinki-NLP model. Text longer than 2000 characters is
    translated sentence by sentence.

    source_lang: language code, or None to auto-detect. DeepL detects the
        language itself; for the local model the language is detected with
        i18n.detect.detect_language. If that detection reports English the
        text is returned unchanged.
    Returns None on complete failure. Never raises.
    """
    try:
        if not text:
            return None

        # Split long texts at sentence boundaries
        use_chunking = len(text) > 2000

        # Strategy 1: DeepL
        if os.getenv("DEEPL_API_KEY", ""):
            if use_chunking:
                result = _translate_long_text(
                    text, _deepl_translate, "EN", source_lang
                )
            else:
                result = _deepl_translate(text, "EN", source_lang)
            if result is not None:
                return result

        # The local model needs an explicit source language to pick a model,
        # so detect one when the caller did not supply it.
        local_src = source_lang
        if not local_src:
            try:
                from i18n.detect import detect_language  # noqa: PLC0415
            except ImportError:
                pass
            else:
                local_src = detect_language(text)
                if local_src == "en":
                    # Already English: nothing to translate.
                    return text

        # Strategy 2: Helsinki-NLP local model
        if local_src:
            if use_chunking:
                result = _translate_long_text(
                    text, _helsinki_translate, local_src, "en"
                )
            else:
                result = _helsinki_translate(text, local_src, "en")
            if result is not None:
                return result

        return None

    except Exception as exc:
        logger.debug("translate_to_english: unexpected error (%s)", exc)
        return None
153
+
154
+
155
def translate_batch(
    texts: list[str],
    source_lang: Optional[str] = None,
) -> list[Optional[str]]:
    """
    Translate every entry of *texts* to English.

    The result has the same length as *texts*. Empty entries and entries
    whose language is "en" are passed through unchanged. The language is
    *source_lang* when given, otherwise it is detected per text.
    """
    try:
        from i18n.detect import detect_language
    except ImportError:
        # Without the detector every text is treated as "language unknown".
        def detect_language(_text):
            return None

    def _to_english(text):
        if not text:
            return text
        lang = source_lang if source_lang else detect_language(text)
        return text if lang == "en" else translate_to_english(text, lang)

    return [_to_english(text) for text in texts]
180
+
181
+
182
+ def _translate_from_english(text: str, target_lang: str) -> Optional[str]:
183
+ """
184
+ Translate English text to *target_lang*.
185
+
186
+ Mirrors translate_to_english but in reverse direction.
187
+ Used by query_expand.py.
188
+ """
189
+ try:
190
+ if not text:
191
+ return None
192
+
193
+ api_key = os.getenv("DEEPL_API_KEY", "")
194
+
195
+ # Strategy 1: DeepL (EN → target)
196
+ if api_key:
197
+ result = _deepl_translate(text, target_lang, "EN")
198
+ if result is not None:
199
+ return result
200
+
201
+ # Strategy 2: Helsinki-NLP (en → target_lang)
202
+ result = _helsinki_translate(text, "en", target_lang)
203
+ if result is not None:
204
+ return result
205
+
206
+ return None
207
+
208
+ except Exception as exc:
209
+ logger.debug("_translate_from_english: error (%s)", exc)
210
+ return None
monitor/__init__.py ADDED
@@ -0,0 +1,27 @@
1
+ """
2
+ monitor — Phase 4 continuous monitoring, diffing, and alerts.
3
+ """
4
+
5
+ from monitor.alerts import dispatch_alerts
6
+ from monitor.config import load_watches
7
+ from monitor.diff import compute_diff, is_significant_change
8
+ from monitor.jobs import run_keyword_watch, run_url_watch
9
+ from monitor.scheduler import (
10
+ get_job_status,
11
+ start_scheduler,
12
+ stop_scheduler,
13
+ trigger_job_now,
14
+ )
15
+
16
+ __all__ = [
17
+ "load_watches",
18
+ "run_keyword_watch",
19
+ "run_url_watch",
20
+ "compute_diff",
21
+ "is_significant_change",
22
+ "dispatch_alerts",
23
+ "start_scheduler",
24
+ "stop_scheduler",
25
+ "get_job_status",
26
+ "trigger_job_now",
27
+ ]
monitor/_db.py ADDED
@@ -0,0 +1,74 @@
1
+ """
2
+ Database helpers for monitor jobs (URL watch state). Not imported by tests as pipeline mocks.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import logging
8
+ import os
9
+ from urllib.parse import urlparse
10
+
11
+ from db.models import SourceType
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ def _onion_key_for_url(url: str) -> str:
17
+ """Stable key for Source.onion_address (max 255)."""
18
+ u = (url or "").strip()
19
+ if not u:
20
+ return ""
21
+ try:
22
+ p = urlparse(u)
23
+ host = (p.netloc or p.path or "").split("/")[0]
24
+ if host:
25
+ return host[:255]
26
+ except Exception:
27
+ pass
28
+ return u[:255]
29
+
30
+
31
def get_last_cleaned_text_for_url(url: str) -> str:
    """
    Return the cleaned_text of the page found for *url*, or ''.

    Returns '' when DATABASE_URL is not set, when no page is found, when the
    page has no cleaned text, or when the lookup fails (failure is logged as
    a warning). Never raises.
    """
    if not os.getenv("DATABASE_URL"):
        return ""

    try:
        from db.queries import get_page_by_url  # noqa: PLC0415
        from db.session import get_session  # noqa: PLC0415

        with get_session() as session:
            page = get_page_by_url(session, url)
            cleaned = page.cleaned_text if page is not None else None
            return str(cleaned) if cleaned else ""
    except Exception as exc:
        logger.warning("get_last_cleaned_text_for_url failed: %s", exc)
        return ""
47
+
48
+
49
def update_source_watch_fingerprint(url: str, content_hash_hex: str) -> None:
    """
    Upsert the Source row for *url* and store a content fingerprint in it.

    The fingerprint is the first 20 characters of *content_hash_hex*, written
    through update_source_status (the status column is VARCHAR(20)). Nothing
    happens when DATABASE_URL is unset, the hash is empty, or no key can be
    derived from *url*. Database errors are logged as warnings, not raised.
    """
    if not os.getenv("DATABASE_URL"):
        return

    fingerprint = (content_hash_hex or "")[:20]
    if not fingerprint:
        return

    source_key = _onion_key_for_url(url)
    if not source_key:
        return

    try:
        from db.queries import get_or_create_source, update_source_status  # noqa: PLC0415
        from db.session import get_session  # noqa: PLC0415

        with get_session() as session:
            source, _created = get_or_create_source(
                session,
                onion_address=source_key,
                source_type=SourceType.CRAWLED.value,
            )
            # Flush before source.id is read for the status update.
            session.flush()
            update_source_status(session, source.id, fingerprint)
    except Exception as exc:
        logger.warning("update_source_watch_fingerprint failed: %s", exc)