voidaccess 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- analysis/__init__.py +49 -0
- analysis/opsec.py +454 -0
- analysis/patterns.py +202 -0
- analysis/temporal.py +201 -0
- api/__init__.py +1 -0
- api/auth.py +163 -0
- api/main.py +509 -0
- api/routes/__init__.py +1 -0
- api/routes/admin.py +214 -0
- api/routes/auth.py +157 -0
- api/routes/entities.py +871 -0
- api/routes/export.py +359 -0
- api/routes/investigations.py +2567 -0
- api/routes/monitors.py +405 -0
- api/routes/search.py +157 -0
- api/routes/settings.py +851 -0
- auth/__init__.py +1 -0
- auth/token_blacklist.py +108 -0
- cli/__init__.py +3 -0
- cli/adapters/__init__.py +1 -0
- cli/adapters/sqlite.py +273 -0
- cli/browser.py +376 -0
- cli/commands/__init__.py +1 -0
- cli/commands/configure.py +185 -0
- cli/commands/enrich.py +154 -0
- cli/commands/export.py +158 -0
- cli/commands/investigate.py +601 -0
- cli/commands/show.py +87 -0
- cli/config.py +180 -0
- cli/display.py +212 -0
- cli/main.py +154 -0
- cli/tor_detect.py +71 -0
- config.py +180 -0
- crawler/__init__.py +28 -0
- crawler/dedup.py +97 -0
- crawler/frontier.py +115 -0
- crawler/spider.py +462 -0
- crawler/utils.py +122 -0
- db/__init__.py +47 -0
- db/migrations/__init__.py +0 -0
- db/migrations/env.py +80 -0
- db/migrations/versions/0001_initial_schema.py +270 -0
- db/migrations/versions/0002_add_investigation_status_column.py +27 -0
- db/migrations/versions/0002_add_missing_tables.py +33 -0
- db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
- db/migrations/versions/0004_add_page_posted_at.py +41 -0
- db/migrations/versions/0005_add_extraction_method.py +32 -0
- db/migrations/versions/0006_add_monitor_alerts.py +26 -0
- db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
- db/migrations/versions/0008_add_users_table.py +47 -0
- db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
- db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
- db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
- db/migrations/versions/0013_add_graph_status.py +31 -0
- db/migrations/versions/0015_add_progress_fields.py +41 -0
- db/migrations/versions/0016_backfill_graph_status.py +33 -0
- db/migrations/versions/0017_add_user_api_keys.py +44 -0
- db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
- db/migrations/versions/0019_add_content_safety_log.py +46 -0
- db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
- db/models.py +618 -0
- db/queries.py +841 -0
- db/session.py +270 -0
- export/__init__.py +34 -0
- export/misp.py +257 -0
- export/sigma.py +342 -0
- export/stix.py +418 -0
- extractor/__init__.py +21 -0
- extractor/llm_extract.py +372 -0
- extractor/ner.py +512 -0
- extractor/normalizer.py +638 -0
- extractor/pipeline.py +401 -0
- extractor/regex_patterns.py +325 -0
- fingerprint/__init__.py +33 -0
- fingerprint/profiler.py +240 -0
- fingerprint/stylometry.py +249 -0
- graph/__init__.py +73 -0
- graph/builder.py +894 -0
- graph/export.py +225 -0
- graph/model.py +83 -0
- graph/queries.py +297 -0
- graph/visualize.py +178 -0
- i18n/__init__.py +24 -0
- i18n/detect.py +76 -0
- i18n/query_expand.py +72 -0
- i18n/translate.py +210 -0
- monitor/__init__.py +27 -0
- monitor/_db.py +74 -0
- monitor/alerts.py +345 -0
- monitor/config.py +118 -0
- monitor/diff.py +75 -0
- monitor/jobs.py +247 -0
- monitor/scheduler.py +184 -0
- scraper/__init__.py +0 -0
- scraper/scrape.py +857 -0
- scraper/scrape_js.py +272 -0
- search/__init__.py +318 -0
- search/circuit_breaker.py +240 -0
- search/search.py +334 -0
- sources/__init__.py +96 -0
- sources/blockchain.py +444 -0
- sources/cache.py +93 -0
- sources/cisa.py +108 -0
- sources/dns_enrichment.py +557 -0
- sources/domain_reputation.py +643 -0
- sources/email_reputation.py +635 -0
- sources/engines.py +244 -0
- sources/enrichment.py +1244 -0
- sources/github_scraper.py +589 -0
- sources/gitlab_scraper.py +624 -0
- sources/hash_reputation.py +856 -0
- sources/historical_intel.py +253 -0
- sources/ip_reputation.py +521 -0
- sources/paste_scraper.py +484 -0
- sources/pastes.py +278 -0
- sources/rss_scraper.py +576 -0
- sources/seed_manager.py +373 -0
- sources/seeds.py +368 -0
- sources/shodan.py +103 -0
- sources/telegram.py +199 -0
- sources/virustotal.py +113 -0
- utils/__init__.py +0 -0
- utils/async_utils.py +89 -0
- utils/content_safety.py +193 -0
- utils/defang.py +94 -0
- utils/encryption.py +34 -0
- utils/ioc_freshness.py +124 -0
- utils/user_keys.py +33 -0
- vector/__init__.py +39 -0
- vector/embedder.py +100 -0
- vector/model_singleton.py +49 -0
- vector/search.py +87 -0
- vector/store.py +514 -0
- voidaccess/__init__.py +0 -0
- voidaccess/llm.py +717 -0
- voidaccess/llm_utils.py +696 -0
- voidaccess-1.3.0.dist-info/METADATA +395 -0
- voidaccess-1.3.0.dist-info/RECORD +142 -0
- voidaccess-1.3.0.dist-info/WHEEL +5 -0
- voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
- voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
- voidaccess-1.3.0.dist-info/top_level.txt +19 -0
graph/visualize.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""
|
|
2
|
+
graph/visualize.py — Interactive graph visualisation using pyvis.
|
|
3
|
+
|
|
4
|
+
Converts a NetworkX MultiDiGraph to a pyvis Network and exports it as a
|
|
5
|
+
self-contained HTML file suitable for embedding in Streamlit via st.components.
|
|
6
|
+
|
|
7
|
+
If pyvis is not installed, all functions log a warning and return None / empty
|
|
8
|
+
string. They never raise on a missing dependency.
|
|
9
|
+
|
|
10
|
+
Public interface
|
|
11
|
+
----------------
|
|
12
|
+
build_pyvis_network(graph, max_nodes, highlight_node_id) → Network | None
|
|
13
|
+
export_html(network, filepath) → None
|
|
14
|
+
get_html_string(network) → str
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import logging
|
|
20
|
+
from typing import TYPE_CHECKING, Optional
|
|
21
|
+
|
|
22
|
+
import networkx as nx
|
|
23
|
+
|
|
24
|
+
from graph.model import NODE_TYPES
|
|
25
|
+
|
|
26
|
+
if TYPE_CHECKING:
|
|
27
|
+
pass # pyvis.network.Network imported conditionally at runtime
|
|
28
|
+
|
|
29
|
+
logger = logging.getLogger(__name__)
|
|
30
|
+
|
|
31
|
+
# ---------------------------------------------------------------------------
|
|
32
|
+
# Node colour palette (hex strings)
|
|
33
|
+
# ---------------------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
_NODE_COLORS: dict[str, str] = {
|
|
36
|
+
NODE_TYPES.THREAT_ACTOR: "#e74c3c", # red
|
|
37
|
+
NODE_TYPES.CRYPTO_WALLET: "#f39c12", # gold
|
|
38
|
+
NODE_TYPES.MALWARE_FAMILY: "#9b59b6", # purple
|
|
39
|
+
NODE_TYPES.RANSOMWARE_GROUP: "#9b59b6", # purple
|
|
40
|
+
NODE_TYPES.ONION_URL: "#3498db", # blue
|
|
41
|
+
NODE_TYPES.FORUM: "#3498db", # blue
|
|
42
|
+
NODE_TYPES.CVE: "#e67e22", # orange
|
|
43
|
+
NODE_TYPES.EMAIL_ADDRESS: "#2ecc71", # green
|
|
44
|
+
NODE_TYPES.PGP_KEY: "#2ecc71", # green
|
|
45
|
+
NODE_TYPES.PASTE: "#95a5a6", # grey
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
_DEFAULT_COLOR = "#bdc3c7" # light grey fallback
|
|
49
|
+
_HIGHLIGHT_BORDER = "#f1c40f" # yellow
|
|
50
|
+
|
|
51
|
+
# Edge width thresholds mapped to pyvis ``width`` values
|
|
52
|
+
_EDGE_WIDTH_THIN = 1.0 # confidence < 0.4
|
|
53
|
+
_EDGE_WIDTH_MEDIUM = 3.0 # 0.4 <= confidence < 0.7
|
|
54
|
+
_EDGE_WIDTH_THICK = 5.0 # confidence >= 0.7
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _confidence_to_width(confidence: float) -> float:
    """Map an edge confidence score to a pyvis edge ``width``.

    Scores below 0.4 are drawn thin, scores below 0.7 medium, and every
    other score thick.
    """
    # Ascending (exclusive upper bound, width) pairs; the first match wins.
    bands = (
        (0.4, _EDGE_WIDTH_THIN),
        (0.7, _EDGE_WIDTH_MEDIUM),
    )
    for upper_bound, width in bands:
        if confidence < upper_bound:
            return width
    return _EDGE_WIDTH_THICK
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# ---------------------------------------------------------------------------
|
|
66
|
+
# Public functions
|
|
67
|
+
# ---------------------------------------------------------------------------
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def build_pyvis_network(
    graph: nx.MultiDiGraph,
    max_nodes: int = 200,
    highlight_node_id: Optional[str] = None,
) -> "Optional[object]":
    """
    Convert *graph* into a pyvis Network.

    Args:
        graph: Source graph. Nodes may carry a ``node_type`` attribute and
            edges may carry ``confidence`` and ``edge_type`` attributes.
        max_nodes: Upper bound on rendered nodes. When the graph is larger,
            only the highest-degree nodes are retained. Negative values
            are treated as 0.
        highlight_node_id: If given, that node receives a yellow border.

    Returns:
        The pyvis Network, or None if pyvis is not installed.
    """
    try:
        from pyvis.network import Network  # noqa: PLC0415
    except ImportError:
        logger.warning(
            "pyvis is not installed — graph visualisation is unavailable. "
            "Install it with: pip install pyvis"
        )
        return None

    # Trim graph to the max_nodes highest-degree nodes if necessary.
    # Clamp to 0 so a negative limit cannot turn into a "drop the last N"
    # slice.
    node_limit = max(max_nodes, 0)
    if graph.number_of_nodes() > node_limit:
        top_nodes = sorted(
            graph.nodes(), key=lambda n: graph.degree(n), reverse=True
        )[:node_limit]
        subgraph = graph.subgraph(top_nodes)
    else:
        subgraph = graph

    net = Network(
        height="750px",
        width="100%",
        directed=True,
        notebook=False,
    )
    net.force_atlas_2based()

    for node_id, data in subgraph.nodes(data=True):
        node_type = data.get("node_type", "")
        color = _NODE_COLORS.get(node_type, _DEFAULT_COLOR)

        node_kwargs: dict = {
            # Labels are rendered as text; stringify so integer IDs are
            # handled the same way as string IDs.
            "label": str(node_id),
            "color": color,
            "title": f"Type: {node_type}\nID: {node_id}",
        }

        if node_id == highlight_node_id:
            node_kwargs["color"] = {
                "background": color,
                "border": _HIGHLIGHT_BORDER,
            }
            node_kwargs["borderWidth"] = 3

        net.add_node(node_id, **node_kwargs)

    for src, tgt, data in subgraph.edges(data=True):
        # The attribute may be present but None or non-numeric; fall back
        # to the same neutral default used when the key is missing instead
        # of crashing on the format spec / comparisons below.
        try:
            confidence = float(data.get("confidence", 0.5))
        except (TypeError, ValueError):
            confidence = 0.5
        edge_type = data.get("edge_type", "")
        net.add_edge(
            src,
            tgt,
            title=f"{edge_type} (confidence={confidence:.2f})",
            width=_confidence_to_width(confidence),
        )

    return net
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def export_html(network: "object", filepath: str) -> None:
    """
    Write the pyvis Network to *filepath* as HTML via ``save_graph``.

    A None *network* (pyvis not installed) is logged and skipped. Errors
    raised while saving are logged as warnings and never propagated.
    """
    if network is not None:
        try:
            network.save_graph(filepath)  # type: ignore[attr-defined]
        except Exception as exc:
            logger.warning("export_html failed: %s", exc)
        return

    logger.warning("export_html called with None network — skipping.")
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def get_html_string(network: "object") -> str:
    """
    Render *network* to its full interactive HTML and return it as a string.

    Intended for embedding in Streamlit via ``st.components.v1.html()``.
    Returns an empty string if *network* is None or rendering fails.
    """
    if network is None:
        return ""

    try:
        try:
            return network.generate_html()  # type: ignore[attr-defined]
        except AttributeError:
            # Older pyvis versions use get_network_html
            return network.get_network_html()  # type: ignore[attr-defined]
    except Exception as exc:
        # Single failure path for both the primary and the fallback call.
        logger.warning("get_html_string failed: %s", exc)
        return ""
|
i18n/__init__.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""
|
|
2
|
+
i18n — Multilingual intelligence: language detection, translation pipeline,
|
|
3
|
+
and query expansion for broader dark web coverage.
|
|
4
|
+
|
|
5
|
+
Public interface
|
|
6
|
+
---------------
|
|
7
|
+
from i18n.detect import detect_language, detect_language_batch, is_non_english
|
|
8
|
+
from i18n.translate import translate_to_english, translate_batch
|
|
9
|
+
from i18n.query_expand import expand_query, get_multilingual_search_terms
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from i18n.detect import detect_language, detect_language_batch, is_non_english
|
|
13
|
+
from i18n.query_expand import expand_query, get_multilingual_search_terms
|
|
14
|
+
from i18n.translate import translate_batch, translate_to_english
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"detect_language",
|
|
18
|
+
"detect_language_batch",
|
|
19
|
+
"is_non_english",
|
|
20
|
+
"translate_to_english",
|
|
21
|
+
"translate_batch",
|
|
22
|
+
"expand_query",
|
|
23
|
+
"get_multilingual_search_terms",
|
|
24
|
+
]
|
i18n/detect.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""
|
|
2
|
+
i18n/detect.py — Language detection for scraped content.
|
|
3
|
+
|
|
4
|
+
Uses the langdetect library for per-text language identification.
|
|
5
|
+
Returns ISO 639-1 language codes ("en", "ru", "zh", "ar", "es", "pt", etc.).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
from typing import Optional
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def detect_language(text: str) -> Optional[str]:
    """
    Return the language code reported by langdetect for *text*.

    Returns None for non-string or very short text (<50 chars),
    undetectable text, or when langdetect is not installed. Never raises.
    """
    # Guard first so short or invalid input never pays for the import.
    if not isinstance(text, str) or len(text) < 50:
        return None

    try:
        from langdetect import detect as ld_detect  # type: ignore
    except ImportError:
        logger.debug("detect_language: langdetect not installed")
        return None
    except Exception:
        # A broken installation must not break the "never raises" contract.
        return None

    try:
        return ld_detect(text)
    except Exception:
        # Any detection failure counts as "undetectable": the contract is
        # best-effort and callers treat None as unknown.
        return None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def detect_language_batch(texts: list[str]) -> list[Optional[str]]:
    """
    Detect the language of every entry in *texts*.

    The langdetect import is resolved once for the whole batch. The result
    has one entry per input: a language code, or None for empty/short
    (<50 chars) or undetectable text. If langdetect is unavailable, every
    entry is None.
    """
    try:
        from langdetect import detect as ld_detect  # type: ignore

        def _detect_one(candidate: str) -> Optional[str]:
            # Too little text to produce a meaningful guess.
            if not candidate or len(candidate) < 50:
                return None
            try:
                return ld_detect(candidate)
            except Exception:
                return None

        return [_detect_one(candidate) for candidate in texts]

    except ImportError:
        logger.debug("detect_language_batch: langdetect not installed")
        return [None] * len(texts)
    except Exception:
        return [None] * len(texts)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def is_non_english(text: str) -> bool:
    """
    Tell whether *text* should be treated as non-English.

    True when the detected language is anything other than "en",
    including when detection fails and no language is returned. Used as a
    quick gate before running translation.
    """
    # A failed detection yields None, which also compares unequal to "en".
    return detect_language(text) != "en"
|
i18n/query_expand.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""
|
|
2
|
+
i18n/query_expand.py — Expands a search query into multiple languages for
|
|
3
|
+
broader dark web coverage.
|
|
4
|
+
|
|
5
|
+
Russian, Chinese, and Arabic dark web communities contain high-value
|
|
6
|
+
intelligence that is almost entirely missed by English-only tools.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
_DEFAULT_TARGET_LANGUAGES = ["ru", "zh", "ar", "es", "de", "fr", "pt"]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def expand_query(
    query: str,
    target_languages: Optional[list[str]] = None,
) -> dict[str, str]:
    """
    Build a mapping of language code to translated query.

    The result always contains ``"en"`` mapped to the original *query*.
    A language is added only when its translation succeeds and differs
    from the original, so the mapping never holds None values.

    Args:
        query: Original English query.
        target_languages: ISO 639-1 codes to translate into. When None,
            ``I18N_LANGUAGES`` from config.py is used if importable and
            non-empty; otherwise the built-in default list applies.
    """
    from i18n.translate import _translate_from_english

    languages = target_languages
    if languages is None:
        try:
            from config import I18N_LANGUAGES
        except ImportError:
            pass
        else:
            if I18N_LANGUAGES:
                languages = I18N_LANGUAGES
    if not languages:
        languages = ["en", *_DEFAULT_TARGET_LANGUAGES]

    expansions: dict[str, str] = {"en": query}

    for code in languages:
        # English is already present as the untranslated original.
        if code == "en":
            continue
        try:
            candidate = _translate_from_english(query, code)
            if candidate is None or candidate == query:
                continue
            expansions[code] = candidate
        except Exception as exc:
            logger.debug("expand_query: skipping lang=%s (%s)", code, exc)

    return expansions
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def get_multilingual_search_terms(
    query: str,
    target_languages: Optional[list[str]] = None,
) -> list[str]:
    """
    Return every translation of *query* as a flat list.

    The original English query comes first, followed by each translation
    produced by ``expand_query``. Used to fan out searches in multiple
    languages.
    """
    by_language = expand_query(query, target_languages)
    return [*by_language.values()]
|
i18n/translate.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""
|
|
2
|
+
i18n/translate.py — Translation pipeline.
|
|
3
|
+
|
|
4
|
+
Strategy (tried in order, falls back on failure):
|
|
5
|
+
1. DeepL API if DEEPL_API_KEY is set
|
|
6
|
+
2. Helsinki-NLP/opus-mt local model if transformers is available
|
|
7
|
+
3. Returns None if both are unavailable
|
|
8
|
+
|
|
9
|
+
Text longer than 2000 chars is split into sentences, each translated,
|
|
10
|
+
then rejoined.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
import os
|
|
17
|
+
import re
|
|
18
|
+
from typing import Optional
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
# Module-level cache: {(src_lang, tgt_lang): (tokenizer, model)}
|
|
23
|
+
_model_cache: dict[tuple[str, str], tuple] = {}
|
|
24
|
+
|
|
25
|
+
_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _split_sentences(text: str) -> list[str]:
    """Split *text* at sentence boundaries, dropping blank fragments."""
    fragments = _SENTENCE_SPLIT_RE.split(text.strip())
    # str.strip returns "" for whitespace-only fragments, which filter drops.
    return list(filter(str.strip, fragments))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _deepl_translate(
    text: str, target_lang: str, source_lang: Optional[str] = None
) -> Optional[str]:
    """
    Call the DeepL API to translate *text* to *target_lang*.

    Args:
        text: Text to translate.
        target_lang: Target language code (upper-cased before sending).
        source_lang: Optional source language code; omitted from the
            request when falsy so DeepL detects the language itself.

    Returns:
        The translated text, or None when DEEPL_API_KEY is unset or the
        request fails for any reason. Never raises.
    """
    api_key = os.getenv("DEEPL_API_KEY", "").strip()
    if not api_key:
        return None

    try:
        import requests  # type: ignore

        # Free-plan keys carry a ":fx" suffix and are only accepted by the
        # api-free host; all other keys must use the regular API host.
        host = "api-free.deepl.com" if api_key.endswith(":fx") else "api.deepl.com"

        payload: dict = {
            "text": text,
            "target_lang": target_lang.upper(),
        }
        if source_lang:
            payload["source_lang"] = source_lang.upper()

        resp = requests.post(
            f"https://{host}/v2/translate",
            # Header-based auth replaces the legacy "auth_key" form field.
            headers={"Authorization": f"DeepL-Auth-Key {api_key}"},
            data=payload,
            timeout=10,
        )
        resp.raise_for_status()
        return resp.json()["translations"][0]["text"]
    except Exception as exc:
        logger.debug("DeepL translate failed: %s", exc)
        return None
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _helsinki_translate(
    text: str, src_lang: str, tgt_lang: str = "en"
) -> Optional[str]:
    """
    Translate *text* with a local Helsinki-NLP/opus-mt model.

    The tokenizer/model pair for each (src_lang, tgt_lang) is loaded on
    first use and kept in the module-level cache. Returns None if
    transformers is unavailable, the model cannot be loaded, or
    generation fails.
    """
    try:
        from transformers import MarianMTModel, MarianTokenizer  # type: ignore

        pair = (src_lang, tgt_lang)
        if pair not in _model_cache:
            checkpoint = f"Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}"
            # Cached only once both loads succeed.
            _model_cache[pair] = (
                MarianTokenizer.from_pretrained(checkpoint),
                MarianMTModel.from_pretrained(checkpoint),
            )

        tokenizer, model = _model_cache[pair]
        encoded = tokenizer(
            [text], return_tensors="pt", padding=True, truncation=True
        )
        generated = model.generate(**encoded)
        return tokenizer.decode(generated[0], skip_special_tokens=True)
    except Exception as exc:
        logger.debug("Helsinki-NLP translate failed: %s", exc)
        return None
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _translate_long_text(
    text: str,
    translate_fn,
    *args,
    **kwargs,
) -> Optional[str]:
    """
    Translate *text* sentence by sentence and rejoin with single spaces.

    *translate_fn* is called as ``translate_fn(sentence, *args, **kwargs)``.
    If any sentence fails (returns None) the whole result is None. Text
    with no detectable sentences is passed to *translate_fn* unsplit.
    """
    chunks = _split_sentences(text)
    if chunks:
        rendered: list[str] = []
        for chunk in chunks:
            piece = translate_fn(chunk, *args, **kwargs)
            # All-or-nothing: a partial translation is not returned.
            if piece is None:
                return None
            rendered.append(piece)
        return " ".join(rendered)

    return translate_fn(text, *args, **kwargs)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def translate_to_english(
    text: str,
    source_lang: Optional[str] = None,
) -> Optional[str]:
    """
    Translate *text* to English.

    Strategy: DeepL first (when DEEPL_API_KEY is set), then a local
    Helsinki-NLP model. Text longer than 2000 chars is translated
    sentence by sentence.

    Args:
        text: Text to translate.
        source_lang: Language code, or None to auto-detect. Region-
            qualified codes such as "zh-cn" are reduced to their primary
            subtag ("zh") before being handed to either engine.

    Returns:
        The English text, or None on complete failure. When no
        *source_lang* is given and detection reports English, *text* is
        returned unchanged. Never raises.
    """
    try:
        if not text:
            return None

        # Language detectors may emit region variants ("zh-cn", "zh-tw");
        # both engines are called with the bare primary subtag here.
        base_lang: Optional[str] = None
        if source_lang:
            base_lang = (
                source_lang.replace("_", "-").split("-", 1)[0].strip().lower()
                or None
            )

        # Split long texts at sentence boundaries
        use_chunking = len(text) > 2000

        def _run(translate_fn, *args):
            if use_chunking:
                return _translate_long_text(text, translate_fn, *args)
            return translate_fn(text, *args)

        # Strategy 1: DeepL (detects the source language itself when
        # none is supplied)
        if os.getenv("DEEPL_API_KEY", ""):
            result = _run(_deepl_translate, "EN", base_lang)
            if result is not None:
                return result

        # Strategy 2: Helsinki-NLP local model. It needs an explicit source
        # language, so honour the documented auto-detect behaviour here.
        if base_lang is None:
            try:
                from i18n.detect import detect_language
            except ImportError:
                detected = None
            else:
                detected = detect_language(text)
            if detected == "en":
                # Already English: nothing to translate.
                return text
            if detected:
                base_lang = detected.split("-", 1)[0].lower() or None

        if base_lang:
            result = _run(_helsinki_translate, base_lang, "en")
            if result is not None:
                return result

        return None

    except Exception as exc:
        logger.debug("translate_to_english: unexpected error (%s)", exc)
        return None
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def translate_batch(
    texts: list[str],
    source_lang: Optional[str] = None,
) -> list[Optional[str]]:
    """
    Translate each entry of *texts* to English.

    Empty entries and entries whose language is "en" are returned as-is.
    When *source_lang* is given it is used for every entry; otherwise the
    language is detected per entry. The result has the same length as
    *texts*.
    """
    try:
        from i18n.detect import detect_language
    except ImportError:

        def detect_language(_text):
            # Detection unavailable: report every language as unknown.
            return None

    def _to_english(item):
        if not item:
            return item
        lang = source_lang if source_lang else detect_language(item)
        if lang == "en":
            return item
        return translate_to_english(item, lang)

    return [_to_english(item) for item in texts]
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _translate_from_english(text: str, target_lang: str) -> Optional[str]:
    """
    Translate English *text* into *target_lang*.

    The reverse direction of translate_to_english: DeepL is tried first
    when DEEPL_API_KEY is set, then the local Helsinki-NLP model. Returns
    None for empty text or when both strategies fail. Used by
    query_expand.py.
    """
    try:
        if not text:
            return None

        # Strategy 1: DeepL (EN → target)
        if os.getenv("DEEPL_API_KEY", ""):
            via_deepl = _deepl_translate(text, target_lang, "EN")
            if via_deepl is not None:
                return via_deepl

        # Strategy 2: Helsinki-NLP (en → target_lang); its None result
        # doubles as the overall failure value.
        return _helsinki_translate(text, "en", target_lang)

    except Exception as exc:
        logger.debug("_translate_from_english: error (%s)", exc)
        return None
|
monitor/__init__.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
monitor — Phase 4 continuous monitoring, diffing, and alerts.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from monitor.alerts import dispatch_alerts
|
|
6
|
+
from monitor.config import load_watches
|
|
7
|
+
from monitor.diff import compute_diff, is_significant_change
|
|
8
|
+
from monitor.jobs import run_keyword_watch, run_url_watch
|
|
9
|
+
from monitor.scheduler import (
|
|
10
|
+
get_job_status,
|
|
11
|
+
start_scheduler,
|
|
12
|
+
stop_scheduler,
|
|
13
|
+
trigger_job_now,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"load_watches",
|
|
18
|
+
"run_keyword_watch",
|
|
19
|
+
"run_url_watch",
|
|
20
|
+
"compute_diff",
|
|
21
|
+
"is_significant_change",
|
|
22
|
+
"dispatch_alerts",
|
|
23
|
+
"start_scheduler",
|
|
24
|
+
"stop_scheduler",
|
|
25
|
+
"get_job_status",
|
|
26
|
+
"trigger_job_now",
|
|
27
|
+
]
|
monitor/_db.py
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Database helpers for monitor jobs (URL watch state). Not imported by tests as pipeline mocks.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
|
+
from urllib.parse import urlparse
|
|
10
|
+
|
|
11
|
+
from db.models import SourceType
|
|
12
|
+
|
|
13
|
+
logger = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _onion_key_for_url(url: str) -> str:
    """
    Stable key for Source.onion_address (max 255).

    Returns the network-location part of *url* (including any port),
    falling back to the stripped URL itself when no host can be
    extracted. Empty or None input yields "".
    """
    cleaned = (url or "").strip()
    if not cleaned:
        return ""

    # urlparse only recognises a netloc after "//". Without it, Python 3.9+
    # splits "host.onion:8080/x" into scheme="host.onion" and path="8080/x",
    # which would produce the key "8080". Treat scheme-less input as an
    # authority so the host is found reliably.
    if "://" in cleaned or cleaned.startswith("//"):
        candidate = cleaned
    else:
        candidate = "//" + cleaned

    try:
        parsed = urlparse(candidate)
    except ValueError:
        # e.g. a malformed bracketed host; keep the raw URL as the key.
        return cleaned[:255]

    host = (parsed.netloc or parsed.path or "").split("/")[0]
    return (host or cleaned)[:255]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def get_last_cleaned_text_for_url(url: str) -> str:
    """
    Return the stored cleaned_text of the page found for *url*, or ''.

    Returns '' when DATABASE_URL is not set, when no page is found, when
    the page has no cleaned text, or when the lookup fails. Never raises.
    """
    if not os.getenv("DATABASE_URL"):
        return ""

    try:
        from db.queries import get_page_by_url  # noqa: PLC0415
        from db.session import get_session  # noqa: PLC0415

        with get_session() as session:
            page = get_page_by_url(session, url)
            cleaned = page.cleaned_text if page is not None else None
            return str(cleaned) if cleaned else ""
    except Exception as exc:
        logger.warning("get_last_cleaned_text_for_url failed: %s", exc)
        return ""
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def update_source_watch_fingerprint(url: str, content_hash_hex: str) -> None:
    """
    Upsert the Source row for *url* and record a short content fingerprint.

    The fingerprint is the first 20 characters of *content_hash_hex*,
    stored through ``update_source_status`` (the status column is noted
    as VARCHAR(20)). Does nothing when DATABASE_URL is unset, the hash is
    empty, or no key can be derived from *url*. Database errors are
    logged as warnings and never raised.
    """
    if not os.getenv("DATABASE_URL"):
        return

    fingerprint = (content_hash_hex or "")[:20]
    # Derive the key only when there is a fingerprint to store.
    source_key = _onion_key_for_url(url) if fingerprint else ""
    if not source_key:
        return

    try:
        from db.queries import get_or_create_source, update_source_status  # noqa: PLC0415
        from db.session import get_session  # noqa: PLC0415

        with get_session() as session:
            source_row, _ = get_or_create_source(
                session,
                onion_address=source_key,
                source_type=SourceType.CRAWLED.value,
            )
            # Flush before reading source_row.id for the status update.
            session.flush()
            update_source_status(session, source_row.id, fingerprint)
    except Exception as exc:
        logger.warning("update_source_watch_fingerprint failed: %s", exc)
|