voidaccess 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. analysis/__init__.py +49 -0
  2. analysis/opsec.py +454 -0
  3. analysis/patterns.py +202 -0
  4. analysis/temporal.py +201 -0
  5. api/__init__.py +1 -0
  6. api/auth.py +163 -0
  7. api/main.py +509 -0
  8. api/routes/__init__.py +1 -0
  9. api/routes/admin.py +214 -0
  10. api/routes/auth.py +157 -0
  11. api/routes/entities.py +871 -0
  12. api/routes/export.py +359 -0
  13. api/routes/investigations.py +2567 -0
  14. api/routes/monitors.py +405 -0
  15. api/routes/search.py +157 -0
  16. api/routes/settings.py +851 -0
  17. auth/__init__.py +1 -0
  18. auth/token_blacklist.py +108 -0
  19. cli/__init__.py +3 -0
  20. cli/adapters/__init__.py +1 -0
  21. cli/adapters/sqlite.py +273 -0
  22. cli/browser.py +376 -0
  23. cli/commands/__init__.py +1 -0
  24. cli/commands/configure.py +185 -0
  25. cli/commands/enrich.py +154 -0
  26. cli/commands/export.py +158 -0
  27. cli/commands/investigate.py +601 -0
  28. cli/commands/show.py +87 -0
  29. cli/config.py +180 -0
  30. cli/display.py +212 -0
  31. cli/main.py +154 -0
  32. cli/tor_detect.py +71 -0
  33. config.py +180 -0
  34. crawler/__init__.py +28 -0
  35. crawler/dedup.py +97 -0
  36. crawler/frontier.py +115 -0
  37. crawler/spider.py +462 -0
  38. crawler/utils.py +122 -0
  39. db/__init__.py +47 -0
  40. db/migrations/__init__.py +0 -0
  41. db/migrations/env.py +80 -0
  42. db/migrations/versions/0001_initial_schema.py +270 -0
  43. db/migrations/versions/0002_add_investigation_status_column.py +27 -0
  44. db/migrations/versions/0002_add_missing_tables.py +33 -0
  45. db/migrations/versions/0003_add_canonical_value_and_entity_links.py +61 -0
  46. db/migrations/versions/0004_add_page_posted_at.py +41 -0
  47. db/migrations/versions/0005_add_extraction_method.py +32 -0
  48. db/migrations/versions/0006_add_monitor_alerts.py +26 -0
  49. db/migrations/versions/0007_add_actor_style_profiles.py +23 -0
  50. db/migrations/versions/0008_add_users_table.py +47 -0
  51. db/migrations/versions/0009_add_investigation_id_to_relationships.py +29 -0
  52. db/migrations/versions/0010_add_composite_index_entity_relationships.py +22 -0
  53. db/migrations/versions/0011_add_page_extraction_cache.py +52 -0
  54. db/migrations/versions/0013_add_graph_status.py +31 -0
  55. db/migrations/versions/0015_add_progress_fields.py +41 -0
  56. db/migrations/versions/0016_backfill_graph_status.py +33 -0
  57. db/migrations/versions/0017_add_user_api_keys.py +44 -0
  58. db/migrations/versions/0018_add_user_id_to_investigations.py +33 -0
  59. db/migrations/versions/0019_add_content_safety_log.py +46 -0
  60. db/migrations/versions/0020_add_entity_source_tracking.py +50 -0
  61. db/models.py +618 -0
  62. db/queries.py +841 -0
  63. db/session.py +270 -0
  64. export/__init__.py +34 -0
  65. export/misp.py +257 -0
  66. export/sigma.py +342 -0
  67. export/stix.py +418 -0
  68. extractor/__init__.py +21 -0
  69. extractor/llm_extract.py +372 -0
  70. extractor/ner.py +512 -0
  71. extractor/normalizer.py +638 -0
  72. extractor/pipeline.py +401 -0
  73. extractor/regex_patterns.py +325 -0
  74. fingerprint/__init__.py +33 -0
  75. fingerprint/profiler.py +240 -0
  76. fingerprint/stylometry.py +249 -0
  77. graph/__init__.py +73 -0
  78. graph/builder.py +894 -0
  79. graph/export.py +225 -0
  80. graph/model.py +83 -0
  81. graph/queries.py +297 -0
  82. graph/visualize.py +178 -0
  83. i18n/__init__.py +24 -0
  84. i18n/detect.py +76 -0
  85. i18n/query_expand.py +72 -0
  86. i18n/translate.py +210 -0
  87. monitor/__init__.py +27 -0
  88. monitor/_db.py +74 -0
  89. monitor/alerts.py +345 -0
  90. monitor/config.py +118 -0
  91. monitor/diff.py +75 -0
  92. monitor/jobs.py +247 -0
  93. monitor/scheduler.py +184 -0
  94. scraper/__init__.py +0 -0
  95. scraper/scrape.py +857 -0
  96. scraper/scrape_js.py +272 -0
  97. search/__init__.py +318 -0
  98. search/circuit_breaker.py +240 -0
  99. search/search.py +334 -0
  100. sources/__init__.py +96 -0
  101. sources/blockchain.py +444 -0
  102. sources/cache.py +93 -0
  103. sources/cisa.py +108 -0
  104. sources/dns_enrichment.py +557 -0
  105. sources/domain_reputation.py +643 -0
  106. sources/email_reputation.py +635 -0
  107. sources/engines.py +244 -0
  108. sources/enrichment.py +1244 -0
  109. sources/github_scraper.py +589 -0
  110. sources/gitlab_scraper.py +624 -0
  111. sources/hash_reputation.py +856 -0
  112. sources/historical_intel.py +253 -0
  113. sources/ip_reputation.py +521 -0
  114. sources/paste_scraper.py +484 -0
  115. sources/pastes.py +278 -0
  116. sources/rss_scraper.py +576 -0
  117. sources/seed_manager.py +373 -0
  118. sources/seeds.py +368 -0
  119. sources/shodan.py +103 -0
  120. sources/telegram.py +199 -0
  121. sources/virustotal.py +113 -0
  122. utils/__init__.py +0 -0
  123. utils/async_utils.py +89 -0
  124. utils/content_safety.py +193 -0
  125. utils/defang.py +94 -0
  126. utils/encryption.py +34 -0
  127. utils/ioc_freshness.py +124 -0
  128. utils/user_keys.py +33 -0
  129. vector/__init__.py +39 -0
  130. vector/embedder.py +100 -0
  131. vector/model_singleton.py +49 -0
  132. vector/search.py +87 -0
  133. vector/store.py +514 -0
  134. voidaccess/__init__.py +0 -0
  135. voidaccess/llm.py +717 -0
  136. voidaccess/llm_utils.py +696 -0
  137. voidaccess-1.3.0.dist-info/METADATA +395 -0
  138. voidaccess-1.3.0.dist-info/RECORD +142 -0
  139. voidaccess-1.3.0.dist-info/WHEEL +5 -0
  140. voidaccess-1.3.0.dist-info/entry_points.txt +2 -0
  141. voidaccess-1.3.0.dist-info/licenses/LICENSE +21 -0
  142. voidaccess-1.3.0.dist-info/top_level.txt +19 -0
monitor/alerts.py ADDED
@@ -0,0 +1,345 @@
1
+ """
2
+ Alert delivery (webhook, Telegram bot, SMTP) and persisted monitor_alerts records.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import asyncio
8
+ import json
9
+ import logging
10
+ import os
11
+ import re
12
+ import smtplib
13
+ from email.mime.text import MIMEText
14
+ from typing import Any, Optional
15
+
16
+ from monitor.diff import is_significant_change
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ def _summarize_job_result(job_result: dict) -> str:
22
+ parts: list[str] = []
23
+ for key in ("query", "url", "changed", "new_pages", "new_entities", "duplicate_pages_skipped"):
24
+ if key in job_result:
25
+ parts.append(f"{key}={job_result[key]!r}")
26
+ if not parts:
27
+ parts.append(str(job_result)[:500])
28
+ return "; ".join(parts)[:1500]
29
+
30
+
31
async def send_webhook(url: str, payload: dict) -> bool:
    """POST *payload* as JSON to *url*.

    Returns True when the response status is in the 2xx range.  Any exception
    raised along the way (including a failed ``aiohttp`` import) is logged
    and reported as False.  The whole request is limited to 10 seconds.
    """
    try:
        import aiohttp  # noqa: PLC0415

        request_headers = {"Content-Type": "application/json"}
        client_timeout = aiohttp.ClientTimeout(total=10)
        async with aiohttp.ClientSession(timeout=client_timeout) as http:
            async with http.post(
                url, json=payload, headers=request_headers
            ) as response:
                status_code = response.status
                return status_code >= 200 and status_code < 300
    except Exception as err:
        logger.error("send_webhook failed: %s", err)
        return False
47
+
48
+
49
async def send_telegram_alert(chat_id: str, message: str) -> bool:
    """Send *message* to *chat_id* via the Telegram Bot API ``sendMessage`` call.

    The bot token is read from the ``TELEGRAM_BOT_TOKEN`` environment
    variable; when it is unset or blank nothing is sent and False is returned.
    The text is cut to 4096 characters.  Returns True only for an HTTP 200
    response; other statuses and exceptions are logged and yield False.
    """
    bot_token = os.getenv("TELEGRAM_BOT_TOKEN", "").strip()
    if bot_token == "":
        return False
    try:
        import aiohttp  # noqa: PLC0415

        endpoint = f"https://api.telegram.org/bot{bot_token}/sendMessage"
        client_timeout = aiohttp.ClientTimeout(total=10)
        request_body = {"chat_id": chat_id, "text": message[:4096]}
        async with aiohttp.ClientSession(timeout=client_timeout) as http:
            async with http.post(endpoint, json=request_body) as response:
                delivered = response.status == 200
                if not delivered:
                    # Keep only the start of the error body in the log.
                    error_text = await response.text()
                    logger.error(
                        "Telegram API %s: %s", response.status, error_text[:200]
                    )
                return delivered
    except Exception as err:
        logger.error("send_telegram_alert failed: %s", err)
        return False
68
+
69
+
70
async def send_email_alert(to: str, subject: str, body: str) -> bool:
    """Send a plain-text UTF-8 e-mail through the SMTP server named in the environment.

    Configuration is read from ``SMTP_HOST``, ``SMTP_PORT`` (default 587),
    ``SMTP_USER`` and ``SMTP_PASS``.  STARTTLS and login are performed only
    when both user and password are set.  The blocking SMTP exchange runs in
    a worker thread via ``asyncio.to_thread``.

    Args:
        to: Recipient address.
        subject: Value of the ``Subject`` header.
        body: Message text.

    Returns:
        True when the message was handed to the server; False when
        ``SMTP_HOST`` is unset, ``SMTP_PORT`` is not an integer, or any
        exception occurs while sending (the error is logged).
    """
    host = os.getenv("SMTP_HOST", "").strip()
    if not host:
        return False

    # A malformed port must not escape as ValueError: this function's
    # contract is to report every failure as False.
    raw_port = (os.getenv("SMTP_PORT") or "").strip() or "587"
    try:
        port = int(raw_port)
    except ValueError:
        logger.error("send_email_alert failed: invalid SMTP_PORT %r", raw_port)
        return False

    user = os.getenv("SMTP_USER", "").strip()
    password = os.getenv("SMTP_PASS", "").strip()

    def _send_sync() -> bool:
        msg = MIMEText(body, "plain", "utf-8")
        msg["Subject"] = subject
        msg["From"] = user or "voidaccess@localhost"
        msg["To"] = to
        with smtplib.SMTP(host, port, timeout=15) as smtp:
            smtp.ehlo()
            if user and password:
                # Upgrade to TLS before sending credentials, then re-identify.
                smtp.starttls()
                smtp.ehlo()
                smtp.login(user, password)
            smtp.sendmail(msg["From"], [to], msg.as_string())
        return True

    try:
        return await asyncio.to_thread(_send_sync)
    except Exception as exc:
        logger.error("send_email_alert failed: %s", exc)
        return False
97
+
98
+
99
+ def _derive_severity(change_type: str, diff: dict) -> str:
100
+ """Derive alert severity from change type and magnitude."""
101
+ entity_mag = int(
102
+ diff.get("new_entity_count", 0)
103
+ or diff.get("entity_count", 0)
104
+ or diff.get("new_entities", 0)
105
+ or len(diff.get("new_entities", []) if isinstance(diff.get("new_entities"), list) else [])
106
+ or 0
107
+ )
108
+ if change_type in ("new_entities",) and entity_mag >= 10:
109
+ return "critical"
110
+ if change_type in ("new_entities", "new_page", "first_result"):
111
+ return "warning"
112
+ if change_type in ("significant_change",):
113
+ return "warning"
114
+ if change_type == "content_change":
115
+ return "info"
116
+ return "info"
117
+
118
+
119
+ def _count_entity_delta(diff: dict) -> int:
120
+ """Extract entity count delta from diff result."""
121
+ v = (
122
+ diff.get("new_entity_count")
123
+ if diff.get("new_entity_count") is not None
124
+ else None
125
+ )
126
+ if v is not None:
127
+ return int(v)
128
+ v = diff.get("entity_count")
129
+ if v is not None:
130
+ return int(v)
131
+ ne = diff.get("new_entities")
132
+ if isinstance(ne, list):
133
+ return len(ne)
134
+ if isinstance(ne, int):
135
+ return ne
136
+ return 0
137
+
138
+
139
+ def _sanitize_diff(diff: dict) -> dict:
140
+ """
141
+ Ensure diff data is JSON-serializable and not too large.
142
+ Truncate large text fields; strip angle-bracket tags from strings.
143
+ """
144
+ sanitized: dict[str, Any] = {}
145
+ for k, v in diff.items():
146
+ if isinstance(v, str):
147
+ s = re.sub(r"<[^>]+>", "", v)
148
+ if len(s) > 500:
149
+ sanitized[k] = s[:500] + "..."
150
+ else:
151
+ sanitized[k] = s
152
+ elif isinstance(v, list) and len(v) > 50:
153
+ sanitized[k] = v[:50]
154
+ elif isinstance(v, (str, int, float, bool, list, dict)) or v is None:
155
+ sanitized[k] = v
156
+ return sanitized
157
+
158
+
159
def build_alert_context(watch: dict, job_result: dict) -> Optional[dict[str, Any]]:
    """Decide whether *job_result* warrants an alert for *watch*.

    The watch's ``alert_on`` policy (default ``"new_results"``) and ``type``
    (default ``"keyword"``) select the rule:

    * keyword watches fire on new pages or entities (``new_results`` and
      ``any_change``) or on new entities only (``any_appearance``);
    * any other watch type is treated as a URL watch, which requires
      ``changed`` and then fires on a first scrape or new entities
      (``new_results``), on a significant change or first scrape
      (``any_change``), or on new entities (``any_appearance``).

    Returns:
        A dict with ``change_type``, ``summary`` and ``diff_result`` when an
        alert should be raised, otherwise None.
    """
    policy = watch.get("alert_on") or "new_results"
    watch_kind = watch.get("type", "keyword")

    if watch_kind == "keyword":
        page_count = int(job_result.get("new_pages") or 0)
        entity_count = int(job_result.get("new_entities") or 0)
        has_entities = entity_count > 0

        if policy in ("new_results", "any_change"):
            fire = page_count > 0 or has_entities
        elif policy == "any_appearance":
            fire = has_entities
        else:
            fire = False
        if not fire:
            return None

        # Entities take precedence over pages when naming the change.
        label = "new_entities" if has_entities or page_count <= 0 else "new_page"
        headline = (
            f"{label}: {page_count} new page(s), {entity_count} new entities "
            f"(query={job_result.get('query', '')!r})"
        )
        return {
            "change_type": label,
            "summary": headline,
            "diff_result": {
                "query": job_result.get("query"),
                "new_pages": page_count,
                "new_entities": entity_count,
                "entity_count": entity_count,
                "duplicate_pages_skipped": job_result.get("duplicate_pages_skipped"),
            },
        }

    # URL watch: nothing to report unless the content changed.
    if not bool(job_result.get("changed")):
        return None

    entity_count = int(job_result.get("new_entities") or 0)
    ratio = float(job_result.get("change_ratio") or 0.0)
    first_scrape = bool(job_result.get("is_first_scrape"))
    significant = is_significant_change({"change_ratio": ratio}, threshold=0.1)
    has_entities = entity_count > 0

    if policy == "new_results":
        fire = first_scrape or has_entities
    elif policy == "any_change":
        fire = significant or first_scrape
    elif policy == "any_appearance":
        fire = has_entities
    else:
        fire = False
    if not fire:
        return None

    if first_scrape:
        label = "first_result"
    elif significant:
        label = "significant_change"
    elif has_entities:
        label = "new_entities"
    else:
        label = "content_change"

    headline = (
        f"{label}: {job_result.get('url', '')!r} — "
        f"entities={entity_count}, change_ratio={ratio:.3f}"
    )
    return {
        "change_type": label,
        "summary": headline,
        "diff_result": {
            "url": job_result.get("url"),
            "new_entities": entity_count,
            "entity_count": entity_count,
            "change_ratio": ratio,
            "lines_added": job_result.get("lines_added"),
            "lines_removed": job_result.get("lines_removed"),
            "diff_summary": job_result.get("diff_summary"),
            "is_first_scrape": first_scrape,
        },
    }
241
+
242
+
243
async def dispatch_alerts(watch: dict, job_result: dict) -> list[str]:
    """Send the alert for *job_result* over every channel configured on *watch*.

    Channels are taken from the watch's ``webhook_url``, ``telegram_chat_id``
    and ``email`` entries (non-blank strings only) and are run concurrently.

    Returns:
        The labels (``"webhook"``, ``"telegram"``, ``"email"``) of the
        channels whose sender returned True.  Senders that raised are logged.
    """
    watch_name = watch.get("name", "watch")
    headline = f"[VoidAccess Alert] {watch_name}: {_summarize_job_result(job_result)}"
    webhook_body = {
        "watch": watch_name,
        "job_result": job_result,
        "message": headline,
    }

    def _configured(field):
        """Return the stripped string stored under *field*, or None if unusable."""
        value = watch.get(field)
        if value and isinstance(value, str) and value.strip():
            return value.strip()
        return None

    channels = []  # (label, coroutine) pairs, in dispatch order

    webhook_target = _configured("webhook_url")
    if webhook_target is not None:
        channels.append(("webhook", send_webhook(webhook_target, webhook_body)))

    chat_target = _configured("telegram_chat_id")
    if chat_target is not None:
        channels.append(("telegram", send_telegram_alert(chat_target, headline)))

    mail_target = _configured("email")
    if mail_target is not None:
        mail_body = json.dumps(job_result, indent=2, default=str)[:20000]
        channels.append(
            (
                "email",
                send_email_alert(
                    mail_target, f"[VoidAccess Alert] {watch_name}", mail_body
                ),
            )
        )

    if not channels:
        return []

    outcomes = await asyncio.gather(
        *(coro for _, coro in channels), return_exceptions=True
    )
    delivered = []
    for (label, _), outcome in zip(channels, outcomes):
        if outcome is True:
            delivered.append(label)
        elif isinstance(outcome, Exception):
            logger.error("alert channel %s failed: %s", label, outcome)
    return delivered
292
+
293
+
294
def _persist_alert_record(
    watch: dict,
    change_type: str,
    summary: str,
    diff_result: dict,
    delivered_channels: list[str],
) -> None:
    """Store one alert via ``create_monitor_alert`` inside a ``get_session()`` block.

    Severity and the entity count delta are derived from *diff_result*, and
    the diff is passed through ``_sanitize_diff`` before being stored.
    """
    from db.queries import create_monitor_alert
    from db.session import get_session

    level = _derive_severity(change_type, diff_result)
    delta = _count_entity_delta(diff_result)
    with get_session() as db_session:
        record_fields = {
            "monitor_name": str(watch.get("name", "watch")),
            "change_type": change_type,
            "summary": summary,
            "diff_data": _sanitize_diff(diff_result),
            "severity": level,
            "entity_count_delta": delta,
            "delivery_channels": delivered_channels,
        }
        create_monitor_alert(session=db_session, **record_fields)
317
+
318
+
319
async def evaluate_and_dispatch_alerts(watch: dict, job_result: dict) -> None:
    """Alert on *job_result* if the watch policy calls for it.

    External notifications are sent first; afterwards an alert record is
    persisted with the list of channels that succeeded.  Failures in either
    step are logged and never propagate, so a database error cannot prevent
    delivery.
    """
    alert = build_alert_context(watch, job_result)
    if alert is None:
        return

    try:
        sent_via = await dispatch_alerts(watch, job_result)
    except Exception as err:
        logger.error("dispatch_alerts failed for %s: %s", watch.get("name"), err)
        sent_via = []

    try:
        _persist_alert_record(
            watch,
            alert["change_type"],
            alert["summary"],
            alert.get("diff_result") or {},
            sent_via,
        )
    except Exception as err:
        logger.warning("Failed to persist alert for %s: %s", watch.get("name"), err)
monitor/config.py ADDED
@@ -0,0 +1,118 @@
1
+ """
2
+ Load and validate monitor watch definitions from monitors.yaml.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import logging
8
+ import os
9
+ from pathlib import Path
10
+ from typing import Any, Optional
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+ _ALERT_ON = frozenset({"new_results", "any_change", "any_appearance"})
15
+ _TYPES = frozenset({"keyword", "url"})
16
+
17
+
18
+ def _yaml_path() -> Path:
19
+ env_path = os.getenv("MONITORS_CONFIG_PATH")
20
+ if env_path:
21
+ return Path(env_path)
22
+ return Path(__file__).resolve().parent.parent / "data" / "monitors.yaml"
23
+
24
+
25
+ def _as_float(val: Any, default: Optional[float] = None) -> Optional[float]:
26
+ try:
27
+ return float(val)
28
+ except (TypeError, ValueError):
29
+ return default
30
+
31
+
32
def _validate_and_normalize(raw: dict) -> dict | None:
    """Validate one raw watch mapping and return its normalized form.

    An entry is rejected (a warning is logged and None returned) when:

    * ``name`` is missing, not a string, or blank;
    * ``type`` is not one of ``_TYPES``;
    * ``interval_hours`` is not a finite number >= 0.5;
    * ``alert_on`` is not one of ``_ALERT_ON``;
    * the ``query`` (keyword watch) or ``url`` (url watch) is missing,
      not a string, or blank.

    Returns:
        A dict with ``name``, ``type``, ``interval_hours``, ``alert_on``,
        ``enabled``, the three optional channel settings, and the stripped
        ``query`` or ``url``; or None when the entry is invalid.
    """
    import math  # noqa: PLC0415

    name = raw.get("name")
    # A whitespace-only name would otherwise be normalized to "".
    if not isinstance(name, str) or not name.strip():
        logger.warning("Monitor entry skipped: missing or invalid 'name'")
        return None
    name = name.strip()

    # The isinstance checks keep unhashable YAML values (lists, mappings)
    # from raising TypeError in the frozenset membership tests.
    wtype = raw.get("type")
    if not isinstance(wtype, str) or wtype not in _TYPES:
        logger.warning("Monitor %r skipped: invalid type %r", name, wtype)
        return None

    interval = _as_float(raw.get("interval_hours"))
    # NaN compares False against every bound, so it must be rejected explicitly.
    if interval is None or not math.isfinite(interval) or interval < 0.5:
        logger.warning("Monitor %r skipped: interval_hours must be >= 0.5", name)
        return None

    alert_on = raw.get("alert_on")
    if not isinstance(alert_on, str) or alert_on not in _ALERT_ON:
        logger.warning("Monitor %r skipped: invalid alert_on %r", name, alert_on)
        return None

    if wtype == "keyword":
        target_key = "query"
        target = raw.get("query")
        if not isinstance(target, str) or not target.strip():
            logger.warning("Monitor %r skipped: keyword watch needs query", name)
            return None
    else:
        target_key = "url"
        target = raw.get("url")
        if not isinstance(target, str) or not target.strip():
            logger.warning("Monitor %r skipped: url watch needs url", name)
            return None

    out: dict[str, Any] = {
        "name": name,
        "type": wtype,
        "interval_hours": interval,
        "alert_on": alert_on,
        "enabled": bool(raw.get("enabled", True)),
        "webhook_url": raw.get("webhook_url"),
        "telegram_chat_id": raw.get("telegram_chat_id"),
        "email": raw.get("email"),
    }
    out[target_key] = target.strip()
    return out
81
+
82
+
83
def load_watches() -> list[dict]:
    """Read the monitors YAML file and return its valid watch definitions.

    Returns an empty list when the file is missing, PyYAML cannot be
    imported, the file cannot be read or parsed, or it has no ``watches``
    list.  Entries that are not mappings or fail validation are skipped.
    """
    config_file = _yaml_path()
    if not config_file.is_file():
        return []

    try:
        import yaml  # noqa: PLC0415
    except ImportError:
        logger.warning("PyYAML not installed; no watches loaded")
        return []

    try:
        document = yaml.safe_load(config_file.read_text(encoding="utf-8"))
    except Exception as err:
        logger.warning("Failed to read monitors.yaml: %s", err)
        return []

    if not (document and isinstance(document, dict)):
        return []
    entries = document.get("watches")
    if not (entries and isinstance(entries, list)):
        return []

    watches = []
    for entry in entries:
        if not isinstance(entry, dict):
            logger.warning("Monitor entry skipped: not a mapping")
            continue
        normalized = _validate_and_normalize(entry)
        if normalized:
            watches.append(normalized)
    return watches
112
+
113
+
114
def get_watch_by_name(name: str) -> dict | None:
    """Return the first loaded watch whose ``name`` equals *name*, or None."""
    matching = (watch for watch in load_watches() if watch.get("name") == name)
    return next(matching, None)
monitor/diff.py ADDED
@@ -0,0 +1,75 @@
1
+ """
2
+ Content change detection using difflib (unified diff + similarity ratio).
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import difflib
8
+ import hashlib
9
+ from typing import Any
10
+
11
+
12
def _similarity_ratio(
    old_text: str, new_text: str, char_level_budget: int = 1_000_000
) -> float:
    """Return a similarity score between 0.0 (nothing shared) and 1.0 (identical).

    Small inputs (``len(old) * len(new) <= char_level_budget``) are compared
    character by character.  Larger inputs are matched line by line and the
    matched lines are weighted by their length, which keeps the score on the
    same characters-in-common scale without the quadratic cost of a
    character-level comparison.

    ``autojunk`` is disabled in both modes: with the default heuristic,
    ``SequenceMatcher`` ignores frequently occurring elements once a sequence
    reaches 200 items, which makes near-identical long texts look unrelated.
    """
    total_chars = len(old_text) + len(new_text)
    if total_chars == 0:
        return 1.0

    if len(old_text) * len(new_text) <= char_level_budget:
        return difflib.SequenceMatcher(
            None, old_text, new_text, autojunk=False
        ).ratio()

    old_lines = old_text.splitlines(keepends=True)
    new_lines = new_text.splitlines(keepends=True)
    matcher = difflib.SequenceMatcher(None, old_lines, new_lines, autojunk=False)
    # Matched lines are equal on both sides, so counting the old side's
    # characters and doubling mirrors SequenceMatcher.ratio()'s 2*M/T formula.
    matched_chars = sum(
        len(old_lines[index])
        for block in matcher.get_matching_blocks()
        for index in range(block.a, block.a + block.size)
    )
    return 2.0 * matched_chars / total_chars


def compute_diff(old_text: str, new_text: str) -> dict[str, Any]:
    """Compare two text blobs.

    Returns:
        A dict with:

        * ``changed`` - False when the texts are equal;
        * ``content_hash_old`` / ``content_hash_new`` - SHA-256 hex digests
          of the UTF-8 encoded texts;
        * ``lines_added`` / ``lines_removed`` - counts of ``+`` / ``-`` lines
          in the unified diff;
        * ``diff_summary`` - first 500 characters of the unified diff;
        * ``change_ratio`` - 0.0 identical, 1.0 completely different.
    """
    content_hash_old = hashlib.sha256(old_text.encode("utf-8")).hexdigest()
    content_hash_new = hashlib.sha256(new_text.encode("utf-8")).hexdigest()

    if old_text == new_text:
        return {
            "changed": False,
            "content_hash_old": content_hash_old,
            "content_hash_new": content_hash_new,
            "lines_added": 0,
            "lines_removed": 0,
            "diff_summary": "",
            "change_ratio": 0.0,
        }

    diff_lines = list(
        difflib.unified_diff(
            old_text.splitlines(keepends=True),
            new_text.splitlines(keepends=True),
            fromfile="old",
            tofile="new",
            lineterm="",
        )
    )

    # unified_diff emits the "--- old" / "+++ new" header exactly once, as
    # its first two lines.  Skipping by position (instead of by prefix) keeps
    # changed lines whose own text starts with "--" or "++" in the counts.
    # Hunk headers start with "@@" and so match neither prefix below.
    hunk_lines = diff_lines[2:]
    lines_added = sum(1 for line in hunk_lines if line.startswith("+"))
    lines_removed = sum(1 for line in hunk_lines if line.startswith("-"))

    change_ratio = 1.0 - _similarity_ratio(old_text, new_text)

    # Content lines keep their original line endings; strip them so joining
    # with "\n" does not produce blank lines between entries.
    diff_summary = "\n".join(line.rstrip("\r\n") for line in diff_lines)[:500]

    return {
        "changed": True,
        "content_hash_old": content_hash_old,
        "content_hash_new": content_hash_new,
        "lines_added": lines_added,
        "lines_removed": lines_removed,
        "diff_summary": diff_summary,
        "change_ratio": float(change_ratio),
    }
68
+
69
+
70
def is_significant_change(diff: dict, threshold: float = 0.1) -> bool:
    """Report whether ``diff["change_ratio"]`` is at least *threshold*.

    A missing ratio counts as 0.0.  A ratio that cannot be converted or
    compared (TypeError / ValueError) yields False.
    """
    try:
        observed = float(diff.get("change_ratio", 0.0))
        return observed >= threshold
    except (TypeError, ValueError):
        return False