mnemox 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. ember/__init__.py +3 -0
  2. ember/__main__.py +129 -0
  3. ember/capture/__init__.py +6 -0
  4. ember/capture/drain.py +341 -0
  5. ember/capture/secrets.py +137 -0
  6. ember/cli.py +309 -0
  7. ember/config.py +180 -0
  8. ember/crypto/__init__.py +35 -0
  9. ember/crypto/_blind.py +70 -0
  10. ember/crypto/_cipher.py +122 -0
  11. ember/crypto/_kdf.py +58 -0
  12. ember/crypto/errors.py +13 -0
  13. ember/db/__init__.py +25 -0
  14. ember/db/migrate.py +32 -0
  15. ember/db/migrations/__init__.py +1 -0
  16. ember/db/migrations/_migration_0001.py +51 -0
  17. ember/db/migrations/_migration_0002.py +192 -0
  18. ember/embeddings/__init__.py +8 -0
  19. ember/embeddings/engine.py +201 -0
  20. ember/embeddings/tokenizer.py +89 -0
  21. ember/extraction/__init__.py +5 -0
  22. ember/extraction/local.py +81 -0
  23. ember/invalidation/__init__.py +15 -0
  24. ember/invalidation/cosine.py +140 -0
  25. ember/keystore/__init__.py +27 -0
  26. ember/keystore/_backend.py +256 -0
  27. ember/keystore/_salt.py +40 -0
  28. ember/keystore/errors.py +9 -0
  29. ember/mcp/__init__.py +5 -0
  30. ember/mcp/tools.py +510 -0
  31. ember/mirror/__init__.py +5 -0
  32. ember/mirror/render.py +143 -0
  33. ember/models/__init__.py +1 -0
  34. ember/models/cache.py +40 -0
  35. ember/models/loader.py +54 -0
  36. ember/modes/__init__.py +19 -0
  37. ember/modes/scope.py +243 -0
  38. ember/oplog/__init__.py +19 -0
  39. ember/oplog/backfill.py +98 -0
  40. ember/oplog/codec.py +87 -0
  41. ember/oplog/entry.py +35 -0
  42. ember/oplog/producer.py +192 -0
  43. ember/ranking/__init__.py +7 -0
  44. ember/ranking/cosine.py +12 -0
  45. ember/ranking/fusion.py +78 -0
  46. ember/ranking/result.py +18 -0
  47. ember/retrieval/__init__.py +5 -0
  48. ember/retrieval/_search.py +143 -0
  49. ember/retrieval/bm25.py +88 -0
  50. ember/retrieval/tiers.py +101 -0
  51. ember/schema/__init__.py +1 -0
  52. ember/server.py +156 -0
  53. ember/storage/__init__.py +31 -0
  54. ember/storage/paths.py +33 -0
  55. ember/storage/store.py +210 -0
  56. ember/sync/__init__.py +32 -0
  57. ember/sync/_aad.py +23 -0
  58. ember/sync/client.py +382 -0
  59. ember/sync/cursor.py +76 -0
  60. ember/sync/errors.py +22 -0
  61. ember/sync/merge.py +197 -0
  62. ember/sync/oplog_store.py +169 -0
  63. ember/sync/pull.py +53 -0
  64. ember/sync/push.py +42 -0
  65. ember/sync/transport.py +65 -0
  66. ember/vec/__init__.py +19 -0
  67. ember/vec/_build.py +151 -0
  68. ember/vec/_gate.py +66 -0
  69. ember/vec/_meta.py +67 -0
  70. ember/vec/_query.py +86 -0
  71. ember/vec/index.py +98 -0
  72. ember/write/__init__.py +6 -0
  73. ember/write/flush.py +197 -0
  74. ember/write/pipeline.py +104 -0
  75. mnemox-0.2.0.dist-info/METADATA +329 -0
  76. mnemox-0.2.0.dist-info/RECORD +80 -0
  77. mnemox-0.2.0.dist-info/WHEEL +5 -0
  78. mnemox-0.2.0.dist-info/entry_points.txt +2 -0
  79. mnemox-0.2.0.dist-info/licenses/LICENSE +21 -0
  80. mnemox-0.2.0.dist-info/top_level.txt +1 -0
ember/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """Ember — local-first memory system."""
2
+
3
+ __version__ = "0.2.0"
ember/__main__.py ADDED
@@ -0,0 +1,129 @@
1
+ """Entry point for ``python -m ember``.
2
+
3
+ Usage::
4
+
5
+ python -m ember mirror --project-id <id> [--output-dir memory] [--db-path /path/to.db]
6
+ python -m ember serve
7
+ python -m ember init [--mode project|global|server]
8
+ python -m ember --version
9
+ """
10
+
11
+ import argparse
12
+ import sys
13
+
14
+ from ember.cli import cmd_backfill, cmd_drain, cmd_init, get_version
15
+
16
+
17
def _cmd_mirror(args: argparse.Namespace) -> None:
    """Handle ``ember mirror``: render a project's records to Markdown files.

    Reads ``args.db_path``, ``args.project_id`` and ``args.output_dir``.
    The store is always closed, even if rendering raises. The value returned
    by ``render_to_md`` is treated as a mapping whose values are summed for
    the atom total and whose length is reported as the file count.
    """
    from ember.storage import store as _store_factory
    from ember.mirror import render_to_md

    # An empty --db-path is treated like "not given" so the factory picks
    # its own default location.
    db = _store_factory(args.db_path or None)
    try:
        per_file = render_to_md(args.project_id, args.output_dir, store=db)
    finally:
        db.close()

    atom_count = sum(per_file.values())
    file_count = len(per_file)
    print(f"mirrored {atom_count} atoms across {file_count} files")
30
+
31
+
32
def _cmd_serve(args: argparse.Namespace) -> None:
    """Handle ``ember serve``: run the stdio server coroutine to completion.

    ``args`` is accepted so every sub-command handler shares one signature;
    it is not read here.
    """
    import asyncio
    from ember.server import run_stdio

    server_coro = run_stdio()
    asyncio.run(server_coro)
37
+
38
+
39
def _build_parser() -> argparse.ArgumentParser:
    """Construct the ``ember`` argument parser with all of its sub-commands."""
    parser = argparse.ArgumentParser(
        prog="ember", description="Ember local-first memory system."
    )
    parser.add_argument(
        "--version", action="version", version=f"ember {get_version()}"
    )
    commands = parser.add_subparsers(dest="command", metavar="<command>")

    # mirror
    mirror = commands.add_parser(
        "mirror", help="Render Ember records to memory/*.md files."
    )
    mirror.add_argument(
        "--project-id", required=True, help="Ember project identifier."
    )
    mirror.add_argument(
        "--output-dir",
        default="memory",
        help="Output directory for .md files (default: memory).",
    )
    mirror.add_argument(
        "--db-path",
        default=None,
        help="Path to the Ember SQLite DB (default: resolved via EMBER_DB_PATH or ~/.ember/ember.db).",
    )

    # serve
    commands.add_parser("serve", help="Start the Ember MCP server over stdio.")

    # init
    init = commands.add_parser(
        "init", help="Initialise the Ember store and write config.json."
    )
    init.add_argument(
        "--mode",
        choices=["project", "global", "server"],
        default=None,
        help="Initialisation mode (default: project).",
    )

    # drain
    drain_cmd = commands.add_parser(
        "drain", help="Drain the capture queue and write records to the store."
    )
    drain_cmd.add_argument(
        "--queue-path",
        default=None,
        dest="queue_path",
        help="Path to capture-queue.jsonl (default: <project-root>/.ember/capture-queue.jsonl).",
    )
    drain_cmd.add_argument(
        "--db-path",
        default=None,
        dest="db_path",
        help="Path to the Ember SQLite DB (default: resolved via EMBER_DB_PATH or mode).",
    )
    drain_cmd.add_argument(
        "--project-id",
        default=None,
        dest="project_id",
        help="Bound project scope; only records with this project_id are written.",
    )

    # backfill
    backfill = commands.add_parser(
        "backfill",
        help="Re-emit pre-existing atoms into the oplog so they sync to the hub.",
    )
    backfill.add_argument(
        "--db-path",
        default=None,
        dest="db_path",
        help="Path to the Ember SQLite DB (default: resolved via EMBER_DB_PATH or mode).",
    )
    backfill.add_argument(
        "--project-id",
        default=None,
        dest="project_id",
        help="Project scope to backfill (default: derived from the project root).",
    )

    return parser


def main() -> None:
    """Parse the command line and dispatch to the selected sub-command.

    With no sub-command, prints the help text and exits with status 1.
    ``mirror`` and ``serve`` return normally; ``init``, ``drain`` and
    ``backfill`` exit the process with the value returned by their handler.
    """
    parser = _build_parser()
    args = parser.parse_args()
    command = args.command

    if command is None:
        parser.print_help()
        sys.exit(1)

    if command == "mirror":
        _cmd_mirror(args)
        return
    if command == "serve":
        _cmd_serve(args)
        return
    if command == "init":
        sys.exit(cmd_init(args.mode))
    if command == "drain":
        sys.exit(cmd_drain(args.queue_path, args.db_path, args.project_id))
    if command == "backfill":
        sys.exit(cmd_backfill(args.db_path, args.project_id))


if __name__ == "__main__":
    main()
ember/capture/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Ember capture: drain queue consumer + secret scanner."""
2
+
3
+ from .drain import drain
4
+ from .secrets import looks_like_secret
5
+
6
+ __all__ = ["drain", "looks_like_secret"]
ember/capture/drain.py ADDED
@@ -0,0 +1,341 @@
1
+ """Drain the ember capture queue into the write pipeline.
2
+
3
+ Reads .ember/capture-queue.jsonl, validates each record, filters secrets and
4
+ foreign project_ids, then calls ember.write.write() for each valid record.
5
+
6
+ Durability model (atomic rename):
7
+ 1. queue absent/empty → zero-counts no-op.
8
+ 2. os.replace(queue, queue+".processing") — atomic rename; producer keeps
9
+ appending to the original path without data loss.
10
+ 3. Read .processing line-by-line, write() each valid record.
11
+ 4. Clean completion → os.remove(.processing) (== rotation/truncation).
12
+ 5. Stale .processing at entry → drain it first.
13
+ If write() raises mid-drain: leave .processing in place and re-raise after
14
+ logging; next run reprocesses the whole batch (may double-write succeeded
15
+ records; accepted — cosine invalidate_superseded mitigates duplicates).
16
+ Skips (malformed/foreign/secret) never leave .processing.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import json
22
+ import os
23
+ import re
24
+ import sys
25
+ from datetime import datetime, timezone
26
+ from pathlib import Path
27
+ from typing import TYPE_CHECKING
28
+
29
+ if TYPE_CHECKING:
30
+ from ember.storage.store import Store
31
+
32
+ # ---------------------------------------------------------------------------
33
+ # Timestamp validation
34
+ # ---------------------------------------------------------------------------
35
+
36
+ # Accepts second-precision (ember-capture.sh) and millisecond-precision (DB DEFAULT).
37
+ # Pattern: YYYY-MM-DDTHH:MM:SS[.fraction]Z
38
+ _TS_RE = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?Z$")
39
+
40
+
41
+ def _validate_ts(ts: object) -> "str | None":
42
+ """Validate an ISO-8601 UTC timestamp string.
43
+
44
+ Returns the original string if valid, else None.
45
+ Accepts second-precision (YYYY-MM-DDTHH:MM:SSZ) and millisecond-precision
46
+ (YYYY-MM-DDTHH:MM:SS.fffZ).
47
+ """
48
+ if not isinstance(ts, str):
49
+ return None
50
+ if not _TS_RE.match(ts):
51
+ return None
52
+ # Round-trip validation (catches out-of-range dates like month=13)
53
+ try:
54
+ # Normalise to a form datetime.strptime can parse.
55
+ ts_clean = ts.rstrip("Z")
56
+ if "." in ts_clean:
57
+ # Truncate/pad fractional to 6 digits for %f
58
+ parts = ts_clean.split(".")
59
+ frac = parts[1][:6].ljust(6, "0")
60
+ ts_clean = f"{parts[0]}.{frac}"
61
+ datetime.strptime(ts_clean, "%Y-%m-%dT%H:%M:%S.%f").replace(
62
+ tzinfo=timezone.utc
63
+ )
64
+ else:
65
+ datetime.strptime(ts_clean, "%Y-%m-%dT%H:%M:%S").replace(
66
+ tzinfo=timezone.utc
67
+ )
68
+ except ValueError:
69
+ return None
70
+ return ts
71
+
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # Constants
75
+ # ---------------------------------------------------------------------------
76
+
77
+ _MAX_LINE_BYTES = 1 * 1024 * 1024 # 1 MB
78
+
79
+
80
+ # ---------------------------------------------------------------------------
81
+ # Public API
82
+ # ---------------------------------------------------------------------------
83
+
84
+
85
def drain(
    queue_path: "str | Path | None" = None,
    project_id: "str | None" = None,
    *,
    store: "Store | None" = None,
) -> dict:
    """Drain the capture queue into the store and report what happened.

    Args:
        queue_path: Location of capture-queue.jsonl. When None, the path is
            ``<project root>/.ember/capture-queue.jsonl`` with the root
            supplied by ``ember.config.get_project_root()``.
        project_id: Project scope to bind to. When None, the
            ``EMBER_PROJECT_ID`` environment variable is consulted; if that
            is unset or empty the drain is unbound and accepts any record.
        store: Store to write into. When None, one is opened for this call
            and closed before returning (also on error).

    Returns:
        dict with integer counters under the keys ``written``,
        ``skipped_malformed``, ``skipped_secret`` and ``skipped_foreign``.
    """
    from ember.capture.secrets import looks_like_secret
    from ember.write import write
    from ember.storage import store as open_store

    tallies: dict[str, int] = dict.fromkeys(
        ("written", "skipped_malformed", "skipped_secret", "skipped_foreign"),
        0,
    )

    # Where is the queue?
    if queue_path is not None:
        queue_file = Path(queue_path)
    else:
        from ember.config import get_project_root

        queue_file = Path(get_project_root()) / ".ember" / "capture-queue.jsonl"
    in_flight = Path(f"{queue_file}.processing")

    # Which project (if any) are we bound to?
    bound_project = project_id
    if bound_project is None:
        bound_project = os.environ.get("EMBER_PROJECT_ID") or None

    # Only close the store if this call opened it.
    owns_store = store is None
    active_store = open_store() if owns_store else store

    try:
        _drain_inner(
            queue_file,
            in_flight,
            bound_project,
            active_store,
            tallies,
            looks_like_secret,
            write,
        )
    finally:
        if owns_store:
            active_store.close()

    return tallies
150
+
151
+
152
+ def _drain_inner(
153
+ queue: Path,
154
+ processing: Path,
155
+ project_id: "str | None",
156
+ store: "Store",
157
+ counts: dict,
158
+ looks_like_secret_fn,
159
+ write_fn,
160
+ ) -> None:
161
+ """Core drain logic (separated for testability)."""
162
+
163
+ # Handle stale .processing from a prior crash — drain it first.
164
+ if processing.exists():
165
+ _drain_file(
166
+ processing,
167
+ project_id,
168
+ store,
169
+ counts,
170
+ looks_like_secret_fn,
171
+ write_fn,
172
+ )
173
+ # If drain succeeds, remove stale processing file. If removal fails,
174
+ # return early to avoid os.replace(queue, processing) overwriting the
175
+ # not-yet-removed file with a fresh batch.
176
+ try:
177
+ os.remove(processing)
178
+ except OSError as exc:
179
+ print(
180
+ f"ember drain: warning: could not remove stale .processing: {exc}",
181
+ file=sys.stderr,
182
+ )
183
+ return
184
+
185
+ # Check if queue file exists and is non-empty.
186
+ if not queue.exists():
187
+ return
188
+ if queue.stat().st_size == 0:
189
+ return
190
+
191
+ # Atomic rename: producer can keep writing to the original path.
192
+ try:
193
+ os.replace(queue, processing)
194
+ except OSError as exc:
195
+ print(
196
+ f"ember drain: error: could not rename queue to processing: {exc}",
197
+ file=sys.stderr,
198
+ )
199
+ return
200
+
201
+ # Drain the .processing file.
202
+ _drain_file(
203
+ processing,
204
+ project_id,
205
+ store,
206
+ counts,
207
+ looks_like_secret_fn,
208
+ write_fn,
209
+ )
210
+
211
+ # Clean completion: remove .processing.
212
+ try:
213
+ os.remove(processing)
214
+ except OSError as exc:
215
+ print(
216
+ f"ember drain: warning: could not remove .processing after drain: {exc}",
217
+ file=sys.stderr,
218
+ )
219
+
220
+
221
+ def _drain_file(
222
+ path: Path,
223
+ project_id: "str | None",
224
+ store: "Store",
225
+ counts: dict,
226
+ looks_like_secret_fn,
227
+ write_fn,
228
+ ) -> None:
229
+ """Read path line-by-line and process each record.
230
+
231
+ Raises if write_fn raises (caller handles cleanup). Skips do NOT raise.
232
+ """
233
+ try:
234
+ data = path.read_bytes()
235
+ except OSError as exc:
236
+ print(
237
+ f"ember drain: error: could not read {path}: {exc}",
238
+ file=sys.stderr,
239
+ )
240
+ return
241
+
242
+ for raw_line in data.split(b"\n"):
243
+ line = raw_line.strip()
244
+ if not line:
245
+ continue
246
+
247
+ # 1. Size guard
248
+ if len(line) > _MAX_LINE_BYTES:
249
+ print(
250
+ "ember drain: skipped malformed record: line exceeds 1MB",
251
+ file=sys.stderr,
252
+ )
253
+ counts["skipped_malformed"] += 1
254
+ continue
255
+
256
+ # 2. JSON parse
257
+ try:
258
+ record = json.loads(line)
259
+ except json.JSONDecodeError as exc:
260
+ print(
261
+ f"ember drain: skipped malformed record: JSONDecodeError: {exc}",
262
+ file=sys.stderr,
263
+ )
264
+ counts["skipped_malformed"] += 1
265
+ continue
266
+
267
+ if not isinstance(record, dict):
268
+ print(
269
+ "ember drain: skipped malformed record: not a JSON object",
270
+ file=sys.stderr,
271
+ )
272
+ counts["skipped_malformed"] += 1
273
+ continue
274
+
275
+ # 3. Required fields: content (non-empty str), project_id (str)
276
+ content = record.get("content")
277
+ rec_project_id = record.get("project_id")
278
+
279
+ if not isinstance(content, str) or not content.strip():
280
+ print(
281
+ "ember drain: skipped malformed record: missing or empty content",
282
+ file=sys.stderr,
283
+ )
284
+ counts["skipped_malformed"] += 1
285
+ continue
286
+
287
+ if not isinstance(rec_project_id, str) or not rec_project_id:
288
+ print(
289
+ "ember drain: skipped malformed record: missing or invalid project_id",
290
+ file=sys.stderr,
291
+ )
292
+ counts["skipped_malformed"] += 1
293
+ continue
294
+
295
+ # 4. Bound-scope check
296
+ if project_id is not None and rec_project_id != project_id:
297
+ print(
298
+ f"ember drain: skipped foreign record: project_id={rec_project_id!r} "
299
+ f"(bound to {project_id!r})",
300
+ file=sys.stderr,
301
+ )
302
+ counts["skipped_foreign"] += 1
303
+ continue
304
+
305
+ # 5. Secret scanner — log WITHOUT echoing content
306
+ if looks_like_secret_fn(content):
307
+ print(
308
+ "ember drain: skipped secret record: content looks like a credential",
309
+ file=sys.stderr,
310
+ )
311
+ counts["skipped_secret"] += 1
312
+ continue
313
+
314
+ # 6. Validate metadata
315
+ raw_metadata = record.get("metadata")
316
+ metadata = raw_metadata if isinstance(raw_metadata, dict) else None
317
+
318
+ # 7. Validate type
319
+ raw_type = record.get("type")
320
+ rec_type = raw_type if isinstance(raw_type, str) and raw_type else "diary"
321
+
322
+ # 8. Validate event timestamp
323
+ valid_ts = _validate_ts(record.get("ts"))
324
+ created_at = valid_ts # None if invalid → write() uses SQL DEFAULT now
325
+
326
+ if record.get("ts") and valid_ts is None:
327
+ print(
328
+ "ember drain: malformed ts for record — using drain time as created_at",
329
+ file=sys.stderr,
330
+ )
331
+
332
+ # 9. Write (may raise — caller handles cleanup)
333
+ write_fn(
334
+ content,
335
+ metadata=metadata,
336
+ project_id=rec_project_id,
337
+ store=store,
338
+ type=rec_type,
339
+ created_at=created_at,
340
+ )
341
+ counts["written"] += 1
ember/capture/secrets.py ADDED
@@ -0,0 +1,137 @@
1
+ """Secret scanner for ember capture drain.
2
+
3
+ Detects credentials/secrets in content strings and prevents them from being
4
+ written to the store. Conservative: over-skip is acceptable; under-skip is not.
5
+
6
+ All patterns compiled at module level for performance.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import math
12
+ import re
13
+
14
+ # ---------------------------------------------------------------------------
15
+ # Compiled patterns
16
+ # ---------------------------------------------------------------------------
17
+
18
+ # PEM private key block
19
+ _RE_PEM = re.compile(r"-----BEGIN\s+(?:\w+\s+)*PRIVATE KEY-----", re.IGNORECASE)
20
+
21
+ # OpenAI API key: sk- followed by 20+ alphanumeric chars
22
+ _RE_OPENAI = re.compile(r"\bsk-[A-Za-z0-9]{20,}\b")
23
+
24
+ # AWS Access Key ID
25
+ _RE_AWS_KEY = re.compile(r"\bAKIA[0-9A-Z]{16}\b")
26
+
27
+ # AWS secret access key assignment
28
+ _RE_AWS_SECRET = re.compile(r"aws_secret_access_key\s*[=:]\s*\S{30,}", re.IGNORECASE)
29
+
30
+ # Bearer token (Authorization header style)
31
+ _RE_BEARER = re.compile(r"\bBearer\s+[A-Za-z0-9._\-]{20,}\b")
32
+
33
+ # JWT: eyJ...eyJ.... (header.payload.signature pattern)
34
+ _RE_JWT = re.compile(r"\beyJ[\w-]+\.eyJ[\w-]+\.[\w-]+\b")
35
+
36
+ # .env-style assignment with keyword gate: KEY=value (8+ char value)
37
+ # Matches KEY=VALUE assignments anywhere on a line — after 'export ', after
38
+ # leading whitespace, or at column 0. The keyword gate requires one of the
39
+ # sensitive keywords inside the variable name. No start-of-line anchor so
40
+ # "export API_KEY=value" and " SECRET_TOKEN=value" are caught.
41
+ # Keywords: KEY|TOKEN|SECRET|PASSWORD|PASSWD|PWD|API|CREDENTIAL|PRIVATE
42
+ _RE_DOTENV = re.compile(
43
+ r"(?<![A-Za-z0-9_])(?:[A-Z][A-Z0-9_]*)?(?:KEY|TOKEN|SECRET|PASSWORD|PASSWD|PWD|API|CREDENTIAL|PRIVATE)"
44
+ r"[A-Z0-9_]*\s*=\s*\S{8,}",
45
+ re.IGNORECASE,
46
+ )
47
+
48
+ # Generic high-entropy hex string: 32+ hex chars
49
+ _RE_HEX32 = re.compile(r"\b[0-9a-fA-F]{32,}\b")
50
+
51
+ # Generic high-entropy base64 string: 40+ base64 chars
52
+ _RE_B64_40 = re.compile(r"[A-Za-z0-9+/]{40,}={0,2}")
53
+
54
+ # Entropy threshold (bits per character)
55
+ _ENTROPY_THRESHOLD = 3.5
56
+
57
+
58
+ def _shannon_entropy(s: str) -> float:
59
+ """Compute Shannon entropy of a string in bits per character."""
60
+ if not s:
61
+ return 0.0
62
+ freq: dict[str, int] = {}
63
+ for c in s:
64
+ freq[c] = freq.get(c, 0) + 1
65
+ n = len(s)
66
+ entropy = 0.0
67
+ for count in freq.values():
68
+ p = count / n
69
+ entropy -= p * math.log2(p)
70
+ return entropy
71
+
72
+
73
def _has_high_entropy_hex(content: str) -> bool:
    """Report whether any 32+ character hex run in ``content`` exceeds the entropy threshold."""
    return any(
        _shannon_entropy(hit.group()) > _ENTROPY_THRESHOLD
        for hit in _RE_HEX32.finditer(content)
    )
79
+
80
+
81
def _has_high_entropy_b64(content: str) -> bool:
    """Report whether any 40+ character base64 run in ``content`` exceeds the entropy threshold.

    Trailing ``=`` padding is dropped before the length and entropy checks.
    """
    for hit in _RE_B64_40.finditer(content):
        body = hit.group().rstrip("=")
        if len(body) < 40:
            continue
        if _shannon_entropy(body) > _ENTROPY_THRESHOLD:
            return True
    return False
88
+
89
+
90
def looks_like_secret(content: str) -> bool:
    """Decide whether ``content`` appears to contain a credential or secret.

    Deliberately conservative: a false positive (over-skip) is acceptable.
    This function never logs or echoes ``content``, and callers must not log
    it either when the result is True.

    Detectors, tried in this order until one fires:
    1. PEM private key block
    2. OpenAI API key (sk-...)
    3. AWS AKIA access key ID
    4. AWS secret access key assignment
    5. Bearer token
    6. JWT (eyJ...eyJ...signature)
    7. .env-style KEY=value with secret keyword
    8. High-entropy hex32+ string
    9. High-entropy base64-40+ string

    Returns False for empty content.
    """
    if not content:
        return False

    # Cheap pattern-only detectors first, in the documented order.
    pattern_detectors = (
        _RE_PEM,
        _RE_OPENAI,
        _RE_AWS_KEY,
        _RE_AWS_SECRET,
        _RE_BEARER,
        _RE_JWT,
        _RE_DOTENV,
    )
    for pattern in pattern_detectors:
        if pattern.search(content):
            return True

    # Entropy-based detectors last.
    return _has_high_entropy_hex(content) or _has_high_entropy_b64(content)