mnemox 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ember/__init__.py +3 -0
- ember/__main__.py +129 -0
- ember/capture/__init__.py +6 -0
- ember/capture/drain.py +341 -0
- ember/capture/secrets.py +137 -0
- ember/cli.py +309 -0
- ember/config.py +180 -0
- ember/crypto/__init__.py +35 -0
- ember/crypto/_blind.py +70 -0
- ember/crypto/_cipher.py +122 -0
- ember/crypto/_kdf.py +58 -0
- ember/crypto/errors.py +13 -0
- ember/db/__init__.py +25 -0
- ember/db/migrate.py +32 -0
- ember/db/migrations/__init__.py +1 -0
- ember/db/migrations/_migration_0001.py +51 -0
- ember/db/migrations/_migration_0002.py +192 -0
- ember/embeddings/__init__.py +8 -0
- ember/embeddings/engine.py +201 -0
- ember/embeddings/tokenizer.py +89 -0
- ember/extraction/__init__.py +5 -0
- ember/extraction/local.py +81 -0
- ember/invalidation/__init__.py +15 -0
- ember/invalidation/cosine.py +140 -0
- ember/keystore/__init__.py +27 -0
- ember/keystore/_backend.py +256 -0
- ember/keystore/_salt.py +40 -0
- ember/keystore/errors.py +9 -0
- ember/mcp/__init__.py +5 -0
- ember/mcp/tools.py +510 -0
- ember/mirror/__init__.py +5 -0
- ember/mirror/render.py +143 -0
- ember/models/__init__.py +1 -0
- ember/models/cache.py +40 -0
- ember/models/loader.py +54 -0
- ember/modes/__init__.py +19 -0
- ember/modes/scope.py +243 -0
- ember/oplog/__init__.py +19 -0
- ember/oplog/backfill.py +98 -0
- ember/oplog/codec.py +87 -0
- ember/oplog/entry.py +35 -0
- ember/oplog/producer.py +192 -0
- ember/ranking/__init__.py +7 -0
- ember/ranking/cosine.py +12 -0
- ember/ranking/fusion.py +78 -0
- ember/ranking/result.py +18 -0
- ember/retrieval/__init__.py +5 -0
- ember/retrieval/_search.py +143 -0
- ember/retrieval/bm25.py +88 -0
- ember/retrieval/tiers.py +101 -0
- ember/schema/__init__.py +1 -0
- ember/server.py +156 -0
- ember/storage/__init__.py +31 -0
- ember/storage/paths.py +33 -0
- ember/storage/store.py +210 -0
- ember/sync/__init__.py +32 -0
- ember/sync/_aad.py +23 -0
- ember/sync/client.py +382 -0
- ember/sync/cursor.py +76 -0
- ember/sync/errors.py +22 -0
- ember/sync/merge.py +197 -0
- ember/sync/oplog_store.py +169 -0
- ember/sync/pull.py +53 -0
- ember/sync/push.py +42 -0
- ember/sync/transport.py +65 -0
- ember/vec/__init__.py +19 -0
- ember/vec/_build.py +151 -0
- ember/vec/_gate.py +66 -0
- ember/vec/_meta.py +67 -0
- ember/vec/_query.py +86 -0
- ember/vec/index.py +98 -0
- ember/write/__init__.py +6 -0
- ember/write/flush.py +197 -0
- ember/write/pipeline.py +104 -0
- mnemox-0.2.0.dist-info/METADATA +329 -0
- mnemox-0.2.0.dist-info/RECORD +80 -0
- mnemox-0.2.0.dist-info/WHEEL +5 -0
- mnemox-0.2.0.dist-info/entry_points.txt +2 -0
- mnemox-0.2.0.dist-info/licenses/LICENSE +21 -0
- mnemox-0.2.0.dist-info/top_level.txt +1 -0
ember/__init__.py
ADDED
ember/__main__.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""Entry point for ``python -m ember``.
|
|
2
|
+
|
|
3
|
+
Usage::
|
|
4
|
+
|
|
5
|
+
python -m ember mirror --project-id <id> [--output-dir memory] [--db-path /path/to.db]
|
|
6
|
+
python -m ember serve
|
|
7
|
+
python -m ember init [--mode project|global|server]
|
|
8
|
+
python -m ember --version
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import sys
|
|
13
|
+
|
|
14
|
+
from ember.cli import cmd_backfill, cmd_drain, cmd_init, get_version
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _cmd_mirror(args: argparse.Namespace) -> None:
    """Handle ``ember mirror``: render a project's records and print a summary.

    Opens a store for ``args.db_path`` (a falsy value is forwarded as
    ``None``), calls ``render_to_md`` for ``args.project_id`` with
    ``args.output_dir`` as the target, and closes the store even when
    rendering raises.
    """
    from ember.storage import store as _store_factory
    from ember.mirror import render_to_md

    db_path = args.db_path if args.db_path else None
    handle = _store_factory(db_path)
    try:
        per_file = render_to_md(args.project_id, args.output_dir, store=handle)
    finally:
        handle.close()

    # presumably a mapping of output file -> atom count; verify against
    # render_to_md.
    atom_total = sum(per_file.values())
    print(f"mirrored {atom_total} atoms across {len(per_file)} files")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _cmd_serve(args: argparse.Namespace) -> None:
    """Handle ``ember serve``: run the stdio server until it returns.

    ``args`` is accepted so the handler has the same shape as the other
    sub-command handlers; it is not read.
    """
    import asyncio
    from ember.server import run_stdio

    server_coro = run_stdio()
    asyncio.run(server_coro)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Build the ``ember`` argument parser with every sub-command registered."""
    parser = argparse.ArgumentParser(
        prog="ember",
        description="Ember local-first memory system.",
    )
    parser.add_argument(
        "--version",
        action="version",
        version=f"ember {get_version()}",
    )
    sub = parser.add_subparsers(dest="command", metavar="<command>")

    mirror_p = sub.add_parser("mirror", help="Render Ember records to memory/*.md files.")
    mirror_p.add_argument("--project-id", required=True, help="Ember project identifier.")
    mirror_p.add_argument(
        "--output-dir",
        default="memory",
        help="Output directory for .md files (default: memory).",
    )
    mirror_p.add_argument(
        "--db-path",
        default=None,
        help="Path to the Ember SQLite DB (default: resolved via EMBER_DB_PATH or ~/.ember/ember.db).",
    )

    sub.add_parser("serve", help="Start the Ember MCP server over stdio.")

    init_p = sub.add_parser("init", help="Initialise the Ember store and write config.json.")
    init_p.add_argument(
        "--mode",
        choices=["project", "global", "server"],
        default=None,
        help="Initialisation mode (default: project).",
    )

    drain_p = sub.add_parser("drain", help="Drain the capture queue and write records to the store.")
    drain_p.add_argument(
        "--queue-path",
        default=None,
        dest="queue_path",
        help="Path to capture-queue.jsonl (default: <project-root>/.ember/capture-queue.jsonl).",
    )
    drain_p.add_argument(
        "--db-path",
        default=None,
        dest="db_path",
        help="Path to the Ember SQLite DB (default: resolved via EMBER_DB_PATH or mode).",
    )
    drain_p.add_argument(
        "--project-id",
        default=None,
        dest="project_id",
        help="Bound project scope; only records with this project_id are written.",
    )

    backfill_p = sub.add_parser(
        "backfill",
        help="Re-emit pre-existing atoms into the oplog so they sync to the hub.",
    )
    backfill_p.add_argument(
        "--db-path",
        default=None,
        dest="db_path",
        help="Path to the Ember SQLite DB (default: resolved via EMBER_DB_PATH or mode).",
    )
    backfill_p.add_argument(
        "--project-id",
        default=None,
        dest="project_id",
        help="Project scope to backfill (default: derived from the project root).",
    )
    return parser


def main(argv: "list[str] | None" = None) -> None:
    """Parse the command line and dispatch to the selected sub-command.

    Args:
        argv: Argument list to parse, without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``, which is
            what the ``python -m ember`` entry point relies on. Passing a
            list lets callers and tests drive the CLI in-process.

    Exit behaviour:
        * No sub-command: prints help and exits with status 1.
        * ``mirror`` / ``serve``: run the handler and return normally.
        * ``init`` / ``drain`` / ``backfill``: exit with the value the
          corresponding ``cmd_*`` function returns.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)
    if args.command is None:
        parser.print_help()
        sys.exit(1)

    if args.command == "mirror":
        _cmd_mirror(args)
    elif args.command == "serve":
        _cmd_serve(args)
    elif args.command == "init":
        sys.exit(cmd_init(args.mode))
    elif args.command == "drain":
        sys.exit(cmd_drain(args.queue_path, args.db_path, args.project_id))
    elif args.command == "backfill":
        sys.exit(cmd_backfill(args.db_path, args.project_id))


if __name__ == "__main__":
    main()
|
ember/capture/drain.py
ADDED
|
@@ -0,0 +1,341 @@
|
|
|
1
|
+
"""Drain the ember capture queue into the write pipeline.
|
|
2
|
+
|
|
3
|
+
Reads .ember/capture-queue.jsonl, validates each record, filters secrets and
|
|
4
|
+
foreign project_ids, then calls ember.write.write() for each valid record.
|
|
5
|
+
|
|
6
|
+
Durability model (atomic rename):
|
|
7
|
+
1. queue absent/empty → zero-counts no-op.
|
|
8
|
+
2. os.replace(queue, queue+".processing") — atomic rename; producer keeps
|
|
9
|
+
appending to the original path without data loss.
|
|
10
|
+
3. Read .processing line-by-line, write() each valid record.
|
|
11
|
+
4. Clean completion → os.remove(.processing) (== rotation/truncation).
|
|
12
|
+
5. Stale .processing at entry → drain it first.
|
|
13
|
+
If write() raises mid-drain: leave .processing in place and re-raise after
|
|
14
|
+
logging; next run reprocesses the whole batch (may double-write succeeded
|
|
15
|
+
records; accepted — cosine invalidate_superseded mitigates duplicates).
|
|
16
|
+
Skips (malformed/foreign/secret) never leave .processing.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import json
|
|
22
|
+
import os
|
|
23
|
+
import re
|
|
24
|
+
import sys
|
|
25
|
+
from datetime import datetime, timezone
|
|
26
|
+
from pathlib import Path
|
|
27
|
+
from typing import TYPE_CHECKING
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
from ember.storage.store import Store
|
|
31
|
+
|
|
32
|
+
# ---------------------------------------------------------------------------
|
|
33
|
+
# Timestamp validation
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
# Accepts second-precision (ember-capture.sh) and millisecond-precision (DB DEFAULT).
|
|
37
|
+
# Pattern: YYYY-MM-DDTHH:MM:SS[.fraction]Z
|
|
38
|
+
_TS_RE = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?Z$")
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _validate_ts(ts: object) -> "str | None":
|
|
42
|
+
"""Validate an ISO-8601 UTC timestamp string.
|
|
43
|
+
|
|
44
|
+
Returns the original string if valid, else None.
|
|
45
|
+
Accepts second-precision (YYYY-MM-DDTHH:MM:SSZ) and millisecond-precision
|
|
46
|
+
(YYYY-MM-DDTHH:MM:SS.fffZ).
|
|
47
|
+
"""
|
|
48
|
+
if not isinstance(ts, str):
|
|
49
|
+
return None
|
|
50
|
+
if not _TS_RE.match(ts):
|
|
51
|
+
return None
|
|
52
|
+
# Round-trip validation (catches out-of-range dates like month=13)
|
|
53
|
+
try:
|
|
54
|
+
# Normalise to a form datetime.strptime can parse.
|
|
55
|
+
ts_clean = ts.rstrip("Z")
|
|
56
|
+
if "." in ts_clean:
|
|
57
|
+
# Truncate/pad fractional to 6 digits for %f
|
|
58
|
+
parts = ts_clean.split(".")
|
|
59
|
+
frac = parts[1][:6].ljust(6, "0")
|
|
60
|
+
ts_clean = f"{parts[0]}.{frac}"
|
|
61
|
+
datetime.strptime(ts_clean, "%Y-%m-%dT%H:%M:%S.%f").replace(
|
|
62
|
+
tzinfo=timezone.utc
|
|
63
|
+
)
|
|
64
|
+
else:
|
|
65
|
+
datetime.strptime(ts_clean, "%Y-%m-%dT%H:%M:%S").replace(
|
|
66
|
+
tzinfo=timezone.utc
|
|
67
|
+
)
|
|
68
|
+
except ValueError:
|
|
69
|
+
return None
|
|
70
|
+
return ts
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# ---------------------------------------------------------------------------
|
|
74
|
+
# Constants
|
|
75
|
+
# ---------------------------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
# Upper bound, in bytes, on a single stripped queue line; _drain_file counts
# longer lines as malformed and skips them.
_MAX_LINE_BYTES = 1 * 1024 * 1024  # 1 MB
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# ---------------------------------------------------------------------------
|
|
81
|
+
# Public API
|
|
82
|
+
# ---------------------------------------------------------------------------
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def drain(
    queue_path: "str | Path | None" = None,
    project_id: "str | None" = None,
    *,
    store: "Store | None" = None,
) -> dict:
    """Process the capture queue and persist every acceptable record.

    Args:
        queue_path: Location of capture-queue.jsonl. When omitted, the path
            is ``<project root>/.ember/capture-queue.jsonl`` with the root
            taken from ``ember.config.get_project_root()``.
        project_id: Project scope to enforce. When omitted, the
            ``EMBER_PROJECT_ID`` environment variable is consulted; if that
            is unset or empty too, records of any project are accepted.
        store: Store to write into. When omitted, one is opened for this
            call and closed again before returning.

    Returns:
        A dict of counters: ``written``, ``skipped_malformed``,
        ``skipped_secret`` and ``skipped_foreign``.
    """
    from ember.capture.secrets import looks_like_secret
    from ember.write import write
    from ember.storage import store as open_store

    tallies: dict[str, int] = dict.fromkeys(
        ("written", "skipped_malformed", "skipped_secret", "skipped_foreign"),
        0,
    )

    # Where the queue lives, and the sibling path used while it is drained.
    if queue_path is not None:
        queue_file = Path(queue_path)
    else:
        from ember.config import get_project_root
        queue_file = Path(get_project_root()) / ".ember" / "capture-queue.jsonl"
    in_flight = Path(f"{queue_file}.processing")

    # Explicit argument wins; otherwise fall back to the environment.
    bound_project = project_id
    if bound_project is None:
        bound_project = os.environ.get("EMBER_PROJECT_ID") or None

    # Only close the store if this call is the one that opened it.
    opened_here = store is None
    active_store = open_store() if opened_here else store

    try:
        _drain_inner(
            queue_file,
            in_flight,
            bound_project,
            active_store,
            tallies,
            looks_like_secret,
            write,
        )
    finally:
        if opened_here:
            active_store.close()

    return tallies
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _drain_inner(
|
|
153
|
+
queue: Path,
|
|
154
|
+
processing: Path,
|
|
155
|
+
project_id: "str | None",
|
|
156
|
+
store: "Store",
|
|
157
|
+
counts: dict,
|
|
158
|
+
looks_like_secret_fn,
|
|
159
|
+
write_fn,
|
|
160
|
+
) -> None:
|
|
161
|
+
"""Core drain logic (separated for testability)."""
|
|
162
|
+
|
|
163
|
+
# Handle stale .processing from a prior crash — drain it first.
|
|
164
|
+
if processing.exists():
|
|
165
|
+
_drain_file(
|
|
166
|
+
processing,
|
|
167
|
+
project_id,
|
|
168
|
+
store,
|
|
169
|
+
counts,
|
|
170
|
+
looks_like_secret_fn,
|
|
171
|
+
write_fn,
|
|
172
|
+
)
|
|
173
|
+
# If drain succeeds, remove stale processing file. If removal fails,
|
|
174
|
+
# return early to avoid os.replace(queue, processing) overwriting the
|
|
175
|
+
# not-yet-removed file with a fresh batch.
|
|
176
|
+
try:
|
|
177
|
+
os.remove(processing)
|
|
178
|
+
except OSError as exc:
|
|
179
|
+
print(
|
|
180
|
+
f"ember drain: warning: could not remove stale .processing: {exc}",
|
|
181
|
+
file=sys.stderr,
|
|
182
|
+
)
|
|
183
|
+
return
|
|
184
|
+
|
|
185
|
+
# Check if queue file exists and is non-empty.
|
|
186
|
+
if not queue.exists():
|
|
187
|
+
return
|
|
188
|
+
if queue.stat().st_size == 0:
|
|
189
|
+
return
|
|
190
|
+
|
|
191
|
+
# Atomic rename: producer can keep writing to the original path.
|
|
192
|
+
try:
|
|
193
|
+
os.replace(queue, processing)
|
|
194
|
+
except OSError as exc:
|
|
195
|
+
print(
|
|
196
|
+
f"ember drain: error: could not rename queue to processing: {exc}",
|
|
197
|
+
file=sys.stderr,
|
|
198
|
+
)
|
|
199
|
+
return
|
|
200
|
+
|
|
201
|
+
# Drain the .processing file.
|
|
202
|
+
_drain_file(
|
|
203
|
+
processing,
|
|
204
|
+
project_id,
|
|
205
|
+
store,
|
|
206
|
+
counts,
|
|
207
|
+
looks_like_secret_fn,
|
|
208
|
+
write_fn,
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
# Clean completion: remove .processing.
|
|
212
|
+
try:
|
|
213
|
+
os.remove(processing)
|
|
214
|
+
except OSError as exc:
|
|
215
|
+
print(
|
|
216
|
+
f"ember drain: warning: could not remove .processing after drain: {exc}",
|
|
217
|
+
file=sys.stderr,
|
|
218
|
+
)
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _drain_file(
    path: Path,
    project_id: "str | None",
    store: "Store",
    counts: dict,
    looks_like_secret_fn,
    write_fn,
) -> None:
    """Process every record in ``path``, one JSON object per line.

    The file is read in full and split on ``\\n``; blank lines are ignored.
    Each remaining line is either written through ``write_fn`` or skipped,
    and the matching counter in ``counts`` is incremented.

    Args:
        path: File to read (the renamed ``.processing`` queue).
        project_id: When not ``None``, records whose ``project_id`` differs
            are counted as ``skipped_foreign``.
        store: Passed through to ``write_fn`` as ``store=``.
        counts: Dict with ``written``, ``skipped_malformed``,
            ``skipped_secret`` and ``skipped_foreign`` keys; updated in place.
        looks_like_secret_fn: Called with the record content; a truthy
            result skips the record as ``skipped_secret``.
        write_fn: Called as ``write_fn(content, metadata=, project_id=,
            store=, type=, created_at=)`` for each accepted record.

    Raises:
        Whatever ``write_fn`` raises (caller handles cleanup). Skips never
        raise: undecodable bytes and over-deep JSON are counted as malformed
        instead of aborting the batch, so one bad line cannot make every
        later drain fail on the same file.
    """
    try:
        data = path.read_bytes()
    except OSError as exc:
        print(
            f"ember drain: error: could not read {path}: {exc}",
            file=sys.stderr,
        )
        return

    def _skip(counter: str, message: str) -> None:
        """Log a skip to stderr and bump its counter."""
        print(f"ember drain: {message}", file=sys.stderr)
        counts[counter] += 1

    for raw_line in data.split(b"\n"):
        line = raw_line.strip()
        if not line:
            continue

        # 1. Size guard
        if len(line) > _MAX_LINE_BYTES:
            _skip("skipped_malformed", "skipped malformed record: line exceeds 1MB")
            continue

        # 2. JSON parse.  json.loads decodes bytes itself, so invalid UTF-8
        #    raises UnicodeDecodeError (not JSONDecodeError), and deeply
        #    nested input raises RecursionError; both are bad input.
        try:
            record = json.loads(line)
        except (json.JSONDecodeError, UnicodeDecodeError, RecursionError) as exc:
            _skip(
                "skipped_malformed",
                f"skipped malformed record: {type(exc).__name__}: {exc}",
            )
            continue

        if not isinstance(record, dict):
            _skip("skipped_malformed", "skipped malformed record: not a JSON object")
            continue

        # 3. Required fields: content (non-empty str), project_id (str)
        content = record.get("content")
        rec_project_id = record.get("project_id")

        if not isinstance(content, str) or not content.strip():
            _skip(
                "skipped_malformed",
                "skipped malformed record: missing or empty content",
            )
            continue

        if not isinstance(rec_project_id, str) or not rec_project_id:
            _skip(
                "skipped_malformed",
                "skipped malformed record: missing or invalid project_id",
            )
            continue

        # 4. Bound-scope check
        if project_id is not None and rec_project_id != project_id:
            _skip(
                "skipped_foreign",
                f"skipped foreign record: project_id={rec_project_id!r} "
                f"(bound to {project_id!r})",
            )
            continue

        # 5. Secret scanner — log WITHOUT echoing content
        if looks_like_secret_fn(content):
            _skip(
                "skipped_secret",
                "skipped secret record: content looks like a credential",
            )
            continue

        # 6. Metadata: anything that is not a dict is dropped.
        raw_metadata = record.get("metadata")
        metadata = raw_metadata if isinstance(raw_metadata, dict) else None

        # 7. Type: fall back to "diary" unless a non-empty string is given.
        raw_type = record.get("type")
        rec_type = raw_type if isinstance(raw_type, str) and raw_type else "diary"

        # 8. Event timestamp: an invalid value becomes None and is passed
        #    to write_fn as created_at=None.
        raw_ts = record.get("ts")
        created_at = _validate_ts(raw_ts)
        if raw_ts and created_at is None:
            print(
                "ember drain: malformed ts for record — using drain time as created_at",
                file=sys.stderr,
            )

        # 9. Write (may raise — caller handles cleanup)
        write_fn(
            content,
            metadata=metadata,
            project_id=rec_project_id,
            store=store,
            type=rec_type,
            created_at=created_at,
        )
        counts["written"] += 1
|
ember/capture/secrets.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
"""Secret scanner for ember capture drain.
|
|
2
|
+
|
|
3
|
+
Detects credentials/secrets in content strings and prevents them from being
|
|
4
|
+
written to the store. Conservative: over-skip is acceptable; under-skip is not.
|
|
5
|
+
|
|
6
|
+
All patterns compiled at module level for performance.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import math
|
|
12
|
+
import re
|
|
13
|
+
|
|
14
|
+
# ---------------------------------------------------------------------------
|
|
15
|
+
# Compiled patterns
|
|
16
|
+
# ---------------------------------------------------------------------------
|
|
17
|
+
|
|
18
|
+
# PEM private key block
|
|
19
|
+
_RE_PEM = re.compile(r"-----BEGIN\s+(?:\w+\s+)*PRIVATE KEY-----", re.IGNORECASE)
|
|
20
|
+
|
|
21
|
+
# OpenAI API key: sk- followed by 20+ alphanumeric chars
|
|
22
|
+
_RE_OPENAI = re.compile(r"\bsk-[A-Za-z0-9]{20,}\b")
|
|
23
|
+
|
|
24
|
+
# AWS Access Key ID
|
|
25
|
+
_RE_AWS_KEY = re.compile(r"\bAKIA[0-9A-Z]{16}\b")
|
|
26
|
+
|
|
27
|
+
# AWS secret access key assignment
|
|
28
|
+
_RE_AWS_SECRET = re.compile(r"aws_secret_access_key\s*[=:]\s*\S{30,}", re.IGNORECASE)
|
|
29
|
+
|
|
30
|
+
# Bearer token (Authorization header style)
|
|
31
|
+
_RE_BEARER = re.compile(r"\bBearer\s+[A-Za-z0-9._\-]{20,}\b")
|
|
32
|
+
|
|
33
|
+
# JWT: eyJ...eyJ.... (header.payload.signature pattern)
|
|
34
|
+
_RE_JWT = re.compile(r"\beyJ[\w-]+\.eyJ[\w-]+\.[\w-]+\b")
|
|
35
|
+
|
|
36
|
+
# .env-style assignment with keyword gate: KEY=value (8+ char value)
|
|
37
|
+
# Matches KEY=VALUE assignments anywhere on a line — after 'export ', after
|
|
38
|
+
# leading whitespace, or at column 0. The keyword gate requires one of the
|
|
39
|
+
# sensitive keywords inside the variable name. No start-of-line anchor so
|
|
40
|
+
# "export API_KEY=value" and " SECRET_TOKEN=value" are caught.
|
|
41
|
+
# Keywords: KEY|TOKEN|SECRET|PASSWORD|PASSWD|PWD|API|CREDENTIAL|PRIVATE
|
|
42
|
+
_RE_DOTENV = re.compile(
|
|
43
|
+
r"(?<![A-Za-z0-9_])(?:[A-Z][A-Z0-9_]*)?(?:KEY|TOKEN|SECRET|PASSWORD|PASSWD|PWD|API|CREDENTIAL|PRIVATE)"
|
|
44
|
+
r"[A-Z0-9_]*\s*=\s*\S{8,}",
|
|
45
|
+
re.IGNORECASE,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
# Generic high-entropy hex string: 32+ hex chars
|
|
49
|
+
_RE_HEX32 = re.compile(r"\b[0-9a-fA-F]{32,}\b")
|
|
50
|
+
|
|
51
|
+
# Generic high-entropy base64 string: 40+ base64 chars
|
|
52
|
+
_RE_B64_40 = re.compile(r"[A-Za-z0-9+/]{40,}={0,2}")
|
|
53
|
+
|
|
54
|
+
# Entropy threshold (bits per character)
|
|
55
|
+
_ENTROPY_THRESHOLD = 3.5
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _shannon_entropy(s: str) -> float:
|
|
59
|
+
"""Compute Shannon entropy of a string in bits per character."""
|
|
60
|
+
if not s:
|
|
61
|
+
return 0.0
|
|
62
|
+
freq: dict[str, int] = {}
|
|
63
|
+
for c in s:
|
|
64
|
+
freq[c] = freq.get(c, 0) + 1
|
|
65
|
+
n = len(s)
|
|
66
|
+
entropy = 0.0
|
|
67
|
+
for count in freq.values():
|
|
68
|
+
p = count / n
|
|
69
|
+
entropy -= p * math.log2(p)
|
|
70
|
+
return entropy
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _has_high_entropy_hex(content: str) -> bool:
    """Report whether any 32+ character hex run in ``content`` is high-entropy.

    A run qualifies when its Shannon entropy is strictly above
    ``_ENTROPY_THRESHOLD``; scanning stops at the first qualifying run.
    """
    return any(
        _shannon_entropy(hit.group()) > _ENTROPY_THRESHOLD
        for hit in _RE_HEX32.finditer(content)
    )
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _has_high_entropy_b64(content: str) -> bool:
    """Report whether any 40+ character base64 run in ``content`` is high-entropy.

    Trailing ``=`` padding is stripped before the run is measured; a run
    qualifies when it is still at least 40 characters long and its Shannon
    entropy is strictly above ``_ENTROPY_THRESHOLD``.
    """
    for hit in _RE_B64_40.finditer(content):
        candidate = hit.group().rstrip("=")
        if len(candidate) < 40:
            continue
        if _shannon_entropy(candidate) > _ENTROPY_THRESHOLD:
            return True
    return False
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def looks_like_secret(content: str) -> bool:
    """Decide whether ``content`` appears to carry a credential or secret.

    Deliberately conservative: flagging harmless text is tolerated. The
    function itself never prints or logs ``content``; callers must not log
    it either when the result is True.

    Checks run in this order and stop at the first hit:
        1. PEM private key block
        2. OpenAI API key (sk-...)
        3. AWS AKIA access key ID
        4. AWS secret access key assignment
        5. Bearer token
        6. JWT (eyJ...eyJ...signature)
        7. .env-style KEY=value with secret keyword
        8. High-entropy hex32+ string
        9. High-entropy base64-40+ string

    Returns:
        True when any check matches; False otherwise, including for empty
        content.
    """
    if not content:
        return False

    # Fixed-shape detectors first, in documented order.
    pattern_detectors = (
        _RE_PEM,
        _RE_OPENAI,
        _RE_AWS_KEY,
        _RE_AWS_SECRET,
        _RE_BEARER,
        _RE_JWT,
        _RE_DOTENV,
    )
    if any(rx.search(content) for rx in pattern_detectors):
        return True

    # Entropy-based detectors last.
    return _has_high_entropy_hex(content) or _has_high_entropy_b64(content)
|