ai-browser-profile 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/clean.py ADDED
@@ -0,0 +1,295 @@
1
+ #!/usr/bin/env python3
2
+ """Programmatic cleanup of memories.db — rule-based, no LLM required.
3
+
4
+ Rules:
5
+ 1. Delete autofill:* and address_type_* keys (noise/duplicate keys)
6
+ 2. Delete superseded entries
7
+ 3. Fix single-value key chains (pick winner by appeared_count)
8
+ 4. Deduplicate phones (normalize, keep highest appeared_count)
9
+ 5. Deduplicate emails (lowercase, keep highest appeared_count)
10
+ 6. Delete known noise patterns (feliciti flood, etc.)
11
+ 7. Mark everything touched as reviewed
12
+ """
13
+
14
+ import os
15
+ import re
16
+ import sys
17
+ import logging
18
+ from datetime import datetime, timezone
19
+
20
# Configure root logging at import time; timestamps show time-of-day only.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s", datefmt="%H:%M:%S")
log = logging.getLogger("clean")

# Put this script's directory first on sys.path before importing ai_browser_profile.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from ai_browser_profile import MemoryDB

# Default database: memories.db in the same directory as this script.
DB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "memories.db")

# Keys for which run_cleanup keeps exactly one row (the "winner") and deletes the rest.
SINGLE_VALUE_KEYS = ["first_name", "last_name", "full_name", "card_holder_name"]

# Regexes matched case-insensitively (re.search) against every non-superseded value;
# a match deletes the row.
NOISE_VALUE_PATTERNS = [
    r"^application\.from\.feliciti\.co",  # feliciti housing app flood
]

# Lower-cased values that disqualify a row from winning a single-value key.
# NOTE(review): several entries look specific to one user's data — confirm before reuse.
NOISE_WORDS = {
    "test", "asdf", "qwerty", "foo", "bar", "baz", "placeholder",
    "technical placeholder just to pay",
    "wegs sdg", "sdgsdg",                    # keyboard mash garbage
    "mediar inc", "mediar, inc.", "mediar",  # company name in card holder field
    "omi",                                   # product name in name field
}

# Allow-lists for address fields: run_cleanup deletes every city/state/zip row whose
# value is NOT in these sets (i.e. anything not consistent with San Francisco, CA).
# NOTE(review): hard-coded to a single location; any other user's address rows are
# deleted by this rule.
KNOWN_CITIES = {"san francisco", "sf"}
KNOWN_STATES = {"california", "ca"}
KNOWN_ZIPS = {"94102", "94103", "94105", "94107", "94109", "94110", "94111",
              "94114", "94115", "94117", "94118", "94121", "94122", "94123",
              "94124", "94127", "94129", "94130", "94131", "94132", "94133",
              "94134"}
50
+
51
+
52
def now_iso():
    """Return the current UTC time as a timezone-aware ISO 8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
54
+
55
+
56
+ def normalize_phone(phone: str) -> str | None:
57
+ """Normalize phone to digits only, return None if not a valid US/intl number."""
58
+ digits = re.sub(r"\D", "", phone)
59
+ if digits.startswith("1") and len(digits) == 11:
60
+ digits = digits[1:]
61
+ if len(digits) == 10:
62
+ return digits
63
+ return None
64
+
65
+
66
def normalize_email(email: str) -> str:
    """Return the email with surrounding whitespace removed and all letters lower-cased."""
    trimmed = email.strip()
    return trimmed.lower()
68
+
69
+
70
def run_cleanup(db_path: str = DB_PATH, dry_run: bool = False):
    """Run every rule-based cleanup pass against the memories database.

    Passes, in order:
      1. delete ``autofill:*`` keys
      2. delete ``address_type_*`` keys
      3. for each key in SINGLE_VALUE_KEYS keep one winner (highest
         appeared_count among plausible values), un-supersede it and delete
         the rest -- runs before pass 4 so superseded candidates are still
         visible
      4. delete all rows that are still superseded
      5. dedupe phones by normalized digits, keeping the highest appeared_count
      6. dedupe emails by stripped/lower-cased value, same winner rule
      7. delete values matching NOISE_VALUE_PATTERNS
      8. delete city/state/zip rows outside KNOWN_CITIES / KNOWN_STATES / KNOWN_ZIPS
      9. mark every surviving non-superseded row as reviewed

    Args:
        db_path: path of the SQLite database handed to MemoryDB.
        dry_run: when True nothing is written; the counters report what a
            real run would remove.

    Returns:
        dict mapping counter names (e.g. ``"autofill_deleted"``) to ints.
    """
    mem = MemoryDB(db_path)
    try:
        conn = mem.conn

        stats = {
            "autofill_deleted": 0,
            "address_type_deleted": 0,
            "superseded_deleted": 0,
            "single_value_fixed": 0,
            "phone_deduped": 0,
            "email_deduped": 0,
            "noise_pattern_deleted": 0,
            "address_noise_deleted": 0,
            "marked_reviewed": 0,
        }

        # In a dry run rows stay in the table, so later passes would see (and
        # count) them again. Track what has been handled to keep counters honest.
        removed = set()  # ids deleted, or that would be deleted in a dry run
        rescued = set()  # single-value winners that get un-superseded

        def delete_id(mid):
            """Delete one row; return True only the first time an id is seen."""
            if mid in removed:
                return False
            removed.add(mid)
            if not dry_run:
                mem.delete(mid)
            return True

        def delete_all(ids):
            """Delete every id in *ids* and return how many were newly removed."""
            return sum(1 for mid in ids if delete_id(mid))

        def commit():
            if not dry_run:
                conn.commit()

        def select_ids(sql):
            # Materialize first: rows are deleted while we walk the result.
            return [row[0] for row in conn.execute(sql).fetchall()]

        def plausible_name(value):
            """True when *value* could be a real name rather than form garbage."""
            return (
                isinstance(value, str)
                and value.lower() not in NOISE_WORDS
                and "_" not in value
                and len(value) > 2
                and not value.isdigit()
            )

        # 1. Delete autofill:* keys
        stats["autofill_deleted"] = delete_all(
            select_ids("SELECT id FROM memories WHERE key LIKE 'autofill:%'")
        )
        log.info(f"Deleted {stats['autofill_deleted']} autofill:* entries")

        # 2. Delete address_type_* keys
        # "_" is a single-character wildcard in LIKE, so it must be escaped to
        # match the literal prefix only.
        stats["address_type_deleted"] = delete_all(
            select_ids(r"SELECT id FROM memories WHERE key LIKE 'address\_type\_%' ESCAPE '\'")
        )
        log.info(f"Deleted {stats['address_type_deleted']} address_type_* entries")

        # 3. Fix single-value key chains
        for key in SINGLE_VALUE_KEYS:
            rows = conn.execute(
                "SELECT id, value, appeared_count FROM memories WHERE key=? ORDER BY appeared_count DESC",
                (key,),
            ).fetchall()
            if not rows:
                continue

            # Prefer the most frequent plausible value; fall back to the most
            # frequent value of any kind when everything looks like garbage.
            candidates = [row for row in rows if plausible_name(row[1])]
            winner = candidates[0] if candidates else rows[0]
            winner_id, winner_val = winner[0], winner[1]
            rescued.add(winner_id)

            if not dry_run:
                conn.execute(
                    "UPDATE memories SET superseded_by=NULL, superseded_at=NULL WHERE id=?",
                    (winner_id,),
                )
            losers = [row[0] for row in rows if row[0] != winner_id]
            stats["single_value_fixed"] += delete_all(losers)

            if losers:
                log.info(f" {key}: winner='{winner_val}' (deleted {len(losers)} others)")
        commit()

        # 4. Delete remaining superseded entries
        # A rescued winner is still marked superseded in a dry run; skip it.
        superseded_ids = [
            mid
            for mid in select_ids("SELECT id FROM memories WHERE superseded_by IS NOT NULL")
            if mid not in rescued
        ]
        stats["superseded_deleted"] = delete_all(superseded_ids)
        log.info(f"Deleted {stats['superseded_deleted']} superseded entries")

        def dedupe(key, normalize, stat_key):
            """Keep the highest-appeared_count row per normalized value of *key*."""
            rows = conn.execute(
                "SELECT id, value, appeared_count FROM memories WHERE key=? AND superseded_by IS NULL",
                (key,),
            ).fetchall()
            groups: dict[str, list] = {}
            for mid, val, count in rows:
                if mid in removed or not isinstance(val, str):
                    continue
                norm = normalize(val)
                if norm is None:
                    continue  # values that can't be normalized: leave alone
                groups.setdefault(norm, []).append((mid, val, count))

            for norm, entries in groups.items():
                if len(entries) <= 1:
                    continue
                # Stable sort: ties keep their original relative order.
                entries.sort(key=lambda e: e[2] or 0, reverse=True)
                winner_id = entries[0][0]
                stats[stat_key] += delete_all(e[0] for e in entries[1:])
                log.info(f" {key} {norm}: kept id={winner_id}, deleted {len(entries)-1} duplicates")

        # 5. Deduplicate phones / 6. Deduplicate emails
        dedupe("phone", normalize_phone, "phone_deduped")
        dedupe("email", normalize_email, "email_deduped")

        # 7. Delete known noise patterns
        noise_res = [re.compile(p, re.IGNORECASE) for p in NOISE_VALUE_PATTERNS]
        all_rows = conn.execute(
            "SELECT id, value FROM memories WHERE superseded_by IS NULL"
        ).fetchall()
        for mid, val in all_rows:
            if isinstance(val, str) and any(rx.search(val) for rx in noise_res):
                if delete_id(mid):
                    stats["noise_pattern_deleted"] += 1
        commit()

        # 8. Clean up bad address entries
        # NOTE(review): the allow-lists are hard-coded to San Francisco, CA, so
        # this pass deletes every city/state/zip row for users located elsewhere.
        address_rules = (
            ("city", KNOWN_CITIES, True),
            ("state", KNOWN_STATES, True),
            ("zip", KNOWN_ZIPS, False),
        )
        for key, allowed, fold_case in address_rules:
            rows = conn.execute(
                "SELECT id, value FROM memories WHERE key=? AND superseded_by IS NULL",
                (key,),
            ).fetchall()
            for mid, val in rows:
                if not isinstance(val, str):
                    continue  # non-text value: leave alone rather than crash
                candidate = val.lower().strip() if fold_case else val.strip()
                if candidate not in allowed and delete_id(mid):
                    stats["address_noise_deleted"] += 1
        commit()
        log.info(f"Deleted {stats['address_noise_deleted']} bad address entries")

        # 9. Mark all remaining non-superseded rows as reviewed
        unreviewed_ids = [
            mid
            for mid in select_ids(
                "SELECT id FROM memories WHERE reviewed_at IS NULL AND superseded_by IS NULL"
            )
            if mid not in removed
        ]
        if unreviewed_ids and not dry_run:
            mem.mark_reviewed(unreviewed_ids)
        stats["marked_reviewed"] += len(unreviewed_ids)
        commit()

        # Report
        final_stats = mem.stats()
        still_unreviewed = conn.execute(
            "SELECT COUNT(*) FROM memories WHERE reviewed_at IS NULL"
        ).fetchone()[0]
        log.info("\n── Cleanup complete ──────────────────────────────────")
        log.info(f" autofill:* deleted: {stats['autofill_deleted']}")
        log.info(f" address_type_* deleted: {stats['address_type_deleted']}")
        log.info(f" superseded deleted: {stats['superseded_deleted']}")
        log.info(f" single-value fixed: {stats['single_value_fixed']}")
        log.info(f" phone dupes removed: {stats['phone_deduped']}")
        log.info(f" email dupes removed: {stats['email_deduped']}")
        log.info(f" noise patterns deleted: {stats['noise_pattern_deleted']}")
        log.info(f" address noise deleted: {stats['address_noise_deleted']}")
        log.info(f" marked reviewed: {stats['marked_reviewed']}")
        log.info(f"\n Final DB: {final_stats['total_memories']} memories")
        log.info(f" Unreviewed remaining: {still_unreviewed}")
        log.info("\n── Profile after cleanup ─────────────────────────────")
        print(mem.profile_text())

        return stats
    finally:
        # Release the database handle even when a pass raises.
        mem.close()
283
+
284
+
285
if __name__ == "__main__":
    # Script entry point: parse the two CLI options and run the cleanup once.
    import argparse

    cli = argparse.ArgumentParser(description="Rule-based cleanup of memories.db")
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be deleted without changing anything",
    )
    cli.add_argument("--db", default=DB_PATH, help="Path to memories.db")
    options = cli.parse_args()

    is_dry_run = options.dry_run
    if is_dry_run:
        log.info("DRY RUN — no changes will be made")

    run_cleanup(db_path=options.db, dry_run=is_dry_run)
package/extract.py ADDED
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/env python3
2
+ """CLI entry point for ai-browser-profile extraction."""
3
+
4
+ import argparse
5
+ import logging
6
+
7
+ from ai_browser_profile import extract_memories
8
+ from clean import run_cleanup
9
+
10
# Importing `clean` above already ran logging.basicConfig(), which attaches a
# root handler. A second basicConfig() call is a no-op unless force=True, so
# without it the format below (which adds the logger name) is never applied.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
    datefmt="%H:%M:%S",
    force=True,
)
log = logging.getLogger("extract")
16
+
17
+
18
def _build_parser():
    """Create the argument parser for the extraction CLI."""
    cli = argparse.ArgumentParser(description="Extract user memories from browser data")
    cli.add_argument(
        "--output", "-o",
        default="memories.db",
        help="Output memories database path (default: memories.db)",
    )
    cli.add_argument(
        "--browsers",
        nargs="*",
        help="Only scan specific browsers (arc, chrome, safari, firefox, brave, edge)",
    )
    cli.add_argument(
        "--no-indexeddb",
        action="store_true",
        help="Skip IndexedDB extraction (WhatsApp contacts)",
    )
    cli.add_argument(
        "--no-localstorage",
        action="store_true",
        help="Skip Local Storage extraction (LinkedIn connections)",
    )
    cli.add_argument(
        "--no-notion",
        action="store_true",
        help="Skip Notion extraction (workspace, users, pages)",
    )
    cli.add_argument(
        "--no-clean",
        action="store_true",
        help="Skip auto-cleanup after extraction",
    )
    return cli


def main():
    """Extract memories into the output database, then clean up unless --no-clean is given."""
    options = _build_parser().parse_args()

    # No --browsers (or an empty list) means "scan everything", signalled by None.
    selected = None
    if options.browsers:
        selected = {name.lower() for name in options.browsers}

    db = extract_memories(
        memories_db_path=options.output,
        browsers=selected,
        skip_indexeddb=options.no_indexeddb,
        skip_localstorage=options.no_localstorage,
        skip_notion=options.no_notion,
    )
    summary = db.stats()
    log.info(f"Extraction done — {summary['total_memories']} memories in {options.output}")
    db.close()

    if options.no_clean:
        return
    log.info("Running auto-cleanup...")
    run_cleanup(db_path=options.output)


if __name__ == "__main__":
    main()
package/package.json ADDED
@@ -0,0 +1,40 @@
1
+ {
2
+ "name": "ai-browser-profile",
3
+ "version": "1.0.5",
4
+ "description": "Extract user identity (name, emails, accounts, addresses, payments) from browser data into a self-ranking SQLite database. Install as a Claude Code agent skill.",
5
+ "bin": {
6
+ "ai-browser-profile": "bin/cli.js"
7
+ },
8
+ "files": [
9
+ "bin/",
10
+ "ai_browser_profile/**/*.py",
11
+ "skill/",
12
+ "review/",
13
+ "setup/",
14
+ "autofill/",
15
+ "whatsapp/",
16
+ "extract.py",
17
+ "clean.py"
18
+ ],
19
+ "keywords": [
20
+ "browser-data",
21
+ "browser-profile",
22
+ "identity",
23
+ "claude",
24
+ "claude-code",
25
+ "ai-agent",
26
+ "knowledge-base",
27
+ "embeddings",
28
+ "autofill"
29
+ ],
30
+ "author": "Matthew Diakonov",
31
+ "license": "MIT",
32
+ "repository": {
33
+ "type": "git",
34
+ "url": "git+https://github.com/m13v/ai-browser-profile.git"
35
+ },
36
+ "homepage": "https://github.com/m13v/ai-browser-profile",
37
+ "engines": {
38
+ "node": ">=16"
39
+ }
40
+ }
@@ -0,0 +1,171 @@
1
+ ---
2
+ name: memory-review
3
+ description: "Review and clean unreviewed memories in the database. Removes junk, merges duplicates, fixes miskeyed data, and marks good entries as reviewed. Run periodically after extraction."
4
+ ---
5
+
6
+ # Memory Review
7
+
8
+ LLM-powered post-ingestion review of the user memories database. Processes unreviewed memories in phases — bulk cleanup first, then per-entry review.
9
+
10
+ ## Setup
11
+
12
+ ```python
13
+ import sys, os
14
+ sys.path.insert(0, os.path.expanduser("~/ai-browser-profile"))
15
+ from ai_browser_profile import MemoryDB
16
+
17
+ mem = MemoryDB(os.path.expanduser("~/ai-browser-profile/memories.db"))
18
+ ```
19
+
20
+ ## Workflow
21
+
22
+ ### Phase 0: Fast Pass (bulk cleanup)
23
+
24
+ Before reviewing individual entries, bulk-delete entire categories of known noise. This typically removes ~50% of entries instantly.
25
+
26
+ ```python
27
+ # 1. Delete ALL autofill:* entries — these are raw form field duplicates of real data
28
+ autofill_ids = [r[0] for r in mem.conn.execute(
29
+ "SELECT id FROM memories WHERE key LIKE 'autofill:%' AND reviewed_at IS NULL"
30
+ ).fetchall()]
31
+ for mid in autofill_ids:
32
+ mem.delete(mid)
33
+
34
+ # 2. Delete ALL address_type_* entries — always noise (last names, numbers in address fields)
35
+ addr_type_ids = [r[0] for r in mem.conn.execute(
36
+ "SELECT id FROM memories WHERE key LIKE 'address_type_%' AND reviewed_at IS NULL"
37
+ ).fetchall()]
38
+ for mid in addr_type_ids:
39
+ mem.delete(mid)
40
+
41
+ # 3. Delete ALL superseded entries
42
+ superseded_ids = [r[0] for r in mem.conn.execute(
43
+ "SELECT id FROM memories WHERE superseded_by IS NOT NULL AND reviewed_at IS NULL"
44
+ ).fetchall()]
45
+ for mid in superseded_ids:
46
+ mem.delete(mid)
47
+ ```
48
+
49
+ **Check for leaked secrets before running the autofill bulk delete above.** Grep the autofill values for `client_secret`, `api_key`, `token`, and `password`, and flag any matches to the user before deleting.
50
+
51
+ ### Phase 1: Supersession Chain Repair
52
+
53
+ After the fast pass, single-value key chains (`first_name`, `last_name`, `full_name`, `card_holder_name`, `email`, `phone`) are often corrupted. The extraction supersedes the real value with garbage because each new autofill entry blindly replaces the old one.
54
+
55
+ Common pattern: `first_name` chain goes Matthew → Marina → mediar → ... leaving "mediar" as the active value.
56
+
57
+ ```python
58
+ # For each single-value key, check the active (non-superseded) value
59
+ SINGLE_VALUE_KEYS = ["first_name", "last_name", "full_name", "card_holder_name", "email", "phone"]
60
+ for key in SINGLE_VALUE_KEYS:
61
+ rows = mem.conn.execute(
62
+ "SELECT id, value, confidence, superseded_by FROM memories WHERE key=? ORDER BY id",
63
+ (key,)
64
+ ).fetchall()
65
+ print(f"\n{key}:")
66
+ for r in rows:
67
+ status = "ACTIVE" if r[3] is None else f"superseded by {r[3]}"
68
+ print(f" id={r[0]} val='{r[1]}' conf={r[2]} [{status}]")
69
+ ```
70
+
71
+ Fix by:
72
+ 1. Deleting garbage entries (code identifiers, other people's names, company names in name fields)
73
+ 2. Unsuperseding the real value: `UPDATE memories SET superseded_by=NULL, superseded_at=NULL WHERE id=?`
74
+ 3. Boosting confidence on the real value if needed
75
+
76
+ ### Phase 2: Per-Entry Review
77
+
78
+ Now process remaining unreviewed entries in batches:
79
+
80
+ 1. Call `mem.get_unreviewed(limit=50)` to get a batch
81
+ 2. Print the batch as a numbered table
82
+ 3. Classify each as KEEP, DELETE, MERGE, or FIX
83
+ 4. Execute actions via `mem.delete()`, `mem.update_memory()`, etc.
84
+ 5. Call `mem.mark_reviewed([...ids...])` on all processed IDs (including kept ones)
85
+ 6. Print summary per batch
86
+ 7. Repeat until no unreviewed remain
87
+
88
+ ### Phase 3: Profile Verification
89
+
90
+ After all entries are reviewed, print the profile and read through it to catch any residual issues:
91
+
92
+ ```python
93
+ print(mem.profile_text())
94
+ ```
95
+
96
+ Check for:
97
+ - Wrong name showing (garbage superseded the real one)
98
+ - Mixed cities/states (e.g. "San Francisco, New York" in the same address)
99
+ - Garbage card holder names
100
+ - Missing fields that should be populated
101
+
102
+ Fix any issues, then `mem.close()`.
103
+
104
+ ## Classification Criteria
105
+
106
+ ### DELETE — remove entirely
107
+
108
+ - **Gibberish/test data**: `"wegs sdg"`, `"asdf"`, `"test123"`, `"technical placeholder just to pay"`, single characters
109
+ - **Code identifiers in name fields**: values containing underscores (`investor_role`, `handle_new_workflow_analysis`, `on_low_level_event_insert`), or known non-names (`os`, `type`, `use-case`)
110
+ - **Leaked secrets**: values containing `client_secret`, `GOCSPX-`, full API keys, OAuth tokens, private keys — **flag to user before deleting**
111
+ - **Other people's data**: names/emails/phones that belong to someone else — UNLESS stored as `contact:*` or `linkedin:*` keys
112
+ - **Company names in name fields**: `first_name="mediar"`, `last_name="inc"`, `full_name="Mediar, inc."` — these are autofill bugs
113
+ - **Truncated names**: `last_name="Di"`, `full_name="Matthew Di"`, `full_name="Matt"` (incomplete)
114
+ - **Noise locations**: cities/states from insurance quoting, comparison shopping, or form testing
115
+ - **Expired card data**: card expiry dates in the past
116
+ - **Meaningless amounts**: `"100.00"`, `"199.00"`, `"1.00"`, `"$ 245.80"` — these are form values, not useful data
117
+ - **Duplicate phone formats**: keep only one format per phone number (prefer international format with `+1`)
118
+
119
+ ### MERGE — combine duplicates
120
+
121
+ - **Same phone, different formats**: `"+1 650-796-1489"`, `"(650) 796-1489"`, `"6507961489"` — keep the international format with highest confidence, delete all others
122
+ - **Same email, different casing**: keep lowercase
123
+ - **Duplicate DOB entries**: identify the real one (consistent across sources) vs noise
124
+ - **Same contact, multiple entries**: merge into highest-confidence one
125
+
126
+ When merging: keep highest confidence. If tied, keep highest `accessed_count`. Delete the rest.
127
+
128
+ ### FIX — correct bad data
129
+
130
+ - **Wrong key assignment**: `company="Dmitrii Diakonov"` → delete (person name in company field)
131
+ - **Wrong confidence**: real DOB at 0.4 while fake one at 0.6 → boost real one
132
+ - **Missing tags**: `phone` key without `phone` tag, `email` without `email` tag
133
+ - **Broken supersession**: use `UPDATE memories SET superseded_by=NULL, superseded_at=NULL WHERE id=?`
134
+ - Use `mem.update_memory(id, key=..., value=..., confidence=..., tags=[...])` for fixes
135
+
136
+ ### KEEP — mark as reviewed
137
+
138
+ - Genuine user data
139
+ - Correctly keyed and tagged entries
140
+ - Legitimate contacts, accounts, tools
141
+ - Just call `mem.mark_reviewed([id])` — no changes needed
142
+
143
+ ## Output Format
144
+
145
+ For each batch, print a table like:
146
+
147
+ ```
148
+ Batch 1/N (50 memories)
149
+ ───────────────────────────────────────
150
+ ID | Key | Value | Conf | Action
151
+ ----|------------------|--------------------|------|--------
152
+ 1 | first_name | Matthew | 0.8 | KEEP
153
+ 2 | full_name | investor_role | 0.5 | DELETE (code identifier)
154
+ 3 | phone | +14155551234 | 0.7 | KEEP
155
+ 4 | phone | (415) 555-1234 | 0.5 | MERGE → #3
156
+ ...
157
+
158
+ Summary: 30 kept, 12 deleted, 5 fixed, 3 merged
159
+ ```
160
+
161
+ At the end, print the profile and final stats:
162
+
163
+ ```
164
+ ## Final Profile
165
+ [output of mem.profile_text()]
166
+
167
+ ## Stats
168
+ Total: X memories (was Y before review)
169
+ Deleted: Z, Kept: W, Fixed: V, Merged: U
170
+ Secrets found and removed: N
171
+ ```
package/review/run.sh ADDED
@@ -0,0 +1,82 @@
1
#!/bin/bash
# Memory Review — weekly extract + rule-based cleanup + LLM review
#   1. Run extract.py to ingest new browser data
#   2. Run clean.py for rule-based cleanup
#   3. Run Claude to review/clean whatever is still unreviewed
# Called by launchd weekly (604800s)

set -euo pipefail

# The package ships the `ai_browser_profile` module; the checkout is expected at
# ~/ai-browser-profile. Override REPO in the environment if it lives elsewhere.
REPO="${REPO:-$HOME/ai-browser-profile}"
SKILL_FILE="$REPO/review/SKILL.md"
LOG_DIR="$REPO/review/logs"
VENV="$REPO/.venv/bin/activate"
DB="$REPO/memories.db"

mkdir -p "$LOG_DIR"
LOG_FILE="$LOG_DIR/$(date +%Y-%m-%d_%H%M%S).log"

# Print the number of unreviewed memories (capped at 10000) on stdout.
count_unreviewed() {
    (
        cd "$REPO"
        source "$VENV"
        python -c "
import sys; sys.path.insert(0, '.')
from ai_browser_profile import MemoryDB
m = MemoryDB('memories.db')
print(len(m.get_unreviewed(limit=10000)))
m.close()
"
    )
}

echo "=== Memory Review Run: $(date) ===" | tee "$LOG_FILE"

# Phase 0: Extract new browser data
# --no-clean: cleanup runs as its own phase below. Without the flag extract.py
# cleans up itself, so the unreviewed check right after would always see 0.
echo "--- Extracting browser data ---" | tee -a "$LOG_FILE"
(
    cd "$REPO"
    source "$VENV"
    python extract.py --no-clean 2>&1
) | tee -a "$LOG_FILE"

# Check if there are unreviewed entries.
# stderr goes to the log (not /dev/null) so a failure here is diagnosable:
# under `set -e` a failing substitution aborts the script.
UNREVIEWED=$(count_unreviewed 2>>"$LOG_FILE")

echo "Unreviewed entries: $UNREVIEWED" | tee -a "$LOG_FILE"

if [ "$UNREVIEWED" = "0" ]; then
    echo "No new entries to review. Done." | tee -a "$LOG_FILE"
    exit 0
fi

# Phase 1: Rule-based cleanup
echo "--- Running rule-based cleanup ---" | tee -a "$LOG_FILE"
(
    cd "$REPO"
    source "$VENV"
    python clean.py 2>&1
) | tee -a "$LOG_FILE"

# Re-check unreviewed after cleanup
# NOTE(review): clean.py's last pass marks every remaining non-superseded row as
# reviewed, so this count is likely always 0 and Phase 2 never runs — confirm
# whether that is intended.
UNREVIEWED=$(count_unreviewed 2>>"$LOG_FILE")

if [ "$UNREVIEWED" = "0" ]; then
    echo "All entries handled by cleanup. Done." | tee -a "$LOG_FILE"
    exit 0
fi

# Phase 2: Claude reviews remaining entries
echo "--- Starting Claude review ($UNREVIEWED entries) ---" | tee -a "$LOG_FILE"
claude -p "You are the Memory Review agent. You clean up the user memories database after extraction.

Read $SKILL_FILE for full classification criteria and workflow.

DB path: $DB
Module path: $REPO

Run Phases 1-3 from the skill file (Phase 0 fast pass already done by clean.py).
Process remaining unreviewed entries in batches of 50.
Print a summary at the end." --max-turns 80 2>&1 | tee -a "$LOG_FILE"

echo "=== Run complete: $(date) ===" | tee -a "$LOG_FILE"

# Clean up old logs (keep last 30 days)
find "$LOG_DIR" -name "*.log" -mtime +30 -delete 2>/dev/null || true