ai-browser-profile 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +118 -0
- package/ai_browser_profile/__init__.py +6 -0
- package/ai_browser_profile/db.py +929 -0
- package/ai_browser_profile/embeddings.py +196 -0
- package/ai_browser_profile/extract.py +108 -0
- package/ai_browser_profile/ingestors/__init__.py +0 -0
- package/ai_browser_profile/ingestors/bookmarks.py +185 -0
- package/ai_browser_profile/ingestors/browser_detect.py +100 -0
- package/ai_browser_profile/ingestors/constants.py +208 -0
- package/ai_browser_profile/ingestors/history.py +123 -0
- package/ai_browser_profile/ingestors/indexeddb.py +203 -0
- package/ai_browser_profile/ingestors/localstorage.py +66 -0
- package/ai_browser_profile/ingestors/logins.py +46 -0
- package/ai_browser_profile/ingestors/messages.py +151 -0
- package/ai_browser_profile/ingestors/notion.py +313 -0
- package/ai_browser_profile/ingestors/webdata.py +134 -0
- package/autofill/SKILL.md +252 -0
- package/bin/cli.js +315 -0
- package/clean.py +295 -0
- package/extract.py +53 -0
- package/package.json +40 -0
- package/review/SKILL.md +171 -0
- package/review/run.sh +82 -0
- package/setup/SKILL.md +177 -0
- package/skill/SKILL.md +180 -0
- package/whatsapp/SKILL.md +321 -0
package/clean.py
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Programmatic cleanup of memories.db — rule-based, no LLM required.
|
|
3
|
+
|
|
4
|
+
Rules:
|
|
5
|
+
1. Delete autofill:* and address_type_* keys (noise/duplicate keys)
|
|
6
|
+
2. Fix single-value key chains (pick winner by appeared_count; runs before rule 3 so superseded candidates are still visible)
|
|
7
|
+
3. Delete remaining superseded entries
|
|
8
|
+
4. Deduplicate phones (normalize, keep highest appeared_count)
|
|
9
|
+
5. Deduplicate emails (lowercase, keep highest appeared_count)
|
|
10
|
+
6. Delete known noise patterns (feliciti flood, etc.)
|
|
11
|
+
7. Delete city/state/zip entries that do not match the known location, then mark all remaining non-superseded entries as reviewed
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import os
|
|
15
|
+
import re
|
|
16
|
+
import sys
|
|
17
|
+
import logging
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
|
|
20
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s", datefmt="%H:%M:%S")
|
|
21
|
+
log = logging.getLogger("clean")
|
|
22
|
+
|
|
23
|
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
24
|
+
from ai_browser_profile import MemoryDB
|
|
25
|
+
|
|
26
|
+
# Default database: memories.db sitting next to this script.
DB_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "memories.db",
)

# Keys that should hold exactly one live value per profile.
SINGLE_VALUE_KEYS = [
    "first_name",
    "last_name",
    "full_name",
    "card_holder_name",
]

# Regexes (searched case-insensitively) whose matching values are deleted outright.
NOISE_VALUE_PATTERNS = [
    # feliciti housing app flood
    r"^application\.from\.feliciti\.co",
]

# Lower-cased values that never count as a real name when picking a winner
# for a single-value key.
NOISE_WORDS = {
    # generic test / filler input
    "test",
    "asdf",
    "qwerty",
    "foo",
    "bar",
    "baz",
    "placeholder",
    "technical placeholder just to pay",
    # keyboard mash garbage
    "wegs sdg",
    "sdgsdg",
    # company name in card holder field
    "mediar inc",
    "mediar, inc.",
    "mediar",
    # product name in name field
    "omi",
}

# Address allow-lists: city/state/zip entries whose value is not listed here
# are treated as noise by the cleanup. Everything below describes
# San Francisco, CA.
KNOWN_CITIES = {"san francisco", "sf"}
KNOWN_STATES = {"california", "ca"}
KNOWN_ZIPS = {
    "94102", "94103", "94105", "94107", "94109", "94110", "94111",
    "94114", "94115", "94117", "94118", "94121", "94122", "94123",
    "94124", "94127", "94129", "94130", "94131", "94132", "94133",
    "94134",
}
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def now_iso():
    """Return the current UTC time as a timezone-aware ISO 8601 string."""
    current = datetime.now(tz=timezone.utc)
    return current.isoformat()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def normalize_phone(phone: str) -> str | None:
|
|
57
|
+
"""Normalize phone to digits only, return None if not a valid US/intl number."""
|
|
58
|
+
digits = re.sub(r"\D", "", phone)
|
|
59
|
+
if digits.startswith("1") and len(digits) == 11:
|
|
60
|
+
digits = digits[1:]
|
|
61
|
+
if len(digits) == 10:
|
|
62
|
+
return digits
|
|
63
|
+
return None
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def normalize_email(email: str) -> str:
    """Return *email* with surrounding whitespace removed and letters lower-cased."""
    trimmed = email.strip()
    return trimmed.lower()
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def run_cleanup(db_path: str = DB_PATH, dry_run: bool = False):
    """Apply the rule-based cleanup passes to a memories database.

    Passes, in order:
      1. Delete every ``autofill:*`` key.
      2. Delete every ``address_type_*`` key.
      3. For each key in SINGLE_VALUE_KEYS keep one winner (the highest
         ``appeared_count`` among values that look like real names), clear
         its supersession markers and delete every other row for that key.
      4. Delete all rows still marked as superseded.
      5. Collapse ``phone`` rows that normalize to the same number.
      6. Collapse ``email`` rows that normalize to the same address.
      7. Delete rows whose value matches NOISE_VALUE_PATTERNS.
      8. Delete ``city``/``state``/``zip`` rows not in the KNOWN_* sets.
      9. Mark every remaining unreviewed, non-superseded row as reviewed.

    Rows whose value is not a string (e.g. NULL) are left untouched by the
    value-based passes instead of raising.

    Args:
        db_path: Database path handed to ``MemoryDB``.
        dry_run: When true, no delete, update or commit is issued. The
            counters still report what each pass selected, computed
            against the unmodified database, so later passes may count
            rows an earlier pass would already have removed.

    Returns:
        Dict of per-pass counters.
    """
    mem = MemoryDB(db_path)
    try:
        conn = mem.conn

        stats = {
            "autofill_deleted": 0,
            "address_type_deleted": 0,
            "superseded_deleted": 0,
            "single_value_fixed": 0,
            "phone_deduped": 0,
            "email_deduped": 0,
            "noise_pattern_deleted": 0,
            "address_noise_deleted": 0,
            "marked_reviewed": 0,
        }

        def delete_id(mid):
            if not dry_run:
                mem.delete(mid)

        def commit():
            if not dry_run:
                conn.commit()

        def mark_reviewed(ids):
            if not dry_run and ids:
                mem.mark_reviewed(ids)
            stats["marked_reviewed"] += len(ids)

        def delete_where(sql, stat_key):
            """Delete every id selected by *sql*; record and return the count."""
            ids = [row[0] for row in conn.execute(sql).fetchall()]
            for mid in ids:
                delete_id(mid)
            stats[stat_key] = len(ids)
            return len(ids)

        def looks_like_name(value):
            """True when *value* is a string that is not obvious garbage."""
            return (
                isinstance(value, str)
                and value.lower() not in NOISE_WORDS
                and "_" not in value  # code identifiers such as investor_role
                and len(value) > 2
                and not value.isdigit()
            )

        def dedupe(key, normalize, stat_key):
            """Keep the highest-count row per normalized value of *key*."""
            rows = conn.execute(
                "SELECT id, value, appeared_count FROM memories "
                "WHERE key=? AND superseded_by IS NULL",
                (key,),
            ).fetchall()

            groups: dict[str, list] = {}
            for mid, val, count in rows:
                norm = normalize(val) if isinstance(val, str) else None
                # Values that can't be normalized are left alone.
                if norm is not None:
                    groups.setdefault(norm, []).append((mid, count))

            for norm, entries in groups.items():
                if len(entries) <= 1:
                    continue
                # max() returns the first maximal entry, i.e. the same row a
                # stable descending sort would put first.
                winner_id = max(entries, key=lambda e: e[1] or 0)[0]
                for mid, _ in entries:
                    if mid != winner_id:
                        delete_id(mid)
                        stats[stat_key] += 1
                log.info(f" {key} {norm}: kept id={winner_id}, deleted {len(entries)-1} duplicates")

        # 1. autofill:* keys
        removed = delete_where(
            "SELECT id FROM memories WHERE key LIKE 'autofill:%'",
            "autofill_deleted",
        )
        log.info(f"Deleted {removed} autofill:* entries")

        # 2. address_type_* keys. "_" is a single-character wildcard in LIKE,
        # so it is escaped to match only the literal prefix.
        removed = delete_where(
            "SELECT id FROM memories WHERE key LIKE 'address!_type!_%' ESCAPE '!'",
            "address_type_deleted",
        )
        log.info(f"Deleted {removed} address_type_* entries")

        # 3. Single-value key chains. Must run BEFORE superseded rows are
        # deleted so the whole chain (including a superseded real value) is
        # still available when picking the winner.
        for key in SINGLE_VALUE_KEYS:
            rows = conn.execute(
                "SELECT id, value, appeared_count FROM memories WHERE key=? ORDER BY appeared_count DESC",
                (key,),
            ).fetchall()
            if not rows:
                continue

            plausible = [row for row in rows if looks_like_name(row[1])]
            # Fall back to the overall top row when everything looks like garbage.
            best = (plausible or rows)[0]
            winner_id, winner_val = best[0], best[1]

            if not dry_run:
                conn.execute(
                    "UPDATE memories SET superseded_by=NULL, superseded_at=NULL WHERE id=?",
                    (winner_id,),
                )

            losers = [row[0] for row in rows if row[0] != winner_id]
            for mid in losers:
                delete_id(mid)
            # The counter tracks deleted rows, not keys.
            stats["single_value_fixed"] += len(losers)

            if losers:
                log.info(f" {key}: winner='{winner_val}' (deleted {len(losers)} others)")

        commit()

        # 4. Remaining superseded rows
        removed = delete_where(
            "SELECT id FROM memories WHERE superseded_by IS NOT NULL",
            "superseded_deleted",
        )
        log.info(f"Deleted {removed} superseded entries")

        # 5 + 6. Duplicate phones and emails
        dedupe("phone", normalize_phone, "phone_deduped")
        dedupe("email", normalize_email, "email_deduped")

        # 7. Known noise patterns (compiled once, not per row)
        noise_res = [re.compile(p, re.IGNORECASE) for p in NOISE_VALUE_PATTERNS]
        live_rows = conn.execute(
            "SELECT id, value FROM memories WHERE superseded_by IS NULL"
        ).fetchall()
        for mid, val in live_rows:
            if isinstance(val, str) and any(rx.search(val) for rx in noise_res):
                delete_id(mid)
                stats["noise_pattern_deleted"] += 1

        commit()

        # 8. Address rows outside the allow-lists.
        # NOTE(review): KNOWN_* only describe San Francisco, CA, so this pass
        # deletes every other city/state/zip — confirm that is intended for
        # databases belonging to users elsewhere.
        address_rules = (
            ("city", KNOWN_CITIES, lambda v: v.lower().strip()),
            ("state", KNOWN_STATES, lambda v: v.lower().strip()),
            ("zip", KNOWN_ZIPS, lambda v: v.strip()),
        )
        for key, allowed, fold in address_rules:
            rows = conn.execute(
                "SELECT id, value FROM memories WHERE key=? AND superseded_by IS NULL",
                (key,),
            ).fetchall()
            for mid, val in rows:
                if isinstance(val, str) and fold(val) not in allowed:
                    delete_id(mid)
                    stats["address_noise_deleted"] += 1

        commit()
        log.info(f"Deleted {stats['address_noise_deleted']} bad address entries")

        # 9. Everything that survived counts as reviewed
        unreviewed_ids = [row[0] for row in conn.execute(
            "SELECT id FROM memories WHERE reviewed_at IS NULL AND superseded_by IS NULL"
        ).fetchall()]
        mark_reviewed(unreviewed_ids)

        commit()

        # Report
        final_stats = mem.stats()
        still_unreviewed = conn.execute(
            "SELECT COUNT(*) FROM memories WHERE reviewed_at IS NULL"
        ).fetchone()[0]

        log.info("\n── Cleanup complete ──────────────────────────────────")
        report = (
            ("autofill:* deleted:", "autofill_deleted"),
            ("address_type_* deleted:", "address_type_deleted"),
            ("superseded deleted:", "superseded_deleted"),
            ("single-value fixed:", "single_value_fixed"),
            ("phone dupes removed:", "phone_deduped"),
            ("email dupes removed:", "email_deduped"),
            ("noise patterns deleted:", "noise_pattern_deleted"),
            ("address noise deleted:", "address_noise_deleted"),
            ("marked reviewed:", "marked_reviewed"),
        )
        for label, stat_key in report:
            log.info(f" {label} {stats[stat_key]}")
        log.info(f"\n Final DB: {final_stats['total_memories']} memories")
        log.info(f" Unreviewed remaining: {still_unreviewed}")
        log.info("\n── Profile after cleanup ─────────────────────────────")
        print(mem.profile_text())

        return stats
    finally:
        # Release the database handle even when a pass raised.
        mem.close()
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
if __name__ == "__main__":
    import argparse

    # Command-line entry point: optional --dry-run and --db override.
    cli = argparse.ArgumentParser(description="Rule-based cleanup of memories.db")
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be deleted without changing anything",
    )
    cli.add_argument("--db", default=DB_PATH, help="Path to memories.db")
    options = cli.parse_args()

    if options.dry_run:
        log.info("DRY RUN — no changes will be made")

    run_cleanup(db_path=options.db, dry_run=options.dry_run)
|
package/extract.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""CLI entry point for ai-browser-profile extraction."""
|
|
3
|
+
|
|
4
|
+
import argparse
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
from ai_browser_profile import extract_memories
|
|
8
|
+
from clean import run_cleanup
|
|
9
|
+
|
|
10
|
+
logging.basicConfig(
|
|
11
|
+
level=logging.INFO,
|
|
12
|
+
format="%(asctime)s %(levelname)s %(name)s: %(message)s",
|
|
13
|
+
datefmt="%H:%M:%S",
|
|
14
|
+
)
|
|
15
|
+
log = logging.getLogger("extract")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def main():
    """Parse the command line, run the extraction, then the auto-cleanup.

    The extraction writes to the database given by ``--output``. Unless
    ``--no-clean`` is passed, ``run_cleanup`` is executed on that same
    database afterwards.
    """
    parser = argparse.ArgumentParser(description="Extract user memories from browser data")
    parser.add_argument("--output", "-o", default="memories.db",
                        help="Output memories database path (default: memories.db)")
    parser.add_argument("--browsers", nargs="*",
                        help="Only scan specific browsers (arc, chrome, safari, firefox, brave, edge)")
    parser.add_argument("--no-indexeddb", action="store_true",
                        help="Skip IndexedDB extraction (WhatsApp contacts)")
    parser.add_argument("--no-localstorage", action="store_true",
                        help="Skip Local Storage extraction (LinkedIn connections)")
    parser.add_argument("--no-notion", action="store_true",
                        help="Skip Notion extraction (workspace, users, pages)")
    parser.add_argument("--no-clean", action="store_true",
                        help="Skip auto-cleanup after extraction")
    args = parser.parse_args()

    # Browser names are matched case-insensitively; None means "all browsers".
    browsers = {b.lower() for b in args.browsers} if args.browsers else None

    mem = extract_memories(
        memories_db_path=args.output,
        browsers=browsers,
        skip_indexeddb=args.no_indexeddb,
        skip_localstorage=args.no_localstorage,
        skip_notion=args.no_notion,
    )
    try:
        stats = mem.stats()
        log.info(f"Extraction done — {stats['total_memories']} memories in {args.output}")
    finally:
        # Close the handle before cleanup reopens the same database file,
        # even if reading the stats failed.
        mem.close()

    if not args.no_clean:
        log.info("Running auto-cleanup...")
        run_cleanup(db_path=args.output)


if __name__ == "__main__":
    main()
|
package/package.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "ai-browser-profile",
|
|
3
|
+
"version": "1.0.5",
|
|
4
|
+
"description": "Extract user identity (name, emails, accounts, addresses, payments) from browser data into a self-ranking SQLite database. Install as a Claude Code agent skill.",
|
|
5
|
+
"bin": {
|
|
6
|
+
"ai-browser-profile": "bin/cli.js"
|
|
7
|
+
},
|
|
8
|
+
"files": [
|
|
9
|
+
"bin/",
|
|
10
|
+
"ai_browser_profile/**/*.py",
|
|
11
|
+
"skill/",
|
|
12
|
+
"review/",
|
|
13
|
+
"setup/",
|
|
14
|
+
"autofill/",
|
|
15
|
+
"whatsapp/",
|
|
16
|
+
"extract.py",
|
|
17
|
+
"clean.py"
|
|
18
|
+
],
|
|
19
|
+
"keywords": [
|
|
20
|
+
"browser-data",
|
|
21
|
+
"browser-profile",
|
|
22
|
+
"identity",
|
|
23
|
+
"claude",
|
|
24
|
+
"claude-code",
|
|
25
|
+
"ai-agent",
|
|
26
|
+
"knowledge-base",
|
|
27
|
+
"embeddings",
|
|
28
|
+
"autofill"
|
|
29
|
+
],
|
|
30
|
+
"author": "Matthew Diakonov",
|
|
31
|
+
"license": "MIT",
|
|
32
|
+
"repository": {
|
|
33
|
+
"type": "git",
|
|
34
|
+
"url": "git+https://github.com/m13v/ai-browser-profile.git"
|
|
35
|
+
},
|
|
36
|
+
"homepage": "https://github.com/m13v/ai-browser-profile",
|
|
37
|
+
"engines": {
|
|
38
|
+
"node": ">=16"
|
|
39
|
+
}
|
|
40
|
+
}
|
package/review/SKILL.md
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: memory-review
|
|
3
|
+
description: "Review and clean unreviewed memories in the database. Removes junk, merges duplicates, fixes miskeyed data, and marks good entries as reviewed. Run periodically after extraction."
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Memory Review
|
|
7
|
+
|
|
8
|
+
LLM-powered post-ingestion review of the user memories database. Processes unreviewed memories in phases — bulk cleanup first, then per-entry review.
|
|
9
|
+
|
|
10
|
+
## Setup
|
|
11
|
+
|
|
12
|
+
```python
|
|
13
|
+
import sys, os
|
|
14
|
+
sys.path.insert(0, os.path.expanduser("~/ai-browser-profile"))
|
|
15
|
+
from ai_browser_profile import MemoryDB
|
|
16
|
+
|
|
17
|
+
mem = MemoryDB(os.path.expanduser("~/ai-browser-profile/memories.db"))
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Workflow
|
|
21
|
+
|
|
22
|
+
### Phase 0: Fast Pass (bulk cleanup)
|
|
23
|
+
|
|
24
|
+
Before reviewing individual entries, bulk-delete entire categories of known noise. This typically removes ~50% of entries instantly.
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
# 1. Delete ALL autofill:* entries — these are raw form field duplicates of real data
|
|
28
|
+
autofill_ids = [r[0] for r in mem.conn.execute(
|
|
29
|
+
"SELECT id FROM memories WHERE key LIKE 'autofill:%' AND reviewed_at IS NULL"
|
|
30
|
+
).fetchall()]
|
|
31
|
+
for mid in autofill_ids:
|
|
32
|
+
mem.delete(mid)
|
|
33
|
+
|
|
34
|
+
# 2. Delete ALL address_type_* entries — always noise (last names, numbers in address fields)
|
|
35
|
+
addr_type_ids = [r[0] for r in mem.conn.execute(
|
|
36
|
+
"SELECT id FROM memories WHERE key LIKE 'address_type_%' AND reviewed_at IS NULL"
|
|
37
|
+
).fetchall()]
|
|
38
|
+
for mid in addr_type_ids:
|
|
39
|
+
mem.delete(mid)
|
|
40
|
+
|
|
41
|
+
# 3. Delete ALL superseded entries
|
|
42
|
+
superseded_ids = [r[0] for r in mem.conn.execute(
|
|
43
|
+
"SELECT id FROM memories WHERE superseded_by IS NOT NULL AND reviewed_at IS NULL"
|
|
44
|
+
).fetchall()]
|
|
45
|
+
for mid in superseded_ids:
|
|
46
|
+
mem.delete(mid)
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
**Check for leaked secrets before running the bulk delete above.** Autofill values can contain credentials: search them for `client_secret`, `api_key`, `token`, and `password`, and flag any matches to the user before the entries are deleted.
|
|
50
|
+
|
|
51
|
+
### Phase 1: Supersession Chain Repair
|
|
52
|
+
|
|
53
|
+
After the fast pass, single-value key chains (`first_name`, `last_name`, `full_name`, `card_holder_name`, `email`, `phone`) are often corrupted: during extraction each new autofill entry blindly supersedes the previous one, so the real value ends up replaced by garbage.
|
|
54
|
+
|
|
55
|
+
Common pattern: `first_name` chain goes Matthew → Marina → mediar → ... leaving "mediar" as the active value.
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
# For each single-value key, check the active (non-superseded) value
|
|
59
|
+
SINGLE_VALUE_KEYS = ["first_name", "last_name", "full_name", "card_holder_name", "email", "phone"]
|
|
60
|
+
for key in SINGLE_VALUE_KEYS:
|
|
61
|
+
rows = mem.conn.execute(
|
|
62
|
+
"SELECT id, value, confidence, superseded_by FROM memories WHERE key=? ORDER BY id",
|
|
63
|
+
(key,)
|
|
64
|
+
).fetchall()
|
|
65
|
+
print(f"\n{key}:")
|
|
66
|
+
for r in rows:
|
|
67
|
+
status = "ACTIVE" if r[3] is None else f"superseded by {r[3]}"
|
|
68
|
+
print(f" id={r[0]} val='{r[1]}' conf={r[2]} [{status}]")
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Fix by:
|
|
72
|
+
1. Deleting garbage entries (code identifiers, other people's names, company names in name fields)
|
|
73
|
+
2. Unsuperseding the real value: `UPDATE memories SET superseded_by=NULL, superseded_at=NULL WHERE id=?`
|
|
74
|
+
3. Boosting confidence on the real value if needed
|
|
75
|
+
|
|
76
|
+
### Phase 2: Per-Entry Review
|
|
77
|
+
|
|
78
|
+
Now process remaining unreviewed entries in batches:
|
|
79
|
+
|
|
80
|
+
1. Call `mem.get_unreviewed(limit=50)` to get a batch
|
|
81
|
+
2. Print the batch as a numbered table
|
|
82
|
+
3. Classify each as KEEP, DELETE, MERGE, or FIX
|
|
83
|
+
4. Execute actions via `mem.delete()`, `mem.update_memory()`, etc.
|
|
84
|
+
5. Call `mem.mark_reviewed([...ids...])` on all processed IDs (including kept ones)
|
|
85
|
+
6. Print summary per batch
|
|
86
|
+
7. Repeat until no unreviewed remain
|
|
87
|
+
|
|
88
|
+
### Phase 3: Profile Verification
|
|
89
|
+
|
|
90
|
+
After all entries are reviewed, inspect the profile output to catch any residual issues:
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
print(mem.profile_text())
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
Check for:
|
|
97
|
+
- Wrong name showing (garbage superseded the real one)
|
|
98
|
+
- Mixed cities/states (e.g. "San Francisco, New York" in the same address)
|
|
99
|
+
- Garbage card holder names
|
|
100
|
+
- Missing fields that should be populated
|
|
101
|
+
|
|
102
|
+
Fix any issues, then `mem.close()`.
|
|
103
|
+
|
|
104
|
+
## Classification Criteria
|
|
105
|
+
|
|
106
|
+
### DELETE — remove entirely
|
|
107
|
+
|
|
108
|
+
- **Gibberish/test data**: `"wegs sdg"`, `"asdf"`, `"test123"`, `"technical placeholder just to pay"`, single characters
|
|
109
|
+
- **Code identifiers in name fields**: values containing underscores (`investor_role`, `handle_new_workflow_analysis`, `on_low_level_event_insert`), or known non-names (`os`, `type`, `use-case`)
|
|
110
|
+
- **Leaked secrets**: values containing `client_secret`, `GOCSPX-`, full API keys, OAuth tokens, private keys — **flag to user before deleting**
|
|
111
|
+
- **Other people's data**: names/emails/phones that belong to someone else — UNLESS stored as `contact:*` or `linkedin:*` keys
|
|
112
|
+
- **Company names in name fields**: `first_name="mediar"`, `last_name="inc"`, `full_name="Mediar, inc."` — these are autofill bugs
|
|
113
|
+
- **Truncated names**: `last_name="Di"`, `full_name="Matthew Di"`, `full_name="Matt"` (incomplete)
|
|
114
|
+
- **Noise locations**: cities/states from insurance quoting, comparison shopping, or form testing
|
|
115
|
+
- **Expired card data**: card expiry dates in the past
|
|
116
|
+
- **Meaningless amounts**: `"100.00"`, `"199.00"`, `"1.00"`, `"$ 245.80"` — these are form values, not useful data
|
|
117
|
+
- **Duplicate phone formats**: keep only one format per phone number (prefer international format with `+1`)
|
|
118
|
+
|
|
119
|
+
### MERGE — combine duplicates
|
|
120
|
+
|
|
121
|
+
- **Same phone, different formats**: `"+1 650-796-1489"`, `"(650) 796-1489"`, `"6507961489"` — keep the international format with highest confidence, delete all others
|
|
122
|
+
- **Same email, different casing**: keep lowercase
|
|
123
|
+
- **Duplicate DOB entries**: identify the real one (consistent across sources) vs noise
|
|
124
|
+
- **Same contact, multiple entries**: merge into highest-confidence one
|
|
125
|
+
|
|
126
|
+
When merging: keep highest confidence. If tied, keep highest `accessed_count`. Delete the rest.
|
|
127
|
+
|
|
128
|
+
### FIX — correct bad data
|
|
129
|
+
|
|
130
|
+
- **Wrong key assignment**: `company="Dmitrii Diakonov"` → delete (person name in company field)
|
|
131
|
+
- **Wrong confidence**: real DOB at 0.4 while fake one at 0.6 → boost real one
|
|
132
|
+
- **Missing tags**: `phone` key without `phone` tag, `email` without `email` tag
|
|
133
|
+
- **Broken supersession**: use `UPDATE memories SET superseded_by=NULL, superseded_at=NULL WHERE id=?`
|
|
134
|
+
- Use `mem.update_memory(id, key=..., value=..., confidence=..., tags=[...])` for fixes
|
|
135
|
+
|
|
136
|
+
### KEEP — mark as reviewed
|
|
137
|
+
|
|
138
|
+
- Genuine user data
|
|
139
|
+
- Correctly keyed and tagged entries
|
|
140
|
+
- Legitimate contacts, accounts, tools
|
|
141
|
+
- Just call `mem.mark_reviewed([id])` — no changes needed
|
|
142
|
+
|
|
143
|
+
## Output Format
|
|
144
|
+
|
|
145
|
+
For each batch, print a table like:
|
|
146
|
+
|
|
147
|
+
```
|
|
148
|
+
Batch 1/N (50 memories)
|
|
149
|
+
───────────────────────────────────────
|
|
150
|
+
ID | Key | Value | Conf | Action
|
|
151
|
+
----|------------------|--------------------|------|--------
|
|
152
|
+
1 | first_name | Matthew | 0.8 | KEEP
|
|
153
|
+
2 | full_name | investor_role | 0.5 | DELETE (code identifier)
|
|
154
|
+
3 | phone | +14155551234 | 0.7 | KEEP
|
|
155
|
+
4 | phone | (415) 555-1234 | 0.5 | MERGE → #3
|
|
156
|
+
...
|
|
157
|
+
|
|
158
|
+
Summary: 30 kept, 12 deleted, 5 fixed, 3 merged
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
At the end, print the profile and final stats:
|
|
162
|
+
|
|
163
|
+
```
|
|
164
|
+
## Final Profile
|
|
165
|
+
[output of mem.profile_text()]
|
|
166
|
+
|
|
167
|
+
## Stats
|
|
168
|
+
Total: X memories (was Y before review)
|
|
169
|
+
Deleted: Z, Kept: W, Fixed: V, Merged: U
|
|
170
|
+
Secrets found and removed: N
|
|
171
|
+
```
|
package/review/run.sh
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
#!/bin/bash
# Memory Review — weekly extract + cleanup + LLM review
#   0. Run extract.py to ingest new browser data
#   1. Run clean.py for the rule-based cleanup
#   2. Run Claude to review/clean whatever is still unreviewed
# Called by launchd weekly (604800s)

set -euo pipefail

# Checkout location. The legacy ~/user-memories path is still preferred when
# it exists; otherwise use ~/ai-browser-profile, the path the skill docs use.
REPO="$HOME/user-memories"
if [ ! -d "$REPO" ] && [ -d "$HOME/ai-browser-profile" ]; then
    REPO="$HOME/ai-browser-profile"
fi
SKILL_FILE="$REPO/review/SKILL.md"
LOG_DIR="$REPO/review/logs"
VENV="$REPO/.venv/bin/activate"
DB="$REPO/memories.db"

mkdir -p "$LOG_DIR"
LOG_FILE="$LOG_DIR/$(date +%Y-%m-%d_%H%M%S).log"

# Prune old logs (keep last 30 days) on every exit path, including the
# early "nothing to review" exits.
trap 'find "$LOG_DIR" -name "*.log" -mtime +30 -delete 2>/dev/null || true' EXIT

# Print the number of unreviewed entries in memories.db.
# Errors go to the log file instead of being discarded, so a broken import
# or missing database is visible. The package is imported under its current
# name first, then under the legacy name.
count_unreviewed() {
    (
        cd "$REPO" && source "$VENV" && python -c "
import sys; sys.path.insert(0, '.')
try:
    from ai_browser_profile import MemoryDB
except ImportError:
    from user_memories import MemoryDB
m = MemoryDB('memories.db')
print(len(m.get_unreviewed(limit=10000)))
m.close()
"
    ) 2>>"$LOG_FILE"
}

echo "=== Memory Review Run: $(date) ===" | tee "$LOG_FILE"

# Phase 0: Extract new browser data
# NOTE(review): extract.py runs the clean.py cleanup itself unless --no-clean
# is passed, and that cleanup marks surviving entries as reviewed, so the
# count below may always be 0 — confirm whether --no-clean is wanted here.
echo "--- Extracting browser data ---" | tee -a "$LOG_FILE"
(
    cd "$REPO"
    source "$VENV"
    python extract.py 2>&1
) | tee -a "$LOG_FILE"

# Check if there are unreviewed entries
if ! UNREVIEWED=$(count_unreviewed); then
    echo "ERROR: could not count unreviewed entries (details in $LOG_FILE)" | tee -a "$LOG_FILE"
    exit 1
fi

echo "Unreviewed entries: $UNREVIEWED" | tee -a "$LOG_FILE"

if [ "$UNREVIEWED" = "0" ]; then
    echo "No new entries to review. Done." | tee -a "$LOG_FILE"
    exit 0
fi

# Phase 1: Rule-based cleanup
echo "--- Running rule-based cleanup ---" | tee -a "$LOG_FILE"
(
    cd "$REPO"
    source "$VENV"
    python clean.py 2>&1
) | tee -a "$LOG_FILE"

# Re-check unreviewed after cleanup
if ! UNREVIEWED=$(count_unreviewed); then
    echo "ERROR: could not count unreviewed entries (details in $LOG_FILE)" | tee -a "$LOG_FILE"
    exit 1
fi

if [ "$UNREVIEWED" = "0" ]; then
    echo "All entries handled by cleanup. Done." | tee -a "$LOG_FILE"
    exit 0
fi

# Phase 2: Claude reviews remaining entries
echo "--- Starting Claude review ($UNREVIEWED entries) ---" | tee -a "$LOG_FILE"
claude -p "You are the Memory Review agent. You clean up the user memories database after extraction.

Read $SKILL_FILE for full classification criteria and workflow.

DB path: $DB
Module path: $REPO

Run Phases 1-3 from the skill file (Phase 0 fast pass already done by clean.py).
Process remaining unreviewed entries in batches of 50.
Print a summary at the end." --max-turns 80 2>&1 | tee -a "$LOG_FILE"

echo "=== Run complete: $(date) ===" | tee -a "$LOG_FILE"
|