ai-browser-profile 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,208 @@
1
+ """Lookup maps and browser paths for memory extraction. Self-contained, no external imports."""
2
+
3
+ import re
4
+
5
+ from pathlib import Path
6
+
7
# Per-user "Library/Application Support" directory; the browser data
# directories in BROWSER_PATHS are resolved relative to it.
APP_SUPPORT = Path.home().joinpath("Library", "Application Support")
8
+
9
# Chromium address_type_tokens type codes -> (key_name, tags).
# Built from a row table; every entry gets its own tag list so that a caller
# mutating one entry's tags cannot affect another entry.
ADDRESS_TYPE_MAP = {
    type_code: (key_name, list(tag_names))
    for type_code, key_name, tag_names in (
        (3, "first_name", ("identity",)),
        (5, "last_name", ("identity",)),
        (7, "full_name", ("identity",)),
        (9, "email", ("identity", "email", "communication")),
        (14, "phone", ("identity", "phone", "communication")),
        (33, "city", ("address", "location")),
        (34, "state", ("address", "location")),
        (35, "zip", ("address", "location")),
        (36, "country", ("address", "location")),
        (60, "company", ("identity", "company", "work")),
        (77, "street_address", ("address", "location")),
        (79, "address_line_2", ("address", "location")),
    )
}
24
+
25
# Autofill form field names -> (normalized key, tags).
# Used for normalization — unmapped fields are still ingested under cleaned names.
# Each row is (normalized key, tags, aliases); every alias gets its own tag list.
# NOTE(review): clean_field_name() rewrites "-" to "_", so the hyphenated
# aliases below only match if the caller looks up the un-normalized name —
# confirm against the caller.
AUTOFILL_FIELD_MAP = {
    alias: (normalized_key, list(tag_names))
    for normalized_key, tag_names, aliases in (
        ("email", ("identity", "email", "communication"),
         ("email", "e-mail", "email_address", "emailaddress", "email-form-field")),
        ("full_name", ("identity",),
         ("name", "fullname", "full_name", "full-name")),
        ("first_name", ("identity",),
         ("firstname", "first_name", "first-name", "given-name")),
        ("last_name", ("identity",),
         ("lastname", "last_name", "last-name", "family-name")),
        ("phone", ("identity", "phone", "communication"),
         ("phone", "tel", "telephone", "mobile", "phonenumber", "phone_number",
          "mobilenumber")),
        ("city", ("address", "location"), ("city",)),
        ("state", ("address", "location"), ("state",)),
        ("zip", ("address", "location"),
         ("zip", "zipcode", "postal", "postalcode", "postal_code")),
        ("country", ("address", "location"), ("country",)),
        ("street_address", ("address", "location"),
         ("address", "street", "address1")),
        ("company", ("identity", "company", "work"),
         ("company", "companyname", "company_name", "organization")),
        ("username", ("identity", "account", "credential"),
         ("username", "login", "identifier")),
        ("date_of_birth", ("identity",),
         ("dob", "dateofbirth", "date_of_birth", "date-of-birth", "birth-date",
          "birthdate", "birthday")),
        ("gender", ("identity",), ("gender", "sex")),
    )
    for alias in aliases
}
80
+
81
# Keywords in field names → tags (for unmapped fields).
# infer_tags() applies a tag when any of its keywords occurs as a substring of
# the field name. Keywords are written space-separated and split into lists.
TAG_KEYWORDS = {
    tag: keyword_text.split()
    for tag, keyword_text in (
        ("identity",
         "birth dob gender sex name first last middle suffix "
         "passport nationality citizen ssn"),
        ("travel",
         "travel flyer frequent airline seat meal tsa "
         "known_traveler traveler passenger boarding flight loyalty"),
        ("address", "address street city state zip postal country apt"),
        ("payment", "card payment billing cvv expir"),
        ("communication", "email phone tel mobile fax"),
        ("account", "username login password account"),
        ("work", "company organization employer job title occupation"),
    )
}
93
+
94
# Regexes for noise detection in raw field names (all anchored at the start).
# UUID-looking prefix: 8 hex digits, dash, 4 hex digits, dash. Case-insensitive
# because the check runs on raw names, where hex digits may be upper-case.
_UUID_RE = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-', re.IGNORECASE)
# Names consisting only of digits.
_PURE_DIGITS_RE = re.compile(r'^\d+$')
# Ten or more digits (a subset of what _PURE_DIGITS_RE already matches).
_TIMESTAMP_RE = re.compile(r'^\d{10,}$')
# Selector-style names: "selectors.", "role:" or "#<digit>".
_SELECTOR_RE = re.compile(r'^(selectors\.|role:|#\d)')
# Grid cells such as "cell-3-7" or "Cell_3_7".
_CELL_RE = re.compile(r'^cell[-_]\d+[-_]\d+', re.IGNORECASE)
# Known widget-generated name prefixes.
_INTERNAL_RE = re.compile(r'^(react_aria|emoji_popover|_\drif_|docs_findandreplace|input\d+_\d+|single_line_text_form_component|single_typeahead_entity_form_component)')
# Month abbreviation followed by a four-digit year, e.g. "Jan 2024".
_MONTH_YEAR_RE = re.compile(r'^(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{4}$', re.IGNORECASE)
102
+
103
+
104
def clean_field_name(raw: str) -> str:
    """Normalize a raw autofill field name to a clean key.

    Steps, in order: trim whitespace, keep only the last segment of a dotted
    path, drop ``[N]`` array indices, drop a leading numeric prefix, drop a
    trailing ``-word-word-word-N`` ID suffix, lowercase, and finally replace
    ``-`` and ``/`` with ``_``.

    Examples:
        'rtiTraveler.travelers[0].firstName' → 'firstname'
        'dateofbirth-mercury-utils-id-304' → 'dateofbirth'
        '1-date_of_birth' → 'date_of_birth'
        '0-1/email' → 'email'
        'BirthDayM' → 'birthdaym'

    Args:
        raw: Field name exactly as stored by the browser.

    Returns:
        The normalized, lowercased key.
    """
    field = raw.strip()
    # Extract last segment of dotted paths
    if "." in field:
        field = field.rsplit(".", 1)[-1]
    # Strip array indices
    field = re.sub(r'\[\d+\]', '', field)
    # Strip a leading numeric prefix like "1-" or "0-1/". Both forms are
    # handled by one pattern: stripping "N-" first would turn "0-1/x" into
    # "1/x", which the compound form could then no longer match.
    field = re.sub(r'^\d+-(?:\d+/)?', '', field)
    # Strip trailing ID suffixes like "-mercury-utils-id-304"
    field = re.sub(r'-[a-z]+-[a-z]+-[a-z]+-\d+$', '', field)
    # Lowercase
    field = field.lower().strip()
    # Replace separators with underscore for lookup
    return re.sub(r'[-/]', '_', field)
129
+
130
+
131
def is_noise_field(raw: str) -> bool:
    """Tell whether a raw field name is noise rather than a real form field.

    A name is noise when it matches, from its start, any of the module's noise
    patterns: UUID prefix, digits only, long digit run, selector prefix,
    spreadsheet-style cell, known widget-internal prefix, or "<month> <year>".
    """
    noise_patterns = (
        _UUID_RE,
        _PURE_DIGITS_RE,
        _TIMESTAMP_RE,
        _SELECTOR_RE,
        _CELL_RE,
        _INTERNAL_RE,
        _MONTH_YEAR_RE,
    )
    # Patterns are tried in order and evaluation stops at the first hit.
    return any(pattern.match(raw) for pattern in noise_patterns)
142
+
143
+
144
def infer_tags(field: str) -> list[str]:
    """Infer tags from a cleaned field name using keyword matching.

    A tag applies when any of its TAG_KEYWORDS entries occurs as a substring of
    ``field``. Tags are returned in TAG_KEYWORDS order, so the result is the
    same on every run (building the list from a set made the order depend on
    string hash randomization).

    Args:
        field: Cleaned field name.

    Returns:
        The matching tags, or ``["autofill"]`` when no keyword matches.
    """
    tags = [
        tag
        for tag, keywords in TAG_KEYWORDS.items()
        if any(kw in field for kw in keywords)
    ]
    return tags or ["autofill"]
155
+
156
# Domains -> friendly service names for tool/account detection.
# Written as (service, hosts) rows and flattened to a host -> service lookup;
# several hosts may share one service name.
SERVICE_NAMES = {
    host: service
    for service, hosts in (
        ("GitHub", ("github.com",)),
        ("GitLab", ("gitlab.com",)),
        ("Stack Overflow", ("stackoverflow.com",)),
        ("Figma", ("figma.com",)),
        ("Notion", ("notion.so",)),
        ("Trello", ("trello.com",)),
        ("Slack", ("slack.com", "app.slack.com")),
        ("Linear", ("linear.app",)),
        ("Vercel", ("vercel.com",)),
        ("Netlify", ("netlify.com",)),
        ("AWS", ("aws.amazon.com",)),
        ("GCP", ("console.cloud.google.com",)),
        ("Azure", ("portal.azure.com",)),
        ("ChatGPT", ("chatgpt.com", "chat.openai.com")),
        ("Claude", ("claude.ai",)),
        ("Anthropic Console", ("console.anthropic.com",)),
        ("Google Docs", ("docs.google.com",)),
        ("Google Sheets", ("sheets.google.com",)),
        ("Google Drive", ("drive.google.com",)),
        ("Gmail", ("mail.google.com",)),
        ("Google Calendar", ("calendar.google.com",)),
        ("Google Meet", ("meet.google.com",)),
        ("X/Twitter", ("twitter.com", "x.com")),
        ("LinkedIn", ("linkedin.com", "www.linkedin.com")),
        ("Instagram", ("instagram.com", "www.instagram.com")),
        ("Facebook", ("facebook.com", "www.facebook.com")),
        ("Reddit", ("reddit.com", "www.reddit.com")),
        ("YouTube", ("youtube.com", "www.youtube.com")),
        ("Spotify", ("open.spotify.com",)),
        ("Stripe", ("dashboard.stripe.com", "stripe.com")),
        ("Supabase", ("supabase.com",)),
        ("Firebase", ("firebase.google.com",)),
        ("Sentry", ("sentry.io",)),
        ("PostHog", ("posthog.com", "us.posthog.com")),
        ("Mixpanel", ("mixpanel.com",)),
        ("Apollo", ("app.apollo.io",)),
        ("QuickBooks", ("quickbooks.intuit.com",)),
        ("WhatsApp", ("web.whatsapp.com",)),
        ("Discord", ("discord.com",)),
        ("Microsoft Teams", ("teams.microsoft.com",)),
        ("Canva", ("canva.com", "www.canva.com")),
        ("Excalidraw", ("excalidraw.com",)),
        ("CodeSandbox", ("codesandbox.io",)),
        ("CodePen", ("codepen.io",)),
        ("Cal.com", ("app.cal.com",)),
        ("Calendly", ("calendly.com",)),
        ("OpenPhone", ("my.openphone.com",)),
        ("Missive", ("mail.missiveapp.com",)),
        ("Gusto", ("app.gusto.com",)),
        ("Coinbase", ("coinbase.com", "www.coinbase.com")),
        ("Polymarket", ("polymarket.com",)),
        ("Product Hunt", ("producthunt.com", "www.producthunt.com")),
        ("Upwork", ("upwork.com", "www.upwork.com")),
        ("Fiverr", ("fiverr.com", "www.fiverr.com")),
    )
    for host in hosts
}
201
+
202
# Chromium browser data directories, keyed by the browser identifier and
# resolved under APP_SUPPORT.
BROWSER_PATHS = {
    "arc": APP_SUPPORT.joinpath("Arc", "User Data"),
    "chrome": APP_SUPPORT.joinpath("Google", "Chrome"),
    "brave": APP_SUPPORT.joinpath("BraveSoftware", "Brave-Browser"),
    "edge": APP_SUPPORT.joinpath("Microsoft Edge"),
}
@@ -0,0 +1,123 @@
1
+ """Ingest tool/service memories from browser history."""
2
+
3
+ import shutil
4
+ import sqlite3
5
+ import logging
6
+ from datetime import datetime, timedelta, timezone
7
+ from pathlib import Path
8
+
9
+ from ai_browser_profile.db import MemoryDB
10
+ from ai_browser_profile.ingestors.browser_detect import BrowserProfile, copy_db, domain
11
+ from ai_browser_profile.ingestors.constants import SERVICE_NAMES
12
+
13
+ log = logging.getLogger(__name__)
14
+
15
# Timezone-aware reference instants (UTC).
# NOTE(review): presumably the zero points of Chromium and Safari timestamp
# columns; neither constant is referenced in the code visible here — confirm.
CHROME_EPOCH = datetime(year=1601, month=1, day=1, tzinfo=timezone.utc)
MACOS_EPOCH = datetime(year=2001, month=1, day=1, tzinfo=timezone.utc)
17
+
18
+
19
def _chromium_history(profile: BrowserProfile) -> dict[str, int]:
    """Read domain visit counts from Chromium History SQLite.

    Works on a temporary copy made by ``copy_db`` and removes that copy's
    parent directory afterwards. Only the 10,000 most recently visited rows of
    the ``urls`` table are read.

    Args:
        profile: Browser profile whose ``path`` contains the ``History`` file.

    Returns:
        Mapping of domain to summed ``visit_count``; a missing or zero count
        contributes 1. Empty when the copy could not be made, and partial or
        empty when reading fails (the failure is logged, not raised).
    """
    counts: dict[str, int] = {}
    tmp = copy_db(profile.path / "History")
    if not tmp:
        return counts
    try:
        conn = sqlite3.connect(f"file:{tmp}?mode=ro", uri=True)
        try:
            conn.row_factory = sqlite3.Row
            for row in conn.execute(
                "SELECT url, visit_count FROM urls ORDER BY last_visit_time DESC LIMIT 10000"
            ):
                d = domain(row["url"])
                if d:
                    counts[d] = counts.get(d, 0) + (row["visit_count"] or 1)
        finally:
            # Close even when the query or a row fails, so the handle is
            # released before the temporary copy is deleted.
            conn.close()
    except Exception as e:
        log.warning(f"Failed to read History for {profile.browser}/{profile.name}: {e}")
    finally:
        shutil.rmtree(tmp.parent, ignore_errors=True)
    return counts
40
+
41
+
42
def _safari_history(profile: BrowserProfile) -> dict[str, int]:
    """Read domain visit counts from Safari History.db.

    Works on a temporary copy made by ``copy_db`` and removes that copy's
    parent directory afterwards. Only the 10,000 rows of ``history_items``
    with the highest ``visit_count`` are read.

    Args:
        profile: Browser profile whose ``path`` contains ``History.db``.

    Returns:
        Mapping of domain to summed ``visit_count``; a missing or zero count
        contributes 1. Empty when the copy could not be made, and partial or
        empty when reading fails (the failure is logged, not raised).
    """
    counts: dict[str, int] = {}
    tmp = copy_db(profile.path / "History.db")
    if not tmp:
        return counts
    try:
        conn = sqlite3.connect(f"file:{tmp}?mode=ro", uri=True)
        try:
            conn.row_factory = sqlite3.Row
            for row in conn.execute(
                "SELECT url, visit_count FROM history_items ORDER BY visit_count DESC LIMIT 10000"
            ):
                d = domain(row["url"])
                if d:
                    counts[d] = counts.get(d, 0) + (row["visit_count"] or 1)
        finally:
            # Close even when the query or a row fails, so the handle is
            # released before the temporary copy is deleted.
            conn.close()
    except Exception as e:
        log.warning(f"Failed to read Safari History: {e}")
    finally:
        shutil.rmtree(tmp.parent, ignore_errors=True)
    return counts
63
+
64
+
65
def _firefox_history(profile: BrowserProfile) -> dict[str, int]:
    """Read domain visit counts from Firefox places.sqlite.

    Works on a temporary copy made by ``copy_db`` and removes that copy's
    parent directory afterwards. Only the 10,000 rows of ``moz_places`` with
    the highest positive ``visit_count`` are read.

    Args:
        profile: Browser profile whose ``path`` contains ``places.sqlite``.

    Returns:
        Mapping of domain to summed ``visit_count``. Empty when the copy could
        not be made, and partial or empty when reading fails (the failure is
        logged, not raised).
    """
    counts: dict[str, int] = {}
    tmp = copy_db(profile.path / "places.sqlite")
    if not tmp:
        return counts
    try:
        conn = sqlite3.connect(f"file:{tmp}?mode=ro", uri=True)
        try:
            conn.row_factory = sqlite3.Row
            for row in conn.execute(
                "SELECT url, visit_count FROM moz_places WHERE visit_count > 0 ORDER BY visit_count DESC LIMIT 10000"
            ):
                d = domain(row["url"])
                if d:
                    counts[d] = counts.get(d, 0) + (row["visit_count"] or 1)
        finally:
            # Close even when the query or a row fails, so the handle is
            # released before the temporary copy is deleted.
            conn.close()
    except Exception as e:
        log.warning(f"Failed to read Firefox places.sqlite: {e}")
    finally:
        shutil.rmtree(tmp.parent, ignore_errors=True)
    return counts
86
+
87
+
88
def ingest_history(mem: MemoryDB, profiles: list[BrowserProfile]):
    """Turn browser history into ``tool:<service>`` memories.

    Visit counts are summed per domain over every supported profile. The 200
    most-visited domains are then checked against SERVICE_NAMES, and each known
    one is upserted with its visit total as the value, a category-dependent tag
    list, and ``history:<domain>`` as the source.

    NOTE(review): several domains map to the same service name (for example
    twitter.com and x.com), so one ``tool:`` key can be upserted more than once
    with different totals — confirm how ``MemoryDB.upsert`` treats that.
    """
    # Which reader handles which browser; other browsers are skipped.
    readers = {
        "arc": _chromium_history,
        "chrome": _chromium_history,
        "brave": _chromium_history,
        "edge": _chromium_history,
        "safari": _safari_history,
        "firefox": _firefox_history,
    }

    # Extra tags appended after the base tags, keyed by service name.
    extra_tags: dict[str, tuple[str, ...]] = {}
    for services, extras in (
        (("GitHub", "GitLab", "Vercel", "Netlify", "Supabase", "Firebase", "CodeSandbox"),
         ("work", "dev")),
        (("Gmail", "Slack", "WhatsApp", "Discord", "Microsoft Teams", "Missive", "OpenPhone"),
         ("communication",)),
        (("LinkedIn", "X/Twitter", "Instagram", "Facebook", "Reddit", "YouTube", "Product Hunt"),
         ("social",)),
        (("Stripe", "QuickBooks", "Coinbase", "Gusto", "Polymarket"),
         ("finance",)),
        (("ChatGPT", "Claude", "Anthropic Console"),
         ("ai",)),
    ):
        for service_name in services:
            extra_tags[service_name] = extras

    # Aggregate domain counts across all profiles
    totals: dict[str, int] = {}
    for profile in profiles:
        reader = readers.get(profile.browser)
        if reader is None:
            continue
        for host, visits in reader(profile).items():
            totals[host] = totals.get(host, 0) + visits

    # Convert to tool/service memories: most-visited first, ties keep
    # first-seen order, top 200 only.
    ranked = sorted(totals.items(), key=lambda item: item[1], reverse=True)
    for host, total in ranked[:200]:
        service = SERVICE_NAMES.get(host)
        if host not in SERVICE_NAMES:
            continue
        tags = ["account", "tool", *extra_tags.get(service, ())]
        mem.upsert(f"tool:{service}", str(total), tags, source=f"history:{host}")

    known = sum(1 for host in totals if host in SERVICE_NAMES)
    log.info(f" History: {len(totals)} domains, {known} known services")
@@ -0,0 +1,203 @@
1
+ """Ingest WhatsApp contacts from Chromium IndexedDB (LevelDB)."""
2
+
3
+ import json
4
+ import shutil
5
+ import tempfile
6
+ import logging
7
+ from pathlib import Path
8
+
9
+ from ai_browser_profile.db import MemoryDB
10
+ from ai_browser_profile.ingestors.browser_detect import BrowserProfile
11
+
12
+ log = logging.getLogger(__name__)
13
+
14
+
15
def _copy_dir(src: Path) -> Path:
    """Copy a directory to temp to avoid browser locks.

    A fresh temporary directory (prefix ``ai_browser_profile_idb_``) is created
    and ``src`` is copied into it under its own name.

    Args:
        src: Directory to copy.

    Returns:
        Path of the copy. Its parent is the temporary directory, which the
        caller is responsible for removing.

    Raises:
        Exception: Whatever ``shutil.copytree`` raises. The temporary
            directory is removed first, so a failed copy leaves nothing behind.
    """
    tmp = Path(tempfile.mkdtemp(prefix="ai_browser_profile_idb_"))
    dst = tmp / src.name
    try:
        shutil.copytree(src, dst)
    except Exception:
        # The caller never learns the temp path on failure, so clean up here.
        shutil.rmtree(tmp, ignore_errors=True)
        raise
    return dst
21
+
22
+
23
def _serialize_value(val, depth=0):
    """Convert a decoded IndexedDB value into plain JSON-compatible data.

    ``None``, bools, numbers and strings pass through unchanged. Bytes become
    UTF-8 text, or a ``<binary N bytes>`` marker when they do not decode.
    Dicts get string keys; lists and tuples become lists. Any other object
    exposing a ``value`` attribute is unwrapped, and everything else is
    stringified. Values nested more than 20 levels deep are replaced by
    ``<nested too deep>``.
    """
    if depth > 20:
        return "<nested too deep>"
    if val is None or isinstance(val, (bool, int, float, str)):
        return val

    child_depth = depth + 1
    if isinstance(val, bytes):
        try:
            text = val.decode("utf-8")
        except UnicodeDecodeError:
            text = f"<binary {len(val)} bytes>"
        return text
    if isinstance(val, dict):
        converted = {}
        for key, item in val.items():
            converted[str(key)] = _serialize_value(item, child_depth)
        return converted
    if isinstance(val, (list, tuple)):
        items = []
        for item in val:
            items.append(_serialize_value(item, child_depth))
        return items
    if not hasattr(val, "value"):
        return str(val)
    # Wrapper object: serialize whatever it wraps.
    return _serialize_value(val.value, child_depth)
43
+
44
+
45
def _normalize_phone(raw: str) -> str:
    """Normalize a phone number to digits-only with leading +.

    Every non-digit character is dropped, including any ``+`` already present,
    and a single ``+`` is prepended to what remains.

    Args:
        raw: Phone number in any formatting.

    Returns:
        ``"+<digits>"``, or ``raw`` unchanged when it contains no digits.
    """
    digits = "".join(c for c in raw if c.isdigit())
    if not digits:
        return raw
    # digits holds digit characters only, so it can never already start
    # with "+"; the prefix is always added.
    return "+" + digits
53
+
54
+
55
def _extract_phone(data: dict) -> str:
    """Extract and normalize a phone number from a WhatsApp contact record.

    The ``phoneNumber`` field is preferred. When it is empty and the ``id``
    field contains an ``@``, the part before the ``@`` is used instead.

    Args:
        data: Serialized contact record.

    Returns:
        The normalized phone number, or ``""`` when none could be found.
    """
    phone = data.get("phoneNumber") or ""
    jid = str(data.get("id") or "")

    if not phone and "@" in jid:
        # Extract digits from JID (works for both @c.us and @s.whatsapp.net)
        phone = jid.split("@")[0]

    if not phone:
        return ""
    # Coerce like the JID above: a non-string phoneNumber (e.g. an int) would
    # otherwise raise inside _normalize_phone.
    return _normalize_phone(str(phone))
67
+
68
+
69
def _merge_contact(seen: dict[tuple[str, str], dict], data) -> None:
    """Fold one serialized contact record into ``seen``, skipping junk names."""
    if not isinstance(data, dict):
        return

    name = data.get("name") or data.get("pushname") or data.get("verifiedName") or ""
    if not name:
        return

    # Skip junk names
    stripped = name.strip()
    if not stripped or stripped == "." or stripped == "<Undefined>":
        return
    # Skip emoji-only names (no alphanumeric chars)
    if not any(c.isalnum() for c in stripped):
        return

    phone = _extract_phone(data)
    jid = data.get("id") or ""

    tags = ["contact", "communication"]
    if data.get("isBusiness") or data.get("isEnterprise"):
        tags.append("work")

    # Fall back to the raw JID when no phone number could be extracted.
    value = phone if phone else str(jid)
    entry = seen.setdefault((name, value), {"tags": tags, "value": value})
    # Merge tags (e.g. one record has "work", the other doesn't)
    for t in tags:
        if t not in entry["tags"]:
            entry["tags"].append(t)


def _cleanup_legacy_whatsapp_rows(mem: MemoryDB) -> None:
    """Retire old JID-valued rows and delete junk-named WhatsApp rows."""
    # Clean up old JID-format entries (@c.us, @s.whatsapp.net) that now have normalized phone values
    old_jid_rows = mem.conn.execute("""
        SELECT id, key, value FROM memories
        WHERE source = 'whatsapp'
        AND (value LIKE '%@c.us' OR value LIKE '%@s.whatsapp.net')
        AND superseded_by IS NULL
    """).fetchall()

    cleaned = 0
    for old_id, old_key, _old_value in old_jid_rows:
        # Check if a normalized version exists for the same contact name
        normalized = mem.conn.execute("""
            SELECT id FROM memories
            WHERE key = ? AND source = 'whatsapp'
            AND value NOT LIKE '%@c.us' AND value NOT LIKE '%@s.whatsapp.net'
            AND superseded_by IS NULL
            LIMIT 1
        """, (old_key,)).fetchone()
        if normalized:
            mem.conn.execute(
                "UPDATE memories SET superseded_by = ? WHERE id = ?",
                (normalized[0], old_id),
            )
        else:
            # No normalized version — delete the junk entry entirely
            mem.conn.execute("DELETE FROM memories WHERE id = ?", (old_id,))
        cleaned += 1

    if cleaned:
        mem.conn.commit()
        log.info(f" Cleaned {cleaned} old JID-format WhatsApp entries")

    # Also clean up junk names that slipped through previous extractions
    junk = mem.conn.execute("""
        DELETE FROM memories
        WHERE source = 'whatsapp' AND superseded_by IS NULL
        AND (value = '<Undefined>' OR key = 'contact:.' OR key = 'contact:<Undefined>')
    """).rowcount
    if junk:
        mem.conn.commit()
        log.info(f" Deleted {junk} junk WhatsApp entries")


def ingest_indexeddb(mem: MemoryDB, profiles: list[BrowserProfile]):
    """Extract WhatsApp contacts from Chromium IndexedDB.

    For every non-Safari, non-Firefox profile, each WhatsApp IndexedDB LevelDB
    directory (and its blob directory, if present) is copied to a temporary
    location and its ``contact`` object store is read. Contacts are
    deduplicated on (name, normalized phone or raw JID) — WhatsApp stores each
    contact under both @c.us and @s.whatsapp.net JIDs — and tags from duplicate
    records are merged. The deduplicated contacts are upserted as
    ``contact:<name>``, after which legacy JID-valued and junk-named rows are
    cleaned up.

    A database that cannot be copied or read is logged and skipped; it does
    not abort the remaining databases or profiles.

    Args:
        mem: Memory database receiving the contacts.
        profiles: Browser profiles to scan.
    """
    from ccl_chromium_reader import ccl_chromium_indexeddb

    # Collect all contacts first, dedup by (name, phone)
    seen: dict[tuple[str, str], dict] = {}  # (name, phone) -> {tags, value}

    for profile in profiles:
        if profile.browser in ("safari", "firefox"):
            continue

        idb_root = profile.path / "IndexedDB"
        if not idb_root.exists():
            continue

        for db_dir in sorted(idb_root.glob("*whatsapp*_0.indexeddb.leveldb")):
            blob_dir = db_dir.parent / db_dir.name.replace(".leveldb", ".blob")

            tmp_db = None
            tmp_blob = None
            try:
                # Copy inside the try block: a locked or unreadable directory
                # is then logged and skipped instead of aborting the ingest,
                # and a failed blob copy still cleans up the database copy.
                tmp_db = _copy_dir(db_dir)
                if blob_dir.exists():
                    tmp_blob = _copy_dir(blob_dir)

                wrapper = ccl_chromium_indexeddb.WrappedIndexDB(
                    str(tmp_db),
                    str(tmp_blob) if tmp_blob else None,
                )

                for db_id in wrapper.database_ids:
                    try:
                        db = wrapper[db_id.name, db_id.origin]
                    except Exception:
                        # Best-effort: skip databases that cannot be opened.
                        continue

                    if "contact" not in db.object_store_names:
                        continue

                    for record in db["contact"].iterate_records():
                        try:
                            _merge_contact(seen, _serialize_value(record.value))
                        except Exception:
                            # Best-effort: one malformed record must not stop
                            # the rest of the store, but leave a trace.
                            log.debug("Skipping unreadable WhatsApp contact record", exc_info=True)
                            continue

            except Exception as e:
                log.warning(f"Failed to read WhatsApp IndexedDB for {profile.browser}/{profile.name}: {e}")
            finally:
                for copied in (tmp_db, tmp_blob):
                    if copied is not None:
                        shutil.rmtree(copied.parent, ignore_errors=True)

    # Upsert deduplicated contacts
    for (name, _phone), entry in seen.items():
        mem.upsert(f"contact:{name}", entry["value"], entry["tags"], source="whatsapp")

    _cleanup_legacy_whatsapp_rows(mem)

    log.info(f" IndexedDB: {len(seen)} WhatsApp contacts (deduplicated)")
@@ -0,0 +1,66 @@
1
+ """Ingest LinkedIn connections from Chromium Local Storage (LevelDB)."""
2
+
3
+ import json
4
+ import shutil
5
+ import tempfile
6
+ import logging
7
+ from pathlib import Path
8
+
9
+ from ai_browser_profile.db import MemoryDB
10
+ from ai_browser_profile.ingestors.browser_detect import BrowserProfile
11
+
12
+ log = logging.getLogger(__name__)
13
+
14
+
15
def ingest_localstorage(mem: MemoryDB, profiles: list[BrowserProfile]):
    """Pull LinkedIn connections out of Chromium Local Storage.

    For every non-Safari, non-Firefox profile, the Local Storage LevelDB
    directory is copied to a temporary location and scanned for the
    ``linkedin_assistant_profiles`` key under a LinkedIn origin. Each named
    profile in that JSON payload is upserted as ``linkedin:<name>``, with the
    title as the value or the profile URL when there is no title. Copy and
    read failures are logged and the profile is skipped; a record that fails
    to parse is skipped silently.
    """
    from ccl_chromium_reader import ccl_chromium_localstorage

    total = 0
    for profile in profiles:
        if profile.browser in ("safari", "firefox"):
            continue

        ls_dir = profile.path / "Local Storage" / "leveldb"
        if not ls_dir.exists():
            continue

        # Work on a private copy so the browser's own files are never opened.
        tmp = Path(tempfile.mkdtemp(prefix="ai_browser_profile_ls_"))
        tmp_ls = tmp / "leveldb"
        try:
            shutil.copytree(ls_dir, tmp_ls)
        except Exception as e:
            log.warning(f"Failed to copy Local Storage for {profile.browser}/{profile.name}: {e}")
            shutil.rmtree(tmp, ignore_errors=True)
            continue

        try:
            ldb = ccl_chromium_localstorage.LocalStoreDb(tmp_ls)
            for record in ldb.iter_all_records():
                try:
                    origin = record.storage_key or ""
                    key = record.script_key or ""

                    # Only the LinkedIn connections record is of interest.
                    if "linkedin" not in origin or key != "linkedin_assistant_profiles":
                        continue

                    payload = json.loads(record.value or "")
                    for url, person in payload.get("profiles", {}).items():
                        name = person.get("name", "")
                        title = person.get("title", "")
                        if not name:
                            continue
                        mem.upsert(
                            f"linkedin:{name}",
                            title or url,
                            ["contact", "work", "social"],
                            source="linkedin",
                        )
                        total += 1
                except Exception:
                    continue

        except Exception as e:
            log.warning(f"Failed to read Local Storage for {profile.browser}/{profile.name}: {e}")
        finally:
            shutil.rmtree(tmp, ignore_errors=True)

    log.info(f" Local Storage: {total} LinkedIn connections")
@@ -0,0 +1,46 @@
1
+ """Ingest account/email memories from Chromium Login Data."""
2
+
3
+ import shutil
4
+ import sqlite3
5
+ import logging
6
+
7
+ from ai_browser_profile.db import MemoryDB
8
+ from ai_browser_profile.ingestors.browser_detect import BrowserProfile, copy_db, domain
9
+
10
+ log = logging.getLogger(__name__)
11
+
12
+
13
def ingest_logins(mem: MemoryDB, profiles: list[BrowserProfile]):
    """Extract account and email memories from Chromium Login Data files.

    For every non-Safari, non-Firefox profile, a temporary copy of
    ``Login Data`` is queried for the 200 most-used logins with a non-empty
    username. Each one is upserted as ``account:<domain>``; usernames
    containing ``@`` are additionally upserted under the ``email`` key. Only
    the origin URL, username and usage count columns are read.

    A profile whose database cannot be read is logged and skipped.

    Args:
        mem: Memory database receiving the entries.
        profiles: Browser profiles to scan.
    """
    total = 0
    for profile in profiles:
        if profile.browser in ("safari", "firefox"):
            continue  # No Login Data SQLite for these

        tmp = copy_db(profile.path / "Login Data")
        if not tmp:
            continue
        try:
            conn = sqlite3.connect(f"file:{tmp}?mode=ro", uri=True)
            try:
                conn.row_factory = sqlite3.Row
                for row in conn.execute(
                    "SELECT origin_url, username_value, times_used FROM logins "
                    "WHERE username_value != '' ORDER BY times_used DESC LIMIT 200"
                ):
                    # NOTE(review): the history readers skip rows whose
                    # domain() result is falsy; here such a row is still
                    # stored — confirm that is intended.
                    d = domain(row["origin_url"])
                    username = row["username_value"]
                    mem.upsert(f"account:{d}", username,
                               ["account"], source=f"login:{d}")

                    if "@" in username:
                        mem.upsert("email", username, ["identity", "contact_info", "communication"],
                                   source=f"login:{d}")
                    total += 1
            finally:
                # Close even when the query or an upsert fails, so the handle
                # is released before the temporary copy is deleted.
                conn.close()
        except Exception as e:
            log.warning(f"Failed to read Login Data for {profile.browser}/{profile.name}: {e}")
        finally:
            shutil.rmtree(tmp.parent, ignore_errors=True)

    log.info(f" Logins: {total} account entries")