ai-browser-profile 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +118 -0
- package/ai_browser_profile/__init__.py +6 -0
- package/ai_browser_profile/db.py +929 -0
- package/ai_browser_profile/embeddings.py +196 -0
- package/ai_browser_profile/extract.py +108 -0
- package/ai_browser_profile/ingestors/__init__.py +0 -0
- package/ai_browser_profile/ingestors/bookmarks.py +185 -0
- package/ai_browser_profile/ingestors/browser_detect.py +100 -0
- package/ai_browser_profile/ingestors/constants.py +208 -0
- package/ai_browser_profile/ingestors/history.py +123 -0
- package/ai_browser_profile/ingestors/indexeddb.py +203 -0
- package/ai_browser_profile/ingestors/localstorage.py +66 -0
- package/ai_browser_profile/ingestors/logins.py +46 -0
- package/ai_browser_profile/ingestors/messages.py +151 -0
- package/ai_browser_profile/ingestors/notion.py +313 -0
- package/ai_browser_profile/ingestors/webdata.py +134 -0
- package/autofill/SKILL.md +252 -0
- package/bin/cli.js +315 -0
- package/clean.py +295 -0
- package/extract.py +53 -0
- package/package.json +40 -0
- package/review/SKILL.md +171 -0
- package/review/run.sh +82 -0
- package/setup/SKILL.md +177 -0
- package/skill/SKILL.md +180 -0
- package/whatsapp/SKILL.md +321 -0
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
"""Lookup maps and browser paths for memory extraction. Self-contained, no external imports."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
APP_SUPPORT = Path.home() / "Library" / "Application Support"
|
|
8
|
+
|
|
9
|
+
# Chromium address_type_tokens type codes -> (key_name, tags).
# Built from a compact table; every entry receives its own tag list.
ADDRESS_TYPE_MAP = {
    code: (key_name, list(tags))
    for code, key_name, tags in (
        (3, "first_name", ("identity",)),
        (5, "last_name", ("identity",)),
        (7, "full_name", ("identity",)),
        (9, "email", ("identity", "email", "communication")),
        (14, "phone", ("identity", "phone", "communication")),
        (33, "city", ("address", "location")),
        (34, "state", ("address", "location")),
        (35, "zip", ("address", "location")),
        (36, "country", ("address", "location")),
        (60, "company", ("identity", "company", "work")),
        (77, "street_address", ("address", "location")),
        (79, "address_line_2", ("address", "location")),
    )
}
|
|
24
|
+
|
|
25
|
+
# Autofill form field names -> (normalized key, tags)
# Used for normalization — unmapped fields are still ingested under cleaned names.
# Each row lists one normalized key, its tags, and every raw field name that
# maps to it; the comprehension expands rows into one entry per raw name and
# gives each entry its own tag list.
AUTOFILL_FIELD_MAP = {
    alias: (key, list(tags))
    for key, tags, aliases in (
        ("email", ("identity", "email", "communication"),
         ("email", "e-mail", "email_address", "emailaddress", "email-form-field")),
        ("full_name", ("identity",),
         ("name", "fullname", "full_name", "full-name")),
        ("first_name", ("identity",),
         ("firstname", "first_name", "first-name", "given-name")),
        ("last_name", ("identity",),
         ("lastname", "last_name", "last-name", "family-name")),
        ("phone", ("identity", "phone", "communication"),
         ("phone", "tel", "telephone", "mobile", "phonenumber", "phone_number",
          "mobilenumber")),
        ("city", ("address", "location"), ("city",)),
        ("state", ("address", "location"), ("state",)),
        ("zip", ("address", "location"),
         ("zip", "zipcode", "postal", "postalcode", "postal_code")),
        ("country", ("address", "location"), ("country",)),
        ("street_address", ("address", "location"),
         ("address", "street", "address1")),
        ("company", ("identity", "company", "work"),
         ("company", "companyname", "company_name", "organization")),
        ("username", ("identity", "account", "credential"),
         ("username", "login", "identifier")),
        ("date_of_birth", ("identity",),
         ("dob", "dateofbirth", "date_of_birth", "date-of-birth", "birth-date",
          "birthdate", "birthday")),
        ("gender", ("identity",), ("gender", "sex")),
    )
    for alias in aliases
}
|
|
80
|
+
|
|
81
|
+
# Keywords in field names → tags (for unmapped fields).
# A tag applies when any of its keywords occurs as a substring of the field name.
TAG_KEYWORDS = {
    "identity": (
        "birth dob gender sex name first last middle suffix "
        "passport nationality citizen ssn"
    ).split(),
    "travel": (
        "travel flyer frequent airline seat meal tsa "
        "known_traveler traveler passenger boarding flight loyalty"
    ).split(),
    "address": "address street city state zip postal country apt".split(),
    "payment": "card payment billing cvv expir".split(),
    "communication": "email phone tel mobile fax".split(),
    "account": "username login password account".split(),
    "work": "company organization employer job title occupation".split(),
}
|
|
93
|
+
|
|
94
|
+
# Regex for noise detection in field names
|
|
95
|
+
_UUID_RE = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-')
|
|
96
|
+
_PURE_DIGITS_RE = re.compile(r'^\d+$')
|
|
97
|
+
_TIMESTAMP_RE = re.compile(r'^\d{10,}$')
|
|
98
|
+
_SELECTOR_RE = re.compile(r'^(selectors\.|role:|#\d)')
|
|
99
|
+
_CELL_RE = re.compile(r'^cell[-_]\d+[-_]\d+', re.IGNORECASE)
|
|
100
|
+
_INTERNAL_RE = re.compile(r'^(react_aria|emoji_popover|_\drif_|docs_findandreplace|input\d+_\d+|single_line_text_form_component|single_typeahead_entity_form_component)')
|
|
101
|
+
_MONTH_YEAR_RE = re.compile(r'^(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s+\d{4}$', re.IGNORECASE)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def clean_field_name(raw: str) -> str:
    """Normalize a raw autofill field name to a clean key.

    Keeps only the last segment of a dotted path, removes array indices,
    leading numeric prefixes and trailing generated-ID suffixes, lowercases
    the result and replaces ``-`` and ``/`` with ``_``.

    Examples:
        'rtiTraveler.travelers[0].firstName' → 'firstname'
        'dateofbirth-mercury-utils-id-304' → 'dateofbirth'
        '1-date_of_birth' → 'date_of_birth'
        '0-1/firstName' → 'firstname'
        'BirthDayM' → 'birthdaym'
    """
    field = raw.strip()
    # Extract last segment of dotted paths
    if "." in field:
        field = field.rsplit(".", 1)[-1]
    # Strip array indices
    field = re.sub(r'\[\d+\]', '', field)
    # Strip leading numeric prefixes like "1-" or "0-1/". A single pattern is
    # used because stripping "0-" on its own first would leave "1/..." behind,
    # which the longer "0-1/" form could then no longer match.
    field = re.sub(r'^\d+-(?:\d+/)?', '', field)
    # Strip trailing ID suffixes like "-mercury-utils-id-304"
    field = re.sub(r'-[a-z]+-[a-z]+-[a-z]+-\d+$', '', field)
    # Lowercase
    field = field.lower().strip()
    # Replace separators with underscore for lookup
    return re.sub(r'[-/]', '_', field)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def is_noise_field(raw: str) -> bool:
    """Tell whether a raw field name is noise rather than a real form field.

    A name is noise when it matches, from its first character, any of the
    module's noise patterns (UUIDs, digit-only names, timestamps, selectors,
    spreadsheet cells, internal component names, "<month> <year>" labels).
    """
    noise_patterns = (
        _UUID_RE,
        _PURE_DIGITS_RE,
        _TIMESTAMP_RE,
        _SELECTOR_RE,
        _CELL_RE,
        _INTERNAL_RE,
        _MONTH_YEAR_RE,
    )
    # Patterns are tried in order; the first hit decides.
    return any(pattern.match(raw) for pattern in noise_patterns)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def infer_tags(field: str) -> list[str]:
    """Infer tags from a cleaned field name using keyword matching.

    A tag from TAG_KEYWORDS is included when at least one of its keywords is a
    substring of ``field``. Tags are returned in TAG_KEYWORDS order, so the
    result is the same on every run (a list built from a ``set`` of strings
    can change order between interpreter runs because of hash randomization).

    Returns:
        The matching tags, or ``["autofill"]`` when no keyword matches.
    """
    tags = [
        tag
        for tag, keywords in TAG_KEYWORDS.items()
        if any(kw in field for kw in keywords)
    ]
    return tags or ["autofill"]
|
|
155
|
+
|
|
156
|
+
# Domains -> friendly service names for tool/account detection.
# Each row names a service and every hostname that identifies it; the
# comprehension expands rows into one hostname -> service entry each.
SERVICE_NAMES = {
    host: service
    for service, hosts in (
        ("GitHub", ("github.com",)),
        ("GitLab", ("gitlab.com",)),
        ("Stack Overflow", ("stackoverflow.com",)),
        ("Figma", ("figma.com",)),
        ("Notion", ("notion.so",)),
        ("Trello", ("trello.com",)),
        ("Slack", ("slack.com", "app.slack.com")),
        ("Linear", ("linear.app",)),
        ("Vercel", ("vercel.com",)),
        ("Netlify", ("netlify.com",)),
        ("AWS", ("aws.amazon.com",)),
        ("GCP", ("console.cloud.google.com",)),
        ("Azure", ("portal.azure.com",)),
        ("ChatGPT", ("chatgpt.com", "chat.openai.com")),
        ("Claude", ("claude.ai",)),
        ("Anthropic Console", ("console.anthropic.com",)),
        ("Google Docs", ("docs.google.com",)),
        ("Google Sheets", ("sheets.google.com",)),
        ("Google Drive", ("drive.google.com",)),
        ("Gmail", ("mail.google.com",)),
        ("Google Calendar", ("calendar.google.com",)),
        ("Google Meet", ("meet.google.com",)),
        ("X/Twitter", ("twitter.com", "x.com")),
        ("LinkedIn", ("linkedin.com", "www.linkedin.com")),
        ("Instagram", ("instagram.com", "www.instagram.com")),
        ("Facebook", ("facebook.com", "www.facebook.com")),
        ("Reddit", ("reddit.com", "www.reddit.com")),
        ("YouTube", ("youtube.com", "www.youtube.com")),
        ("Spotify", ("open.spotify.com",)),
        ("Stripe", ("dashboard.stripe.com", "stripe.com")),
        ("Supabase", ("supabase.com",)),
        ("Firebase", ("firebase.google.com",)),
        ("Sentry", ("sentry.io",)),
        ("PostHog", ("posthog.com", "us.posthog.com")),
        ("Mixpanel", ("mixpanel.com",)),
        ("Apollo", ("app.apollo.io",)),
        ("QuickBooks", ("quickbooks.intuit.com",)),
        ("WhatsApp", ("web.whatsapp.com",)),
        ("Discord", ("discord.com",)),
        ("Microsoft Teams", ("teams.microsoft.com",)),
        ("Canva", ("canva.com", "www.canva.com")),
        ("Excalidraw", ("excalidraw.com",)),
        ("CodeSandbox", ("codesandbox.io",)),
        ("CodePen", ("codepen.io",)),
        ("Cal.com", ("app.cal.com",)),
        ("Calendly", ("calendly.com",)),
        ("OpenPhone", ("my.openphone.com",)),
        ("Missive", ("mail.missiveapp.com",)),
        ("Gusto", ("app.gusto.com",)),
        ("Coinbase", ("coinbase.com", "www.coinbase.com")),
        ("Polymarket", ("polymarket.com",)),
        ("Product Hunt", ("producthunt.com", "www.producthunt.com")),
        ("Upwork", ("upwork.com", "www.upwork.com")),
        ("Fiverr", ("fiverr.com", "www.fiverr.com")),
    )
    for host in hosts
}
|
|
201
|
+
|
|
202
|
+
# Chromium browser data directories, keyed by browser id and rooted at APP_SUPPORT
BROWSER_PATHS = {
    "arc": APP_SUPPORT.joinpath("Arc", "User Data"),
    "chrome": APP_SUPPORT.joinpath("Google", "Chrome"),
    "brave": APP_SUPPORT.joinpath("BraveSoftware", "Brave-Browser"),
    "edge": APP_SUPPORT.joinpath("Microsoft Edge"),
}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
"""Ingest tool/service memories from browser history."""
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
import sqlite3
|
|
5
|
+
import logging
|
|
6
|
+
from datetime import datetime, timedelta, timezone
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from ai_browser_profile.db import MemoryDB
|
|
10
|
+
from ai_browser_profile.ingestors.browser_detect import BrowserProfile, copy_db, domain
|
|
11
|
+
from ai_browser_profile.ingestors.constants import SERVICE_NAMES
|
|
12
|
+
|
|
13
|
+
log = logging.getLogger(__name__)
|
|
14
|
+
|
|
15
|
+
CHROME_EPOCH = datetime(1601, 1, 1, tzinfo=timezone.utc)
|
|
16
|
+
MACOS_EPOCH = datetime(2001, 1, 1, tzinfo=timezone.utc)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _read_visit_counts(db_file: Path, query: str, label: str) -> dict[str, int]:
    """Sum visit counts per domain from a copy of a browser history database.

    Args:
        db_file: Path of the browser's live history database; it is handed to
            ``copy_db`` and only the returned copy is opened.
        query: SQL yielding rows with ``url`` and ``visit_count`` columns.
        label: Text inserted into the warning logged when reading fails.

    Returns:
        Mapping of domain -> summed visit count. Empty when ``copy_db``
        returns a falsy value; possibly partial when reading fails midway.
    """
    counts: dict[str, int] = {}
    tmp = copy_db(db_file)
    if not tmp:
        return counts
    conn = None
    try:
        conn = sqlite3.connect(f"file:{tmp}?mode=ro", uri=True)
        conn.row_factory = sqlite3.Row
        for row in conn.execute(query):
            d = domain(row["url"])
            if d:
                # A NULL/zero visit_count still counts as one visit.
                counts[d] = counts.get(d, 0) + (row["visit_count"] or 1)
    except Exception as e:
        log.warning(f"Failed to read {label}: {e}")
    finally:
        # Close on the error path too, before the copy's directory is removed.
        if conn is not None:
            conn.close()
        shutil.rmtree(tmp.parent, ignore_errors=True)
    return counts


def _chromium_history(profile: BrowserProfile) -> dict[str, int]:
    """Read domain visit counts from Chromium History SQLite."""
    return _read_visit_counts(
        profile.path / "History",
        "SELECT url, visit_count FROM urls ORDER BY last_visit_time DESC LIMIT 10000",
        f"History for {profile.browser}/{profile.name}",
    )


def _safari_history(profile: BrowserProfile) -> dict[str, int]:
    """Read domain visit counts from Safari History.db."""
    return _read_visit_counts(
        profile.path / "History.db",
        "SELECT url, visit_count FROM history_items ORDER BY visit_count DESC LIMIT 10000",
        "Safari History",
    )


def _firefox_history(profile: BrowserProfile) -> dict[str, int]:
    """Read domain visit counts from Firefox places.sqlite."""
    return _read_visit_counts(
        profile.path / "places.sqlite",
        "SELECT url, visit_count FROM moz_places WHERE visit_count > 0 ORDER BY visit_count DESC LIMIT 10000",
        "Firefox places.sqlite",
    )
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def ingest_history(mem: MemoryDB, profiles: list[BrowserProfile]):
    """Extract tool/service memories from browser history across all profiles.

    Visit counts are summed per domain over every supported profile. The 200
    most-visited domains are then scanned, and each one listed in
    SERVICE_NAMES is upserted as ``tool:<service>`` with the visit total as
    its value and ``history:<domain>`` as its source.
    """
    # Browser id -> history reader; profiles of any other browser are skipped.
    readers = {
        "arc": _chromium_history,
        "chrome": _chromium_history,
        "brave": _chromium_history,
        "edge": _chromium_history,
        "safari": _safari_history,
        "firefox": _firefox_history,
    }
    # (services, extra tags); a service gets the tags of the first group it is in.
    categories = (
        (("GitHub", "GitLab", "Vercel", "Netlify", "Supabase", "Firebase", "CodeSandbox"),
         ("work", "dev")),
        (("Gmail", "Slack", "WhatsApp", "Discord", "Microsoft Teams", "Missive", "OpenPhone"),
         ("communication",)),
        (("LinkedIn", "X/Twitter", "Instagram", "Facebook", "Reddit", "YouTube", "Product Hunt"),
         ("social",)),
        (("Stripe", "QuickBooks", "Coinbase", "Gusto", "Polymarket"),
         ("finance",)),
        (("ChatGPT", "Claude", "Anthropic Console"),
         ("ai",)),
    )

    # Aggregate domain counts across all profiles
    totals: dict[str, int] = {}
    for profile in profiles:
        reader = readers.get(profile.browser)
        if reader is None:
            continue
        for host, visits in reader(profile).items():
            totals[host] = totals.get(host, 0) + visits

    # Convert to tool/service memories, most-visited domains first
    ranked = sorted(totals.items(), key=lambda pair: -pair[1])
    for host, total in ranked[:200]:
        if host not in SERVICE_NAMES:
            continue
        service = SERVICE_NAMES[host]
        tags = ["account", "tool"]
        for members, extra in categories:
            if service in members:
                tags.extend(extra)
                break
        # NOTE(review): several domains map to the same service name, so one
        # key can be upserted more than once here — confirm how upsert resolves it.
        mem.upsert(f"tool:{service}", str(total), tags, source=f"history:{host}")

    log.info(f" History: {len(totals)} domains, {sum(1 for d in totals if d in SERVICE_NAMES)} known services")
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
"""Ingest WhatsApp contacts from Chromium IndexedDB (LevelDB)."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import shutil
|
|
5
|
+
import tempfile
|
|
6
|
+
import logging
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from ai_browser_profile.db import MemoryDB
|
|
10
|
+
from ai_browser_profile.ingestors.browser_detect import BrowserProfile
|
|
11
|
+
|
|
12
|
+
log = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _copy_dir(src: Path) -> Path:
|
|
16
|
+
"""Copy a directory to temp to avoid browser locks."""
|
|
17
|
+
tmp = Path(tempfile.mkdtemp(prefix="ai_browser_profile_idb_"))
|
|
18
|
+
dst = tmp / src.name
|
|
19
|
+
shutil.copytree(src, dst)
|
|
20
|
+
return dst
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _serialize_value(val, depth=0):
|
|
24
|
+
"""Recursively convert ccl IndexedDB value to JSON-safe dict."""
|
|
25
|
+
if depth > 20:
|
|
26
|
+
return "<nested too deep>"
|
|
27
|
+
if val is None:
|
|
28
|
+
return None
|
|
29
|
+
if isinstance(val, (bool, int, float, str)):
|
|
30
|
+
return val
|
|
31
|
+
if isinstance(val, bytes):
|
|
32
|
+
try:
|
|
33
|
+
return val.decode("utf-8")
|
|
34
|
+
except UnicodeDecodeError:
|
|
35
|
+
return f"<binary {len(val)} bytes>"
|
|
36
|
+
if isinstance(val, dict):
|
|
37
|
+
return {str(k): _serialize_value(v, depth + 1) for k, v in val.items()}
|
|
38
|
+
if isinstance(val, (list, tuple)):
|
|
39
|
+
return [_serialize_value(v, depth + 1) for v in val]
|
|
40
|
+
if hasattr(val, "value"):
|
|
41
|
+
return _serialize_value(val.value, depth + 1)
|
|
42
|
+
return str(val)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _normalize_phone(raw: str) -> str:
|
|
46
|
+
"""Normalize a phone number to digits-only with leading +."""
|
|
47
|
+
digits = "".join(c for c in raw if c.isdigit())
|
|
48
|
+
if not digits:
|
|
49
|
+
return raw
|
|
50
|
+
if not digits.startswith("+"):
|
|
51
|
+
digits = "+" + digits
|
|
52
|
+
return digits
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _extract_phone(data: dict) -> str:
    """Return the normalized phone number of a WhatsApp contact record.

    The ``phoneNumber`` field is preferred. When it is missing or empty, the
    part of the ``id`` (JID) before ``@`` is used instead, which covers both
    @c.us and @s.whatsapp.net JIDs. Returns ``""`` when neither gives a value.
    """
    candidate = data.get("phoneNumber") or ""
    if not candidate:
        jid_text = str(data.get("id") or "")
        if "@" in jid_text:
            candidate = jid_text.split("@")[0]
    return _normalize_phone(candidate) if candidate else ""
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def ingest_indexeddb(mem: MemoryDB, profiles: list[BrowserProfile]):
    """Extract WhatsApp contacts from Chromium IndexedDB.

    Contacts are deduplicated by (name, normalized phone), falling back to
    the raw JID when no phone can be derived — WhatsApp stores each contact
    under both @c.us and @s.whatsapp.net JIDs, which previously inflated the
    database by ~44%.

    After upserting, WhatsApp rows that still hold a JID-format value are
    marked superseded (when a non-JID row with the same key exists) or
    deleted, and rows with known junk names/values are deleted.

    Safari and Firefox profiles are skipped. A database that cannot be copied
    or read is logged as a warning and skipped; the remaining databases and
    profiles are still processed.
    """
    from ccl_chromium_reader import ccl_chromium_indexeddb

    # Collect all contacts first, dedup by (name, phone-or-JID)
    seen: dict[tuple[str, str], dict] = {}  # (name, phone) -> {tags, value}

    for profile in profiles:
        if profile.browser in ("safari", "firefox"):
            continue

        idb_root = profile.path / "IndexedDB"
        if not idb_root.exists():
            continue

        for db_dir in sorted(idb_root.glob("*whatsapp*_0.indexeddb.leveldb")):
            blob_dir = db_dir.parent / db_dir.name.replace(".leveldb", ".blob")

            tmp_db = None
            tmp_blob = None
            try:
                # Copy inside the try so that a failed copy is logged and
                # cleaned up like any other read failure instead of aborting
                # the whole ingestion.
                tmp_db = _copy_dir(db_dir)
                if blob_dir.exists():
                    tmp_blob = _copy_dir(blob_dir)

                wrapper = ccl_chromium_indexeddb.WrappedIndexDB(
                    str(tmp_db),
                    str(tmp_blob) if tmp_blob else None,
                )

                for db_id in wrapper.database_ids:
                    try:
                        db = wrapper[db_id.name, db_id.origin]
                    except Exception:
                        # Best effort: skip databases that cannot be opened.
                        log.debug("Skipping unreadable IndexedDB database", exc_info=True)
                        continue

                    if "contact" not in db.object_store_names:
                        continue

                    for record in db["contact"].iterate_records():
                        try:
                            data = _serialize_value(record.value)
                            if not isinstance(data, dict):
                                continue

                            name = data.get("name") or data.get("pushname") or data.get("verifiedName") or ""
                            if not name:
                                continue

                            # Skip junk names
                            stripped = name.strip()
                            if not stripped or stripped == "." or stripped == "<Undefined>":
                                continue
                            # Skip emoji-only names (no alphanumeric chars)
                            if not any(c.isalnum() for c in stripped):
                                continue

                            phone = _extract_phone(data)
                            jid = data.get("id") or ""

                            tags = ["contact", "communication"]
                            if data.get("isBusiness") or data.get("isEnterprise"):
                                tags.append("work")

                            value = phone if phone else str(jid)
                            dedup_key = (name, value)

                            if dedup_key not in seen:
                                seen[dedup_key] = {"tags": tags, "value": value}
                            else:
                                # Merge tags (e.g. one record has "work", the other doesn't)
                                for t in tags:
                                    if t not in seen[dedup_key]["tags"]:
                                        seen[dedup_key]["tags"].append(t)

                        except Exception:
                            # Best effort: one malformed record must not stop the scan.
                            log.debug("Skipping malformed WhatsApp contact record", exc_info=True)
                            continue

            except Exception as e:
                log.warning(f"Failed to read WhatsApp IndexedDB for {profile.browser}/{profile.name}: {e}")
            finally:
                # Each copy lives in its own temp directory; remove whichever were created.
                for copied in (tmp_db, tmp_blob):
                    if copied is not None:
                        shutil.rmtree(copied.parent, ignore_errors=True)

    # Upsert deduplicated contacts
    for (name, _phone), entry in seen.items():
        mem.upsert(f"contact:{name}", entry["value"], entry["tags"], source="whatsapp")

    # Clean up old JID-format entries (@c.us, @s.whatsapp.net) that now have normalized phone values
    old_jid_rows = mem.conn.execute("""
        SELECT id, key, value FROM memories
        WHERE source = 'whatsapp'
        AND (value LIKE '%@c.us' OR value LIKE '%@s.whatsapp.net')
        AND superseded_by IS NULL
    """).fetchall()

    cleaned = 0
    for old_id, old_key, _old_value in old_jid_rows:
        # Check if a normalized version exists for the same contact name
        normalized = mem.conn.execute("""
            SELECT id FROM memories
            WHERE key = ? AND source = 'whatsapp'
            AND value NOT LIKE '%@c.us' AND value NOT LIKE '%@s.whatsapp.net'
            AND superseded_by IS NULL
            LIMIT 1
        """, (old_key,)).fetchone()
        if normalized:
            mem.conn.execute(
                "UPDATE memories SET superseded_by = ? WHERE id = ?",
                (normalized[0], old_id),
            )
        else:
            # No normalized version — delete the junk entry entirely
            mem.conn.execute("DELETE FROM memories WHERE id = ?", (old_id,))
        cleaned += 1

    if cleaned:
        mem.conn.commit()
        log.info(f" Cleaned {cleaned} old JID-format WhatsApp entries")

    # Also clean up junk names that slipped through previous extractions
    junk = mem.conn.execute("""
        DELETE FROM memories
        WHERE source = 'whatsapp' AND superseded_by IS NULL
        AND (value = '<Undefined>' OR key = 'contact:.' OR key = 'contact:<Undefined>')
    """).rowcount
    if junk:
        mem.conn.commit()
        log.info(f" Deleted {junk} junk WhatsApp entries")

    log.info(f" IndexedDB: {len(seen)} WhatsApp contacts (deduplicated)")
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""Ingest LinkedIn connections from Chromium Local Storage (LevelDB)."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import shutil
|
|
5
|
+
import tempfile
|
|
6
|
+
import logging
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from ai_browser_profile.db import MemoryDB
|
|
10
|
+
from ai_browser_profile.ingestors.browser_detect import BrowserProfile
|
|
11
|
+
|
|
12
|
+
log = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def ingest_localstorage(mem: MemoryDB, profiles: list[BrowserProfile]):
    """Extract LinkedIn connections from Chromium Local Storage.

    For every profile other than Safari/Firefox, the ``Local Storage/leveldb``
    directory is copied to a temporary location and scanned for the
    ``linkedin_assistant_profiles`` key on a LinkedIn origin. Each named
    profile in that JSON payload is upserted as ``linkedin:<name>``, with the
    title as its value (or the profile URL when there is no title).
    """
    from ccl_chromium_reader import ccl_chromium_localstorage

    total = 0
    for profile in profiles:
        if profile.browser in ("safari", "firefox"):
            continue

        ls_dir = profile.path / "Local Storage" / "leveldb"
        if not ls_dir.exists():
            continue

        # Read from a private snapshot rather than the browser's live files.
        staging = Path(tempfile.mkdtemp(prefix="ai_browser_profile_ls_"))
        snapshot = staging / "leveldb"
        try:
            shutil.copytree(ls_dir, snapshot)
        except Exception as e:
            log.warning(f"Failed to copy Local Storage for {profile.browser}/{profile.name}: {e}")
            shutil.rmtree(staging, ignore_errors=True)
            continue

        try:
            store = ccl_chromium_localstorage.LocalStoreDb(snapshot)
            for record in store.iter_all_records():
                try:
                    origin = record.storage_key or ""
                    key = record.script_key or ""

                    # Only the LinkedIn connections record is of interest
                    if "linkedin" not in origin or key != "linkedin_assistant_profiles":
                        continue

                    payload = json.loads(record.value or "")
                    for url, person in payload.get("profiles", {}).items():
                        name = person.get("name", "")
                        if not name:
                            continue
                        title = person.get("title", "")
                        mem.upsert(
                            f"linkedin:{name}",
                            title if title else url,
                            ["contact", "work", "social"],
                            source="linkedin",
                        )
                        total += 1
                except Exception:
                    # Unparseable record: move on to the next one.
                    continue

        except Exception as e:
            log.warning(f"Failed to read Local Storage for {profile.browser}/{profile.name}: {e}")
        finally:
            shutil.rmtree(staging, ignore_errors=True)

    log.info(f" Local Storage: {total} LinkedIn connections")
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Ingest account/email memories from Chromium Login Data."""
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
import sqlite3
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
from ai_browser_profile.db import MemoryDB
|
|
8
|
+
from ai_browser_profile.ingestors.browser_detect import BrowserProfile, copy_db, domain
|
|
9
|
+
|
|
10
|
+
log = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def ingest_logins(mem: MemoryDB, profiles: list[BrowserProfile]):
    """Extract account and email memories from Chromium Login Data files.

    For each profile other than Safari/Firefox, up to 200 saved logins with a
    non-empty username (most used first) are read from a copy of the profile's
    ``Login Data`` database. Every row is upserted as ``account:<domain>``;
    usernames containing ``@`` are additionally upserted under ``email``.
    Only the origin URL and username columns are stored.
    """
    total = 0
    for profile in profiles:
        if profile.browser in ("safari", "firefox"):
            continue  # No Login Data SQLite for these

        tmp = copy_db(profile.path / "Login Data")
        if not tmp:
            continue
        conn = None
        try:
            conn = sqlite3.connect(f"file:{tmp}?mode=ro", uri=True)
            conn.row_factory = sqlite3.Row
            for row in conn.execute(
                "SELECT origin_url, username_value, times_used FROM logins "
                "WHERE username_value != '' ORDER BY times_used DESC LIMIT 200"
            ):
                d = domain(row["origin_url"])
                username = row["username_value"]
                mem.upsert(f"account:{d}", username,
                           ["account"], source=f"login:{d}")

                if "@" in username:
                    mem.upsert("email", username, ["identity", "contact_info", "communication"],
                               source=f"login:{d}")
                total += 1
        except Exception as e:
            log.warning(f"Failed to read Login Data for {profile.browser}/{profile.name}: {e}")
        finally:
            # Close on the error path too, before the copy's directory is removed.
            if conn is not None:
                conn.close()
            shutil.rmtree(tmp.parent, ignore_errors=True)

    log.info(f" Logins: {total} account entries")
|