ai-browser-profile 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +118 -0
- package/ai_browser_profile/__init__.py +6 -0
- package/ai_browser_profile/db.py +929 -0
- package/ai_browser_profile/embeddings.py +196 -0
- package/ai_browser_profile/extract.py +108 -0
- package/ai_browser_profile/ingestors/__init__.py +0 -0
- package/ai_browser_profile/ingestors/bookmarks.py +185 -0
- package/ai_browser_profile/ingestors/browser_detect.py +100 -0
- package/ai_browser_profile/ingestors/constants.py +208 -0
- package/ai_browser_profile/ingestors/history.py +123 -0
- package/ai_browser_profile/ingestors/indexeddb.py +203 -0
- package/ai_browser_profile/ingestors/localstorage.py +66 -0
- package/ai_browser_profile/ingestors/logins.py +46 -0
- package/ai_browser_profile/ingestors/messages.py +151 -0
- package/ai_browser_profile/ingestors/notion.py +313 -0
- package/ai_browser_profile/ingestors/webdata.py +134 -0
- package/autofill/SKILL.md +252 -0
- package/bin/cli.js +315 -0
- package/clean.py +295 -0
- package/extract.py +53 -0
- package/package.json +40 -0
- package/review/SKILL.md +171 -0
- package/review/run.sh +82 -0
- package/setup/SKILL.md +177 -0
- package/skill/SKILL.md +180 -0
- package/whatsapp/SKILL.md +321 -0
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""Store and query decrypted WhatsApp messages."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import logging
|
|
5
|
+
import sqlite3
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from typing import Optional
|
|
8
|
+
|
|
9
|
+
from ai_browser_profile.db import MemoryDB
|
|
10
|
+
|
|
11
|
+
log = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
# DDL for the message store. Every statement uses IF NOT EXISTS so the script
# can be executed on each call. The two indexes cover the columns that
# get_messages() filters on (sender) and sorts by (captured_at).
MESSAGES_SCHEMA = """
CREATE TABLE IF NOT EXISTS messages (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    sender TEXT,
    sender_name TEXT,
    text TEXT NOT NULL,
    captured_at TEXT,
    source TEXT DEFAULT 'whatsapp'
);
CREATE INDEX IF NOT EXISTS idx_messages_sender ON messages(sender);
CREATE INDEX IF NOT EXISTS idx_messages_captured ON messages(captured_at);
"""
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _ensure_table(conn: sqlite3.Connection):
    """Run the messages DDL on *conn*.

    The script only contains ``IF NOT EXISTS`` statements, so calling this
    repeatedly on the same database is harmless.
    """
    conn.executescript(MESSAGES_SCHEMA)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _resolve_sender(mem: MemoryDB, sender_jid: Optional[str]) -> Optional[str]:
    """Look up a contact name for a sender JID using existing memories.

    The part of the JID before ``@`` is matched as a substring against the
    value of live (``superseded_by IS NULL``) ``contact:*`` memories.

    Args:
        mem: MemoryDB instance; only its ``conn`` attribute is used.
        sender_jid: Sender identifier of the form ``<id>@<server>``, or None.

    Returns:
        The matching memory key with its leading ``contact:`` prefix removed,
        or None when the JID is empty, has no user part, nothing matches, or
        the lookup fails.
    """
    if not sender_jid:
        return None

    user_part = sender_jid.split("@", 1)[0]
    if not user_part:
        # An empty user part would turn the pattern into '%%', which matches
        # every contact and would attribute the message to an arbitrary one.
        return None

    try:
        row = mem.conn.execute(
            "SELECT key FROM memories WHERE key LIKE 'contact:%' "
            "AND value LIKE ? AND superseded_by IS NULL LIMIT 1",
            (f"%{user_part}%",),
        ).fetchone()
    except Exception:
        # Name resolution is best-effort enrichment; a failed lookup must not
        # abort message ingestion, but it should not vanish silently either.
        log.debug("Contact lookup failed", exc_info=True)
        return None

    if not row:
        return None
    # Strip only the leading prefix; a contact name may itself contain the
    # text "contact:".
    return row[0].removeprefix("contact:")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def ingest_messages(mem: MemoryDB, messages: list[dict]) -> int:
    """Store decrypted messages into the messages table, deduplicating.

    A message counts as a duplicate when its (sender, first 100 characters of
    the stored text) pair is already in the table or appeared earlier in
    *messages*.

    Args:
        mem: MemoryDB instance (uses its connection).
        messages: List of dicts with keys: text, sender (JID or None),
            ts (epoch ms). Entries whose text is missing, None or blank are
            skipped; a missing/falsy ts falls back to the ingestion time.

    Returns:
        Number of new messages inserted.
    """
    _ensure_table(mem.conn)
    now = datetime.now(timezone.utc).isoformat()

    # Seed the dedup set with what is already stored.
    existing = set()
    try:
        existing.update(
            (row[0], row[1])
            for row in mem.conn.execute("SELECT sender, substr(text, 1, 100) FROM messages")
        )
    except sqlite3.OperationalError:
        # Best-effort: without the seed we only dedup within this batch.
        pass

    # One contact lookup per distinct sender instead of one per message.
    sender_names: dict = {}
    inserted = 0

    for m in messages:
        text = (m.get("text") or "").strip()
        if not text:
            continue

        # Fix mojibake (Latin-1 encoded UTF-8). This must happen BEFORE the
        # dedup key is built: the table holds the repaired text, so a key made
        # from the raw text would never match and the message would be
        # inserted again on every run.
        try:
            text = text.encode("latin-1").decode("utf-8")
        except (UnicodeDecodeError, UnicodeEncodeError):
            pass

        sender = m.get("sender")
        dedup_key = (sender, text[:100])
        if dedup_key in existing:
            continue
        existing.add(dedup_key)

        if sender not in sender_names:
            sender_names[sender] = _resolve_sender(mem, sender)
        sender_name = sender_names[sender]

        ts = m.get("ts")
        captured_at = datetime.fromtimestamp(ts / 1000, tz=timezone.utc).isoformat() if ts else now

        mem.conn.execute(
            "INSERT INTO messages (sender, sender_name, text, captured_at, source) VALUES (?, ?, ?, ?, ?)",
            (sender, sender_name, text, captured_at, "whatsapp"),
        )
        inserted += 1

    mem.conn.commit()
    log.info(f" Messages: {inserted} new, {len(existing)} total")
    return inserted
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def get_messages(mem: MemoryDB, sender: Optional[str] = None,
                 search: Optional[str] = None, limit: int = 100) -> list[dict]:
    """Query stored messages, newest first.

    Args:
        mem: MemoryDB instance (uses its connection).
        sender: Substring matched (SQL LIKE) against sender JID or sender name.
        search: Substring matched (SQL LIKE) against the message text.
        limit: Max results.

    Returns:
        One dict per message with keys id, sender, sender_name, text,
        captured_at and source.
    """
    _ensure_table(mem.conn)

    sql_parts = ["SELECT id, sender, sender_name, text, captured_at, source FROM messages WHERE 1=1"]
    bind = []

    if sender:
        sender_pattern = f"%{sender}%"
        sql_parts.append(" AND (sender LIKE ? OR sender_name LIKE ?)")
        bind += [sender_pattern, sender_pattern]

    if search:
        sql_parts.append(" AND text LIKE ?")
        bind.append(f"%{search}%")

    sql_parts.append(" ORDER BY captured_at DESC LIMIT ?")
    bind.append(limit)

    columns = ("id", "sender", "sender_name", "text", "captured_at", "source")
    records = mem.conn.execute("".join(sql_parts), bind).fetchall()
    return [{name: record[i] for i, name in enumerate(columns)} for record in records]
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def message_stats(mem: MemoryDB) -> dict:
    """Summarise the messages table.

    Returns:
        ``total_messages``: row count of the table.
        ``top_senders``: message count for the 20 most active senders, keyed
        by sender name, falling back to the raw sender and then 'unknown'.
    """
    _ensure_table(mem.conn)
    db = mem.conn

    message_count = db.execute("SELECT COUNT(*) FROM messages").fetchone()[0]
    per_sender = db.execute(
        "SELECT COALESCE(sender_name, sender, 'unknown'), COUNT(*) FROM messages "
        "GROUP BY COALESCE(sender_name, sender, 'unknown') ORDER BY COUNT(*) DESC LIMIT 20"
    ).fetchall()

    stats = {"total_messages": message_count}
    stats["top_senders"] = dict((entry[0], entry[1]) for entry in per_sender)
    return stats
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
"""Notion desktop app ingestor — extracts memories from notion.db SQLite mirror.
|
|
2
|
+
|
|
3
|
+
Tier 1 (heuristic, no LLM): workspace info, users as contacts, page titles.
|
|
4
|
+
Tier 2 (LLM via claude -p): handled by run.sh, uses dump_changed_pages() to
|
|
5
|
+
reconstruct pages and pass to a Claude Code session for extraction.
|
|
6
|
+
|
|
7
|
+
Sync tracking: notion_last_sync_ts in MemoryDB metadata.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
import logging
|
|
12
|
+
import shutil
|
|
13
|
+
import sqlite3
|
|
14
|
+
import tempfile
|
|
15
|
+
from collections import deque
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Optional
|
|
18
|
+
|
|
19
|
+
from ai_browser_profile.db import MemoryDB
|
|
20
|
+
|
|
21
|
+
log = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
# Location of the Notion desktop app's local SQLite mirror (macOS layout).
NOTION_DB = Path.home().joinpath("Library", "Application Support", "Notion", "notion.db")

# Block type → markdown prefix mapping
BLOCK_MD = {
    "header": "#",
    "sub_header": "##",
    "sub_sub_header": "###",
    "bulleted_list": "-",
    "numbered_list": "1.",
    "to_do": "- [ ]",
    "quote": ">",
    "callout": ">",
    "toggle": "-",
    "text": "",
    "code": "```",
}

# Block types to skip entirely
SKIP_TYPES = {
    "page",
    "collection_view_page",
    "collection_view",
    "image",
    "video",
    "bookmark",
    "divider",
    "table_of_contents",
    "column_list",
    "column",
    "table",
    "table_row",
    "external_object_instance",
    "ai_block",
    "personal_home_page",
}

# Valid key prefixes for LLM extraction
VALID_PREFIXES = {
    "contact",
    "project",
    "business",
    "work",
    "relationship",
    "product",
    "company",
    "activity",
}

# Valid tags for LLM extraction
VALID_TAGS = {"contact", "work", "knowledge", "finance", "tool"}

# Upper bound on blocks visited while reconstructing a single page.
MAX_BLOCKS_PER_PAGE = 500
# Upper bound on characters of one reconstructed page handed to the LLM.
MAX_CHARS_TO_LLM = 8000
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _copy_notion_db() -> Optional[Path]:
    """Copy notion.db + WAL/SHM to a fresh temp dir (same pattern as copy_db).

    Returns:
        Path of the copied database file, or None when NOTION_DB does not
        exist. The caller owns the copy and is expected to delete the
        returned file's parent directory when done.

    Raises:
        OSError: if a copy fails. The temp directory is removed before the
            error propagates, so a failed copy leaves nothing behind.
    """
    if not NOTION_DB.exists():
        return None

    tmp = Path(tempfile.mkdtemp(prefix="ai_browser_profile_notion_"))
    try:
        dst = tmp / NOTION_DB.name
        shutil.copy2(NOTION_DB, dst)
        for suffix in ("-wal", "-shm"):
            sidecar = NOTION_DB.parent / (NOTION_DB.name + suffix)
            try:
                shutil.copy2(sidecar, tmp / sidecar.name)
            except FileNotFoundError:
                # The sidecar is optional and may disappear between a check
                # and the copy, so just attempt the copy and skip if absent.
                continue
    except BaseException:
        shutil.rmtree(tmp, ignore_errors=True)
        raise
    return dst
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _extract_title(properties_json: Optional[str]) -> str:
    """Parse Notion rich-text title from properties JSON.

    Format: {"title": [["plain text"], ["bold text", [["b"]]], ...]}

    Only the first element of each segment (the text) is used; formatting
    annotations are ignored.

    Args:
        properties_json: Raw JSON text of a block's properties, or None.

    Returns:
        The concatenated, stripped title text. An empty string is returned
        when the input is empty, is not valid JSON, is not a JSON object, or
        has no usable ``title`` list.
    """
    if not properties_json:
        return ""
    try:
        props = json.loads(properties_json)
    except (json.JSONDecodeError, TypeError):
        return ""

    # Valid JSON is not necessarily an object ("[]", "42", '"x"' all parse).
    if not isinstance(props, dict):
        return ""

    title_parts = props.get("title")
    if not isinstance(title_parts, list):
        return ""

    return "".join(
        part[0]
        for part in title_parts
        if isinstance(part, list) and part and isinstance(part[0], str)
    ).strip()
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _block_to_markdown(block_type: str, properties_json: Optional[str], depth: int = 0) -> Optional[str]:
    """Render one Notion block as markdown.

    Args:
        block_type: Notion block type name.
        properties_json: Raw JSON text of the block's properties, or None.
        depth: Nesting level; the output is indented by one space per level.

    Returns:
        The markdown text, or None when the type is in SKIP_TYPES or the
        block has no title text.
    """
    if block_type in SKIP_TYPES:
        return None

    text = _extract_title(properties_json)
    if not text:
        return None

    pad = " " * depth

    # Code blocks are fenced instead of prefixed.
    if block_type == "code":
        return f"{pad}```\n{pad}{text}\n{pad}```"

    marker = BLOCK_MD.get(block_type, "")

    # A to_do whose "checked" property starts with "Yes" renders as ticked.
    if block_type == "to_do":
        try:
            parsed = json.loads(properties_json) if properties_json else {}
            state = parsed.get("checked", [])
            if state and state[0][0] == "Yes":
                marker = "- [x]"
        except (json.JSONDecodeError, TypeError, IndexError):
            pass

    return f"{pad}{marker} {text}" if marker else f"{pad}{text}"
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _reconstruct_page(conn: sqlite3.Connection, page_id: str, page_title: str) -> str:
    """Walk a page's descendant blocks via parent_id and assemble markdown.

    Blocks are emitted in pre-order: each block is followed by its own
    children before its next sibling, so an indented child line appears
    directly under its parent rather than after all of the parent's siblings.
    At most MAX_BLOCKS_PER_PAGE blocks are visited, counting blocks that
    render to nothing.

    Args:
        conn: Connection to the Notion DB copy. Rows are read by column name,
            so its row_factory must support that (e.g. sqlite3.Row).
        page_id: id of the page block whose children are rendered.
        page_title: Title used for the leading ``# `` heading.

    Returns:
        The markdown document as a single string.
    """
    # NOTE(review): siblings are ordered by rowid; whether that matches the
    # order shown in Notion cannot be told from this file — confirm.
    child_sql = (
        "SELECT id, type, properties FROM block "
        "WHERE parent_id = ? AND alive = 1 ORDER BY rowid"
    )

    def children_of(parent_id, depth):
        """Return (block_id, type, properties, depth) for each live child."""
        return [
            (row["id"], row["type"], row["properties"], depth)
            for row in conn.execute(child_sql, (parent_id,)).fetchall()
        ]

    lines = [f"# {page_title}", ""]

    # Pending blocks as (block_id, type, properties, depth); the leftmost
    # entry is the next block in document order.
    pending = deque(children_of(page_id, 0))
    block_count = 0

    while pending and block_count < MAX_BLOCKS_PER_PAGE:
        block_id, btype, props, depth = pending.popleft()
        block_count += 1

        md_line = _block_to_markdown(btype, props, depth)
        if md_line:
            lines.append(md_line)

        # Children go to the FRONT so they are rendered before the remaining
        # siblings. extendleft() inserts in reverse, hence the reversed().
        pending.extendleft(reversed(children_of(block_id, depth + 1)))

    return "\n".join(lines)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _ingest_workspace(mem: MemoryDB, conn: sqlite3.Connection) -> int:
    """Tier 1: space table → company: memories.

    Every non-empty workspace name is upserted under the single key
    ``company:notion_workspace``.

    Returns:
        Number of workspace rows upserted.
    """
    workspace_names = [
        record["name"]
        for record in conn.execute("SELECT name FROM space WHERE name IS NOT NULL AND name != ''")
    ]
    for workspace_name in workspace_names:
        mem.upsert(
            "company:notion_workspace",
            workspace_name,
            tags=["work"],
            source="notion:workspace",
        )
    return len(workspace_names)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _ingest_users(mem: MemoryDB, conn: sqlite3.Connection) -> int:
    """Tier 1: notion_user → contact:Name memories.

    Each user with a non-blank name and email is upserted as
    ``contact:<name>`` with the email as value.

    Returns:
        Number of users upserted.
    """
    saved = 0
    cursor = conn.execute(
        "SELECT name, email, given_name, family_name FROM notion_user "
        "WHERE name IS NOT NULL AND name != '' AND email IS NOT NULL AND email != ''"
    )
    for record in cursor:
        display_name = record["name"].strip()
        address = record["email"].strip()
        # The SQL filter lets whitespace-only values through; drop them here.
        if display_name and address:
            mem.upsert(
                f"contact:{display_name}",
                address,
                tags=["contact", "work"],
                source="notion:user",
            )
            saved += 1
    return saved
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def _ingest_page_titles(mem: MemoryDB, conn: sqlite3.Connection) -> int:
    """Tier 1: page/transcription titles → project:/activity: memories.

    Only live blocks whose parent_table is 'space' are considered. Titles
    shorter than 3 characters are ignored; the key uses at most the first 80
    characters of the title while the value is the full title.

    Returns:
        Number of titles upserted.
    """
    stored = 0
    query = (
        "SELECT id, type, properties FROM block "
        "WHERE type IN ('page', 'transcription') AND alive = 1 "
        "AND properties IS NOT NULL AND parent_table = 'space'"
    )
    for block in conn.execute(query):
        title = _extract_title(block["properties"])
        if not title or len(title) < 3:
            continue

        short = title[:80]
        if block["type"] == "transcription":
            key, tags, source = f"activity:meeting:{short}", ["work", "contact"], "notion:transcription"
        else:
            key, tags, source = f"project:{short}", ["work", "knowledge"], "notion:page"

        mem.upsert(key, title, tags=tags, source=source)
        stored += 1
    return stored
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def dump_changed_pages(mem: MemoryDB, limit: int = 50, min_blocks: int = 5) -> str:
    """Reconstruct changed Notion pages as markdown for LLM extraction.

    Returns concatenated markdown of pages changed since last sync,
    filtered to pages with at least min_blocks child blocks.
    Used by run.sh to pass content to a claude -p session.

    Args:
        mem: MemoryDB instance; its ``notion_last_sync_ts`` metadata value is
            the lower bound for ``last_edited_time``.
        limit: Maximum number of pages in the result.
        min_blocks: Minimum number of live direct children a page must have.

    Returns:
        Page markdown joined by ``---`` separators, each page truncated to
        MAX_CHARS_TO_LLM characters; an empty string when the Notion DB is
        missing or nothing qualifies.
    """
    tmp = _copy_notion_db()
    if not tmp:
        return ""

    conn = None
    try:
        conn = sqlite3.connect(f"file:{tmp}?mode=ro", uri=True)
        conn.row_factory = sqlite3.Row

        since_ts = float(mem.get_meta("notion_last_sync_ts") or "0")

        rows = conn.execute(
            "SELECT id, type, properties, last_edited_time FROM block "
            "WHERE type IN ('page', 'transcription') AND alive = 1 "
            "AND properties IS NOT NULL AND last_edited_time > ? "
            "ORDER BY last_edited_time DESC LIMIT ?",
            (since_ts, limit * 3),  # fetch extra to filter by block count
        ).fetchall()

        pages = []
        for row in rows:
            title = _extract_title(row["properties"])
            if not title or len(title) < 3:
                continue
            child_count = conn.execute(
                "SELECT COUNT(*) FROM block WHERE parent_id = ? AND alive = 1",
                (row["id"],)
            ).fetchone()[0]
            if child_count < min_blocks:
                continue
            page_md = _reconstruct_page(conn, row["id"], title)
            # Pages that reduce to little more than their heading are dropped.
            if len(page_md) > 50:
                pages.append(page_md[:MAX_CHARS_TO_LLM])
                if len(pages) >= limit:
                    break

        return "\n\n---\n\n".join(pages)
    finally:
        # Close on every path (including errors) before deleting the copy;
        # an open handle would otherwise outlive the temp directory cleanup.
        if conn is not None:
            conn.close()
        shutil.rmtree(tmp.parent, ignore_errors=True)
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def ingest_notion(mem: MemoryDB):
    """Main entry point. Copies DB, runs Tier 1, updates high-water mark.

    The high-water mark (``notion_last_sync_ts`` metadata) is set to the
    largest ``last_edited_time`` found in the block table, and only after all
    Tier 1 steps succeeded.

    Args:
        mem: MemoryDB instance that receives the memories and metadata.

    Raises:
        Exception: any error from the ingestion steps is logged as a warning
            and re-raised.
    """
    tmp = _copy_notion_db()
    if not tmp:
        log.warning("Notion DB not found — skipping Notion ingestor")
        return

    conn = None
    try:
        conn = sqlite3.connect(f"file:{tmp}?mode=ro", uri=True)
        conn.row_factory = sqlite3.Row

        # Get max last_edited_time for new high-water mark
        row = conn.execute(
            "SELECT MAX(last_edited_time) as max_ts FROM block"
        ).fetchone()
        new_ts = row["max_ts"] if row and row["max_ts"] else 0

        # Tier 1: always runs
        ws_count = _ingest_workspace(mem, conn)
        user_count = _ingest_users(mem, conn)
        title_count = _ingest_page_titles(mem, conn)
        log.info(
            f"Notion Tier 1: {ws_count} workspaces, {user_count} users, "
            f"{title_count} page titles"
        )

        # Update high-water mark
        if new_ts:
            mem.set_meta("notion_last_sync_ts", str(new_ts))
    except Exception as e:
        log.warning(f"Notion ingestor error: {e}")
        raise
    finally:
        # Close on every path (including errors) before deleting the copy.
        if conn is not None:
            conn.close()
        shutil.rmtree(tmp.parent, ignore_errors=True)
|
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
"""Ingest memories directly from Chromium Web Data files (address profiles, autofill, cards)."""
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
import sqlite3
|
|
5
|
+
import tempfile
|
|
6
|
+
import logging
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Optional
|
|
9
|
+
|
|
10
|
+
from ai_browser_profile.db import MemoryDB
|
|
11
|
+
from ai_browser_profile.ingestors.constants import (
|
|
12
|
+
ADDRESS_TYPE_MAP, AUTOFILL_FIELD_MAP, BROWSER_PATHS,
|
|
13
|
+
clean_field_name, is_noise_field, infer_tags,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
log = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _copy_db(src: Path) -> Optional[Path]:
    """Copy a SQLite DB to temp dir to avoid browser locks.

    The ``-wal`` and ``-shm`` sidecar files are copied alongside when present.

    Args:
        src: Path of the database file to copy.

    Returns:
        Path of the copy inside a fresh temp directory, or None when *src*
        does not exist. The caller owns the copy and is expected to delete
        the returned file's parent directory when done.

    Raises:
        OSError: if a copy fails. The temp directory is removed before the
            error propagates, so a failed copy leaves nothing behind.
    """
    if not src.exists():
        return None

    tmp = Path(tempfile.mkdtemp(prefix="ai_browser_profile_"))
    try:
        dst = tmp / src.name
        shutil.copy2(src, dst)
        for suffix in ("-wal", "-shm"):
            sidecar = src.parent / (src.name + suffix)
            try:
                shutil.copy2(sidecar, tmp / sidecar.name)
            except FileNotFoundError:
                # The sidecar is optional and may disappear between a check
                # and the copy, so just attempt the copy and skip if absent.
                continue
    except BaseException:
        shutil.rmtree(tmp, ignore_errors=True)
        raise
    return dst
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _ingest_address_tokens(mem: MemoryDB, conn: sqlite3.Connection, source: str) -> None:
    """Upsert every non-empty address_type_tokens row (all type codes)."""
    for row in conn.execute("SELECT guid, type, value FROM address_type_tokens WHERE value != ''"):
        type_code = row["type"]
        if type_code in ADDRESS_TYPE_MAP:
            key_name, tags = ADDRESS_TYPE_MAP[type_code]
        else:
            # Unmapped codes are kept under a generic key instead of dropped.
            key_name = f"address_type_{type_code}"
            tags = ["address"]
        mem.upsert(key_name, row["value"], tags, source=source)


def _ingest_autofill(mem: MemoryDB, conn: sqlite3.Connection, source: str) -> None:
    """Upsert form autofill entries (ALL fields, not just mapped ones), minus noise."""
    for row in conn.execute("SELECT name, value, count FROM autofill WHERE value != '' ORDER BY count DESC"):
        raw_field = row["name"]
        value = row["value"]
        use_count = row["count"]

        # Skip noise: pure numbers, UUIDs, timestamps, CSS selectors
        if is_noise_field(raw_field):
            continue

        # Skip very low usage (likely accidental fills)
        if use_count < 2:
            continue

        # Skip very long values (likely not user data)
        if len(value) > 500:
            continue

        # Clean the field name
        cleaned = clean_field_name(raw_field)
        if not cleaned or len(cleaned) < 2:
            continue

        # Try to map to a known normalized key
        if cleaned in AUTOFILL_FIELD_MAP:
            key_name, tags = AUTOFILL_FIELD_MAP[cleaned]
        else:
            key_name = f"autofill:{cleaned}"
            tags = infer_tags(cleaned)

        mem.upsert(key_name, value, tags, source=source)


def _ingest_cards(mem: MemoryDB, conn: sqlite3.Connection, source: str) -> None:
    """Upsert credit card metadata only (holder, expiry, nickname); no card numbers."""
    for row in conn.execute("SELECT name_on_card, expiration_month, expiration_year, nickname FROM credit_cards"):
        if row["name_on_card"]:
            mem.upsert("card_holder_name", row["name_on_card"],
                       ["payment", "identity"], source=source)
        if row["expiration_month"] and row["expiration_year"]:
            mem.upsert("card_expiry", f"{row['expiration_month']:02d}/{row['expiration_year']}",
                       ["payment"], source=source)
        if row["nickname"]:
            mem.upsert("card_nickname", row["nickname"],
                       ["payment"], source=source)


def _extract_webdata(mem: MemoryDB, browser: str, profile: str, webdata_path: Path):
    """Extract address profiles, form autofill, and credit card info from Web Data.

    Works on a temporary copy of the file. Extraction is best-effort: a
    section that raises sqlite3.OperationalError (for example because its
    table is absent) is skipped, and any other failure is logged as a warning
    instead of being raised.

    Args:
        mem: MemoryDB instance that receives the memories.
        browser: Browser name, used in the memory source labels.
        profile: Profile directory name, used in the memory source labels.
        webdata_path: Path of the profile's "Web Data" file.
    """
    try:
        tmp_db = _copy_db(webdata_path)
    except OSError as e:
        # Treat an unreadable file like any other extraction failure so one
        # bad profile does not abort the remaining profiles.
        log.warning(f"Failed to extract Web Data for {browser}/{profile}: {e}")
        return
    if not tmp_db:
        return

    sections = (
        ("address_type_tokens", _ingest_address_tokens, f"autofill:{browser}:{profile}"),
        ("autofill", _ingest_autofill, f"form:{browser}:{profile}"),
        ("credit_cards", _ingest_cards, f"card:{browser}:{profile}"),
    )

    conn = None
    try:
        conn = sqlite3.connect(f"file:{tmp_db}?mode=ro", uri=True)
        conn.row_factory = sqlite3.Row

        for table, ingest, source in sections:
            try:
                ingest(mem, conn, source)
            except sqlite3.OperationalError as e:
                log.debug("Skipping %s for %s/%s: %s", table, browser, profile, e)
    except Exception as e:
        log.warning(f"Failed to extract Web Data for {browser}/{profile}: {e}")
    finally:
        # Close on every path (including errors) before deleting the copy.
        if conn is not None:
            conn.close()
        shutil.rmtree(tmp_db.parent, ignore_errors=True)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def ingest_webdata(mem: MemoryDB):
    """Extract memories from all Chromium Web Data files.

    For every browser root in BROWSER_PATHS that exists, each "Default" or
    "Profile *" directory containing a "Web Data" file is processed, in
    sorted directory order.
    """
    for browser, root in BROWSER_PATHS.items():
        if not root.exists():
            continue
        for entry in sorted(root.iterdir()):
            if not entry.is_dir():
                continue
            if entry.name != "Default" and not entry.name.startswith("Profile "):
                continue
            webdata = entry / "Web Data"
            if not webdata.exists():
                continue
            log.info(f" Web Data: {browser}/{entry.name}")
            _extract_webdata(mem, browser, entry.name, webdata)
|