ai-browser-profile 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,151 @@
1
+ """Store and query decrypted WhatsApp messages."""
2
+
3
+ import json
4
+ import logging
5
+ import sqlite3
6
+ from datetime import datetime, timezone
7
+ from typing import Optional
8
+
9
+ from ai_browser_profile.db import MemoryDB
10
+
11
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# DDL run by _ensure_table() through executescript(). Every statement uses
# IF NOT EXISTS, so executing it against an existing database changes nothing.
#   sender      - sender JID as supplied by the caller (nullable)
#   sender_name - contact name resolved from memories (nullable)
#   captured_at - ISO-8601 timestamp string written by ingest_messages()
MESSAGES_SCHEMA = """
CREATE TABLE IF NOT EXISTS messages (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    sender TEXT,
    sender_name TEXT,
    text TEXT NOT NULL,
    captured_at TEXT,
    source TEXT DEFAULT 'whatsapp'
);
CREATE INDEX IF NOT EXISTS idx_messages_sender ON messages(sender);
CREATE INDEX IF NOT EXISTS idx_messages_captured ON messages(captured_at);
"""
25
+
26
+
27
def _ensure_table(conn: sqlite3.Connection):
    """Make sure the messages table and its indexes exist on ``conn``.

    Runs the MESSAGES_SCHEMA script; its statements all use IF NOT EXISTS,
    so calling this repeatedly is harmless.
    """
    schema_sql = MESSAGES_SCHEMA
    conn.executescript(schema_sql)
30
+
31
+
32
def _resolve_sender(mem: MemoryDB, sender_jid: Optional[str]) -> Optional[str]:
    """Look up a contact name for a sender JID using existing memories.

    The part of the JID before the first "@" is matched as a substring
    against the values of non-superseded ``contact:*`` memories.

    Args:
        mem: MemoryDB instance; only its ``conn`` attribute is used.
        sender_jid: Sender JID, or None/empty when the sender is unknown.

    Returns:
        The matching memory key with its ``contact:`` prefix removed, or None
        when there is nothing to look up, nothing matches, or the query fails.
    """
    if not sender_jid:
        return None

    phone = sender_jid.split("@")[0]
    if not phone:
        # An empty local part would turn the pattern into LIKE '%%', which
        # matches every contact and would return an arbitrary name.
        return None

    # Search for contact:* memories with this phone number as value
    try:
        row = mem.conn.execute(
            "SELECT key FROM memories WHERE key LIKE 'contact:%' "
            "AND value LIKE ? AND superseded_by IS NULL LIMIT 1",
            (f"%{phone}%",),
        ).fetchone()
    except sqlite3.Error as exc:
        # Name resolution is best-effort (e.g. the memories table may not
        # exist yet); the message is still stored without a sender name.
        log.debug("Sender lookup failed for %s: %s", sender_jid, exc)
        return None

    if row:
        # Strip only the leading prefix so a name that itself contains
        # "contact:" is left intact.
        return row[0].removeprefix("contact:")
    return None
51
+
52
+
53
def ingest_messages(mem: MemoryDB, messages: list[dict]) -> int:
    """Store decrypted messages into the messages table, deduplicating.

    A message is treated as a duplicate when its sender and the first 100
    characters of its stored text match a row already in the table or an
    earlier message in the same batch.

    Args:
        mem: MemoryDB instance (uses its connection).
        messages: List of dicts with keys: text, sender (JID or None), ts (epoch ms).

    Returns:
        Number of new messages inserted.
    """
    _ensure_table(mem.conn)
    now = datetime.now(timezone.utc).isoformat()

    # Deduplicate against existing messages
    existing = set()
    try:
        for row in mem.conn.execute("SELECT sender, substr(text, 1, 100) FROM messages"):
            existing.add((row[0], row[1]))
    except sqlite3.OperationalError:
        # Best-effort: without the stored keys we still dedupe within the batch.
        pass

    # One contact lookup per distinct sender instead of one per message.
    sender_names = {}
    inserted = 0
    for m in messages:
        # ``or ""`` also covers an explicit ``"text": None`` entry.
        text = (m.get("text") or "").strip()
        if not text:
            continue

        # Fix mojibake (Latin-1 encoded UTF-8). This has to happen before the
        # dedup key is built: the table stores the repaired text, so a key made
        # from the raw text would never match on a later run and the message
        # would be inserted again.
        try:
            text = text.encode("latin-1").decode("utf-8")
        except (UnicodeDecodeError, UnicodeEncodeError):
            pass

        sender = m.get("sender")
        dedup_key = (sender, text[:100])
        if dedup_key in existing:
            continue
        existing.add(dedup_key)

        if sender not in sender_names:
            sender_names[sender] = _resolve_sender(mem, sender)
        sender_name = sender_names[sender]

        # ts is epoch milliseconds; fall back to the ingest time when missing.
        ts = m.get("ts")
        captured_at = datetime.fromtimestamp(ts / 1000, tz=timezone.utc).isoformat() if ts else now

        mem.conn.execute(
            "INSERT INTO messages (sender, sender_name, text, captured_at, source) VALUES (?, ?, ?, ?, ?)",
            (sender, sender_name, text, captured_at, "whatsapp"),
        )
        inserted += 1

    mem.conn.commit()
    log.info(f" Messages: {inserted} new, {len(existing)} total")
    return inserted
105
+
106
+
107
def get_messages(mem: MemoryDB, sender: Optional[str] = None,
                 search: Optional[str] = None, limit: int = 100) -> list[dict]:
    """Return stored messages, newest first.

    Args:
        mem: MemoryDB instance (uses its connection).
        sender: Substring matched against the sender JID or sender name.
        search: Substring matched against the message text.
        limit: Max results.

    Returns:
        One dict per row with keys id, sender, sender_name, text,
        captured_at and source.
    """
    _ensure_table(mem.conn)

    columns = ("id", "sender", "sender_name", "text", "captured_at", "source")
    sql_parts = ["SELECT id, sender, sender_name, text, captured_at, source FROM messages WHERE 1=1"]
    bind_values = []

    if sender:
        sender_pattern = f"%{sender}%"
        sql_parts.append(" AND (sender LIKE ? OR sender_name LIKE ?)")
        bind_values += [sender_pattern, sender_pattern]

    if search:
        sql_parts.append(" AND text LIKE ?")
        bind_values.append(f"%{search}%")

    sql_parts.append(" ORDER BY captured_at DESC LIMIT ?")
    bind_values.append(limit)

    results = []
    for record in mem.conn.execute("".join(sql_parts), bind_values).fetchall():
        results.append({column: record[idx] for idx, column in enumerate(columns)})
    return results
138
+
139
+
140
def message_stats(mem: MemoryDB) -> dict:
    """Summarise the messages table.

    Returns:
        Dict with ``total_messages`` (row count) and ``top_senders``, a
        mapping of sender label to message count for the 20 most frequent
        senders. The label is the sender name, else the JID, else 'unknown'.
    """
    _ensure_table(mem.conn)
    db = mem.conn

    (message_count,) = db.execute("SELECT COUNT(*) FROM messages").fetchone()

    per_sender = {}
    ranking = db.execute(
        "SELECT COALESCE(sender_name, sender, 'unknown'), COUNT(*) FROM messages "
        "GROUP BY COALESCE(sender_name, sender, 'unknown') ORDER BY COUNT(*) DESC LIMIT 20"
    ).fetchall()
    for label, occurrences in ranking:
        per_sender[label] = occurrences

    return {"total_messages": message_count, "top_senders": per_sender}
@@ -0,0 +1,313 @@
1
+ """Notion desktop app ingestor — extracts memories from notion.db SQLite mirror.
2
+
3
+ Tier 1 (heuristic, no LLM): workspace info, users as contacts, page titles.
4
+ Tier 2 (LLM via claude -p): handled by run.sh, uses dump_changed_pages() to
5
+ reconstruct pages and pass to a Claude Code session for extraction.
6
+
7
+ Sync tracking: notion_last_sync_ts in MemoryDB metadata.
8
+ """
9
+
10
+ import json
11
+ import logging
12
+ import shutil
13
+ import sqlite3
14
+ import tempfile
15
+ from collections import deque
16
+ from pathlib import Path
17
+ from typing import Optional
18
+
19
+ from ai_browser_profile.db import MemoryDB
20
+
21
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Path of the Notion desktop app's local SQLite file under the user's home
# directory ("Library/Application Support/Notion/notion.db").
NOTION_DB = Path.home() / "Library" / "Application Support" / "Notion" / "notion.db"

# Block type → markdown prefix mapping
# (types missing from this map are rendered with no prefix)
BLOCK_MD = {
    "header": "#",
    "sub_header": "##",
    "sub_sub_header": "###",
    "bulleted_list": "-",
    "numbered_list": "1.",
    "to_do": "- [ ]",
    "quote": ">",
    "callout": ">",
    "toggle": "-",
    "text": "",
    "code": "```",
}

# Block types to skip entirely
# (they produce no markdown line of their own in _block_to_markdown)
SKIP_TYPES = {
    "page", "collection_view_page", "collection_view",
    "image", "video", "bookmark", "divider", "table_of_contents",
    "column_list", "column", "table", "table_row",
    "external_object_instance", "ai_block", "personal_home_page",
}

# Valid key prefixes for LLM extraction
# NOTE(review): not referenced in the code visible here; presumably consumed
# by the Tier 2 extraction step — confirm.
VALID_PREFIXES = {
    "contact", "project", "business", "work",
    "relationship", "product", "company", "activity",
}

# Valid tags for LLM extraction
# NOTE(review): not referenced in the code visible here — confirm the consumer.
VALID_TAGS = {"contact", "work", "knowledge", "finance", "tool"}

# Upper bound on blocks visited while reconstructing a single page.
MAX_BLOCKS_PER_PAGE = 500
# Each reconstructed page is truncated to this many characters.
MAX_CHARS_TO_LLM = 8000
59
+
60
+
61
def _copy_notion_db() -> Optional[Path]:
    """Copy notion.db plus its -wal/-shm sidecar files into a fresh temp dir.

    Working on a copy keeps the reader away from the files the Notion app has
    open.

    Returns:
        Path of the copied database, or None when NOTION_DB does not exist.
        The caller owns the returned file's parent directory and must remove
        it when done.

    Raises:
        OSError: If a copy fails; the temp directory is removed first.
    """
    if not NOTION_DB.exists():
        return None

    tmp = Path(tempfile.mkdtemp(prefix="ai_browser_profile_notion_"))
    try:
        dst = tmp / NOTION_DB.name
        shutil.copy2(NOTION_DB, dst)
        for suffix in ("-wal", "-shm"):
            sidecar = NOTION_DB.parent / (NOTION_DB.name + suffix)
            if sidecar.exists():
                shutil.copy2(sidecar, tmp / sidecar.name)
    except BaseException:
        # The caller never learns the temp dir's path when we raise, so it
        # has to be cleaned up here or it is leaked.
        shutil.rmtree(tmp, ignore_errors=True)
        raise
    return dst
73
+
74
+
75
def _extract_title(properties_json: Optional[str]) -> str:
    """Parse Notion rich-text title from properties JSON.

    Format: {"title": [["plain text"], ["bold text", [["b"]]], ...]}

    Args:
        properties_json: Raw JSON text of a block's properties, or None.

    Returns:
        The concatenated text of every title segment, stripped. An empty
        string when the input is missing, is not valid JSON, is not a JSON
        object, or has no usable ``title`` list.
    """
    if not properties_json:
        return ""
    try:
        props = json.loads(properties_json)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError and undecodable bytes input.
        return ""

    # Valid JSON is not necessarily an object ("[]", "3", '"x"'); only an
    # object can carry a title.
    if not isinstance(props, dict):
        return ""
    title_parts = props.get("title")
    if not isinstance(title_parts, list):
        return ""

    # Each segment is [text, formatting...]; only the leading text is kept.
    segments = [
        part[0]
        for part in title_parts
        if isinstance(part, list) and part and isinstance(part[0], str)
    ]
    return "".join(segments).strip()
94
+
95
+
96
def _block_to_markdown(block_type: str, properties_json: Optional[str], depth: int = 0) -> Optional[str]:
    """Render one block as markdown text.

    Args:
        block_type: Notion block type name.
        properties_json: Raw JSON text of the block's properties, or None.
        depth: Nesting level; the output is indented by one space per level.

    Returns:
        The markdown for the block, or None when the type is in SKIP_TYPES or
        the block has no title text.
    """
    if block_type in SKIP_TYPES:
        return None

    text = _extract_title(properties_json)
    if not text:
        return None

    pad = " " * depth

    # Code blocks are fenced instead of prefixed.
    if block_type == "code":
        return "\n".join((f"{pad}```", f"{pad}{text}", f"{pad}```"))

    marker = BLOCK_MD.get(block_type, "")

    # A to_do whose "checked" property reads "Yes" gets a ticked checkbox.
    if block_type == "to_do":
        try:
            parsed = json.loads(properties_json) if properties_json else {}
            state = parsed.get("checked", [])
            if state and state[0][0] == "Yes":
                marker = "- [x]"
        except (json.JSONDecodeError, TypeError, IndexError):
            pass

    return f"{pad}{marker} {text}" if marker else f"{pad}{text}"
125
+
126
+
127
def _reconstruct_page(conn: sqlite3.Connection, page_id: str, page_title: str) -> str:
    """Walk a page's live descendant blocks and assemble them as markdown.

    Blocks are visited in pre-order (each block, then its children, then its
    next sibling) so that an indented child line directly follows its parent.
    A breadth-first walk would emit every nested line after all shallower
    blocks, detached from the line it belongs to.

    Args:
        conn: Connection to the Notion DB copy; rows must be addressable by
            column name (sqlite3.Row).
        page_id: id of the page block whose children are walked.
        page_title: Title rendered as the leading "# " heading.

    Returns:
        Markdown text: the heading, a blank line, then one entry per rendered
        block. At most MAX_BLOCKS_PER_PAGE blocks are visited.
    """
    child_sql = (
        "SELECT id, type, properties FROM block "
        "WHERE parent_id = ? AND alive = 1 ORDER BY rowid"
    )

    def children_of(parent_id, depth):
        """Fetch a block's live children as stack entries, last child first."""
        rows = conn.execute(child_sql, (parent_id,)).fetchall()
        # Reversed so that popping from the end of the stack yields rowid order.
        return [(r["id"], r["type"], r["properties"], depth) for r in reversed(rows)]

    lines = [f"# {page_title}", ""]

    # Stack of (block_id, type, properties, depth) still to be visited.
    stack = children_of(page_id, 0)
    block_count = 0

    # The cap bounds both output size and traversal work.
    while stack and block_count < MAX_BLOCKS_PER_PAGE:
        block_id, btype, props, depth = stack.pop()
        block_count += 1

        md_line = _block_to_markdown(btype, props, depth)
        if md_line:
            lines.append(md_line)

        # Children are walked even when the block itself rendered nothing.
        stack.extend(children_of(block_id, depth + 1))

    return "\n".join(lines)
163
+
164
+
165
def _ingest_workspace(mem: MemoryDB, conn: sqlite3.Connection) -> int:
    """Tier 1: store each non-empty workspace name from the space table.

    Every name is upserted under the single key ``company:notion_workspace``.

    Returns:
        Number of workspace rows processed.
    """
    workspace_rows = conn.execute(
        "SELECT name FROM space WHERE name IS NOT NULL AND name != ''"
    )
    processed = 0
    for workspace in workspace_rows:
        mem.upsert(
            "company:notion_workspace",
            workspace["name"],
            tags=["work"],
            source="notion:workspace",
        )
        processed += 1
    return processed
177
+
178
+
179
def _ingest_users(mem: MemoryDB, conn: sqlite3.Connection) -> int:
    """Tier 1: store each Notion user as a ``contact:<name>`` memory.

    The memory value is the user's email address. Rows whose name or email is
    blank after stripping whitespace are ignored.

    Returns:
        Number of contacts upserted.
    """
    user_sql = (
        "SELECT name, email, given_name, family_name FROM notion_user "
        "WHERE name IS NOT NULL AND name != '' AND email IS NOT NULL AND email != ''"
    )
    stored = 0
    for record in conn.execute(user_sql):
        display_name = record["name"].strip()
        address = record["email"].strip()
        if display_name and address:
            mem.upsert(
                f"contact:{display_name}",
                address,
                tags=["contact", "work"],
                source="notion:user",
            )
            stored += 1
    return stored
198
+
199
+
200
def _ingest_page_titles(mem: MemoryDB, conn: sqlite3.Connection) -> int:
    """Tier 1: turn top-level page and transcription titles into memories.

    Only live blocks whose parent table is 'space' are considered, and titles
    shorter than three characters are ignored. Transcriptions become
    ``activity:meeting:<title>`` memories, pages become ``project:<title>``;
    the title part of the key is cut to 80 characters.

    Returns:
        Number of titles upserted.
    """
    title_sql = (
        "SELECT id, type, properties FROM block "
        "WHERE type IN ('page', 'transcription') AND alive = 1 "
        "AND properties IS NOT NULL AND parent_table = 'space'"
    )
    stored = 0
    for block_row in conn.execute(title_sql):
        title = _extract_title(block_row["properties"])
        # An empty title also fails this length check.
        if len(title) < 3:
            continue

        short_title = title[:80]
        if block_row["type"] == "transcription":
            key = f"activity:meeting:{short_title}"
            tags = ["work", "contact"]
            source = "notion:transcription"
        else:
            key = f"project:{short_title}"
            tags = ["work", "knowledge"]
            source = "notion:page"

        mem.upsert(key, title, tags=tags, source=source)
        stored += 1
    return stored
228
+
229
+
230
def dump_changed_pages(mem: MemoryDB, limit: int = 50, min_blocks: int = 5) -> str:
    """Reconstruct changed Notion pages as markdown for LLM extraction.

    Returns concatenated markdown of pages changed since last sync,
    filtered to pages with at least min_blocks child blocks.
    Used by run.sh to pass content to a claude -p session.

    Args:
        mem: MemoryDB instance; its ``notion_last_sync_ts`` metadata value is
            the lower bound for ``last_edited_time``.
        limit: Maximum number of pages to include.
        min_blocks: Minimum number of live direct children a page needs.

    Returns:
        The page markdowns joined by a "---" separator, each cut to
        MAX_CHARS_TO_LLM characters. An empty string when the Notion DB is
        absent or nothing qualifies.
    """
    tmp = _copy_notion_db()
    if not tmp:
        return ""

    try:
        conn = sqlite3.connect(f"file:{tmp}?mode=ro", uri=True)
        try:
            conn.row_factory = sqlite3.Row

            since_ts = float(mem.get_meta("notion_last_sync_ts") or "0")

            rows = conn.execute(
                "SELECT id, type, properties, last_edited_time FROM block "
                "WHERE type IN ('page', 'transcription') AND alive = 1 "
                "AND properties IS NOT NULL AND last_edited_time > ? "
                "ORDER BY last_edited_time DESC LIMIT ?",
                (since_ts, limit * 3),  # fetch extra to filter by block count
            ).fetchall()

            pages = []
            for row in rows:
                title = _extract_title(row["properties"])
                if not title or len(title) < 3:
                    continue
                child_count = conn.execute(
                    "SELECT COUNT(*) FROM block WHERE parent_id = ? AND alive = 1",
                    (row["id"],)
                ).fetchone()[0]
                if child_count < min_blocks:
                    continue
                page_md = _reconstruct_page(conn, row["id"], title)
                # Skip pages that reconstruct to little more than a heading.
                if len(page_md) > 50:
                    pages.append(page_md[:MAX_CHARS_TO_LLM])
                if len(pages) >= limit:
                    break

            return "\n\n---\n\n".join(pages)
        finally:
            # Close on the error path too, before the temp copy is deleted.
            conn.close()
    finally:
        shutil.rmtree(tmp.parent, ignore_errors=True)
276
+
277
+
278
def ingest_notion(mem: MemoryDB):
    """Main entry point. Copies DB, runs Tier 1, updates high-water mark.

    The high-water mark (``notion_last_sync_ts`` metadata) is set to the
    largest ``last_edited_time`` found in the block table, and only after all
    Tier 1 steps succeeded.

    Args:
        mem: MemoryDB instance that receives the memories and metadata.

    Raises:
        Exception: Any error raised while reading the copy or writing
            memories is logged as a warning and re-raised.
    """
    tmp = _copy_notion_db()
    if not tmp:
        log.warning("Notion DB not found — skipping Notion ingestor")
        return

    conn = None
    try:
        conn = sqlite3.connect(f"file:{tmp}?mode=ro", uri=True)
        conn.row_factory = sqlite3.Row

        # Get max last_edited_time for new high-water mark
        row = conn.execute(
            "SELECT MAX(last_edited_time) as max_ts FROM block"
        ).fetchone()
        new_ts = row["max_ts"] if row and row["max_ts"] else 0

        # Tier 1: always runs
        ws_count = _ingest_workspace(mem, conn)
        user_count = _ingest_users(mem, conn)
        title_count = _ingest_page_titles(mem, conn)
        log.info(
            f"Notion Tier 1: {ws_count} workspaces, {user_count} users, "
            f"{title_count} page titles"
        )

        # Update high-water mark
        if new_ts:
            mem.set_meta("notion_last_sync_ts", str(new_ts))
    except Exception as e:
        log.warning(f"Notion ingestor error: {e}")
        raise
    finally:
        # Close on the error path too, before the temp copy is deleted.
        if conn is not None:
            conn.close()
        shutil.rmtree(tmp.parent, ignore_errors=True)
@@ -0,0 +1,134 @@
1
+ """Ingest memories directly from Chromium Web Data files (address profiles, autofill, cards)."""
2
+
3
+ import shutil
4
+ import sqlite3
5
+ import tempfile
6
+ import logging
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ from ai_browser_profile.db import MemoryDB
11
+ from ai_browser_profile.ingestors.constants import (
12
+ ADDRESS_TYPE_MAP, AUTOFILL_FIELD_MAP, BROWSER_PATHS,
13
+ clean_field_name, is_noise_field, infer_tags,
14
+ )
15
+
16
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
17
+
18
+
19
def _copy_db(src: Path) -> Optional[Path]:
    """Copy a SQLite DB to temp dir to avoid browser locks.

    The ``-wal`` and ``-shm`` sidecar files are copied alongside the main
    file when they exist.

    Args:
        src: Path of the database file to copy.

    Returns:
        Path of the copy inside a fresh temp directory, or None when ``src``
        does not exist. The caller owns the copy's parent directory and must
        remove it when done.

    Raises:
        OSError: If a copy fails; the temp directory is removed first.
    """
    if not src.exists():
        return None

    tmp = Path(tempfile.mkdtemp(prefix="ai_browser_profile_"))
    try:
        dst = tmp / src.name
        shutil.copy2(src, dst)
        for suffix in ("-wal", "-shm"):
            sidecar = src.parent / (src.name + suffix)
            if sidecar.exists():
                shutil.copy2(sidecar, tmp / sidecar.name)
    except BaseException:
        # The caller never learns the temp dir's path when we raise, so it
        # has to be cleaned up here or it is leaked.
        shutil.rmtree(tmp, ignore_errors=True)
        raise
    return dst
31
+
32
+
33
def _ingest_address_tokens(mem: MemoryDB, conn: sqlite3.Connection, source: str) -> None:
    """Upsert every non-empty structured address token (all type codes)."""
    for row in conn.execute("SELECT guid, type, value FROM address_type_tokens WHERE value != ''"):
        type_code = row["type"]

        if type_code in ADDRESS_TYPE_MAP:
            key_name, tags = ADDRESS_TYPE_MAP[type_code]
        else:
            # Unmapped type codes are kept under a generic key.
            key_name = f"address_type_{type_code}"
            tags = ["address"]

        mem.upsert(key_name, row["value"], tags, source=source)


def _ingest_autofill_entries(mem: MemoryDB, conn: sqlite3.Connection, source: str) -> None:
    """Upsert form autofill entries (ALL fields, not just mapped ones)."""
    for row in conn.execute("SELECT name, value, count FROM autofill WHERE value != '' ORDER BY count DESC"):
        raw_field = row["name"]
        value = row["value"]

        # Skip noise: pure numbers, UUIDs, timestamps, CSS selectors
        if is_noise_field(raw_field):
            continue

        # Skip very low usage (likely accidental fills)
        if row["count"] < 2:
            continue

        # Skip very long values (likely not user data)
        if len(value) > 500:
            continue

        # Clean the field name
        cleaned = clean_field_name(raw_field)
        if not cleaned or len(cleaned) < 2:
            continue

        # Try to map to a known normalized key
        if cleaned in AUTOFILL_FIELD_MAP:
            key_name, tags = AUTOFILL_FIELD_MAP[cleaned]
        else:
            key_name = f"autofill:{cleaned}"
            tags = infer_tags(cleaned)

        mem.upsert(key_name, value, tags, source=source)


def _ingest_credit_cards(mem: MemoryDB, conn: sqlite3.Connection, source: str) -> None:
    """Upsert credit card metadata only (holder, expiry, nickname); no card numbers."""
    for row in conn.execute("SELECT name_on_card, expiration_month, expiration_year, nickname FROM credit_cards"):
        if row["name_on_card"]:
            mem.upsert("card_holder_name", row["name_on_card"],
                       ["payment", "identity"], source=source)
        if row["expiration_month"] and row["expiration_year"]:
            mem.upsert("card_expiry", f"{row['expiration_month']:02d}/{row['expiration_year']}",
                       ["payment"], source=source)
        if row["nickname"]:
            mem.upsert("card_nickname", row["nickname"],
                       ["payment"], source=source)


def _extract_webdata(mem: MemoryDB, browser: str, profile: str, webdata_path: Path):
    """Extract address profiles, form autofill, and credit card info from Web Data.

    The file is copied to a temp directory first and opened read-only. The
    three sections are independent: a sqlite3.OperationalError in one (for
    example a missing table) skips only that section. Any other error is
    logged as a warning and ends the extraction for this profile without
    raising.

    Args:
        mem: MemoryDB instance that receives the memories.
        browser: Browser name, used in the memory source labels.
        profile: Profile directory name, used in the memory source labels.
        webdata_path: Path of the profile's "Web Data" file.
    """
    tmp_db = _copy_db(webdata_path)
    if not tmp_db:
        return

    # Each section tags its memories with its own source label.
    sections = (
        (_ingest_address_tokens, f"autofill:{browser}:{profile}"),
        (_ingest_autofill_entries, f"form:{browser}:{profile}"),
        (_ingest_credit_cards, f"card:{browser}:{profile}"),
    )

    conn = None
    try:
        conn = sqlite3.connect(f"file:{tmp_db}?mode=ro", uri=True)
        conn.row_factory = sqlite3.Row

        for ingest_section, source in sections:
            try:
                ingest_section(mem, conn, source)
            except sqlite3.OperationalError:
                # Best-effort per section; carry on with the next one.
                pass
    except Exception as e:
        log.warning(f"Failed to extract Web Data for {browser}/{profile}: {e}")
    finally:
        # Close on the error path too, before the temp copy is deleted.
        if conn is not None:
            conn.close()
        shutil.rmtree(tmp_db.parent, ignore_errors=True)
122
+
123
+
124
def ingest_webdata(mem: MemoryDB):
    """Run Web Data extraction for every Chromium profile that has one.

    For each existing base directory in BROWSER_PATHS, the sub-directories
    named "Default" or starting with "Profile " are visited in sorted order,
    and those containing a "Web Data" file are handed to _extract_webdata().
    """
    for browser, base in BROWSER_PATHS.items():
        if not base.exists():
            continue

        for entry in sorted(base.iterdir()):
            if not entry.is_dir():
                continue
            profile_name = entry.name
            if profile_name != "Default" and not profile_name.startswith("Profile "):
                continue

            webdata = entry / "Web Data"
            if not webdata.exists():
                continue

            log.info(f" Web Data: {browser}/{entry.name}")
            _extract_webdata(mem, browser, profile_name, webdata)