ai-browser-profile 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +118 -0
- package/ai_browser_profile/__init__.py +6 -0
- package/ai_browser_profile/db.py +929 -0
- package/ai_browser_profile/embeddings.py +196 -0
- package/ai_browser_profile/extract.py +108 -0
- package/ai_browser_profile/ingestors/__init__.py +0 -0
- package/ai_browser_profile/ingestors/bookmarks.py +185 -0
- package/ai_browser_profile/ingestors/browser_detect.py +100 -0
- package/ai_browser_profile/ingestors/constants.py +208 -0
- package/ai_browser_profile/ingestors/history.py +123 -0
- package/ai_browser_profile/ingestors/indexeddb.py +203 -0
- package/ai_browser_profile/ingestors/localstorage.py +66 -0
- package/ai_browser_profile/ingestors/logins.py +46 -0
- package/ai_browser_profile/ingestors/messages.py +151 -0
- package/ai_browser_profile/ingestors/notion.py +313 -0
- package/ai_browser_profile/ingestors/webdata.py +134 -0
- package/autofill/SKILL.md +252 -0
- package/bin/cli.js +315 -0
- package/clean.py +295 -0
- package/extract.py +53 -0
- package/package.json +40 -0
- package/review/SKILL.md +171 -0
- package/review/run.sh +82 -0
- package/setup/SKILL.md +177 -0
- package/skill/SKILL.md +180 -0
- package/whatsapp/SKILL.md +321 -0
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""Lazy-loading embedding model + vector storage for semantic search.
|
|
2
|
+
|
|
3
|
+
Uses a plain SQLite table for embeddings (no sqlite-vec extension needed)
|
|
4
|
+
and computes cosine similarity in Python via numpy.
|
|
5
|
+
|
|
6
|
+
Model: nomic-embed-text-v1.5 (768-dim, 2K context, MTEB ~65).
|
|
7
|
+
Runtime: ONNX Runtime (~50MB) with pre-built quantized model (~131MB).
|
|
8
|
+
No PyTorch, transformers, scipy, or sklearn needed.
|
|
9
|
+
|
|
10
|
+
Nomic uses task prefixes: "search_document: " for stored texts,
|
|
11
|
+
"search_query: " for queries.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import logging
|
|
15
|
+
import struct
|
|
16
|
+
from typing import Optional
|
|
17
|
+
|
|
18
|
+
log = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
# Module-level state — loaded on first call
|
|
21
|
+
_session = None
|
|
22
|
+
_tokenizer = None
|
|
23
|
+
|
|
24
|
+
EMBEDDING_DIM = 768
|
|
25
|
+
MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
|
|
26
|
+
ONNX_FILE = "onnx/model_quantized.onnx"
|
|
27
|
+
MAX_LENGTH = 512
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _load_model():
    """Load the ONNX session and tokenizer on first use (~131MB download).

    Both module-level globals are published together, only after the session
    and the tokenizer have each been created successfully. A failure halfway
    through therefore leaves the module in the "not loaded" state and the
    next call retries, instead of reporting success with a missing tokenizer.

    Returns:
        True when the session and tokenizer are ready; False when a
        dependency is not installed or loading raised.
    """
    global _session, _tokenizer
    # A session without a tokenizer is unusable, so require both.
    if _session is not None and _tokenizer is not None:
        return True
    try:
        import onnxruntime as ort
        from huggingface_hub import hf_hub_download
        from tokenizers import Tokenizer

        onnx_path = hf_hub_download(MODEL_NAME, ONNX_FILE)
        tok_path = hf_hub_download(MODEL_NAME, "tokenizer.json")

        # Prefer CoreML when onnxruntime reports it, keeping CPU as fallback.
        if 'CoreMLExecutionProvider' in ort.get_available_providers():
            providers = ['CoreMLExecutionProvider', 'CPUExecutionProvider']
        else:
            providers = ['CPUExecutionProvider']
        session = ort.InferenceSession(onnx_path, providers=providers)

        tokenizer = Tokenizer.from_file(tok_path)
        # Only truncate; padding is done dynamically per-batch in _embed_raw
        tokenizer.enable_truncation(max_length=MAX_LENGTH)

        # Publish both halves at once so callers never see a partial load.
        _session, _tokenizer = session, tokenizer

        log.info(f"Loaded embedding model: {MODEL_NAME} (ONNX, {_session.get_providers()})")
        return True
    except ImportError as e:
        log.warning(f"ONNX runtime dependencies not installed — semantic search disabled: {e}")
        return False
    except Exception as e:
        log.warning(f"Failed to load embedding model: {e}")
        return False
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _embed_raw(texts: list[str]) -> list[Optional[list[float]]]:
    """Run already-prefixed texts through the ONNX session in batches of 32.

    For each batch the first model output is mean-pooled under the attention
    mask, and every pooled vector is divided by its L2 norm. Vectors are
    returned as plain Python lists, in input order.

    The caller must have loaded the model first: this function uses the
    module-level ``_tokenizer`` and ``_session`` directly.
    """
    import numpy as np

    chunk = 32
    vectors = []
    for start in range(0, len(texts), chunk):
        encodings = _tokenizer.encode_batch(texts[start:start + chunk])

        # Pad only as far as the longest sequence in this batch, not MAX_LENGTH.
        width = max(len(enc.ids) for enc in encodings)
        shape = (len(encodings), width)
        input_ids = np.zeros(shape, dtype=np.int64)
        attention_mask = np.zeros(shape, dtype=np.int64)
        for row, enc in enumerate(encodings):
            n_tokens = len(enc.ids)
            input_ids[row, :n_tokens] = enc.ids
            attention_mask[row, :n_tokens] = enc.attention_mask

        feeds = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": np.zeros_like(input_ids),
        }
        # First output is treated as per-token hidden states (batch, seq, dim).
        hidden = _session.run(None, feeds)[0]

        # Masked mean over the sequence axis, then L2-normalise each row.
        weights = attention_mask[:, :, None].astype(np.float32)
        pooled = (hidden * weights).sum(axis=1) / weights.sum(axis=1)
        pooled = pooled / np.linalg.norm(pooled, axis=1, keepdims=True)

        vectors.extend(row_vec.tolist() for row_vec in pooled)

    return vectors
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def is_available() -> bool:
    """Report whether the embedding model is loaded (loading it if needed)."""
    loaded = _load_model()
    return loaded
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def embed_text(text: str, prefix: str = "search_document") -> Optional[list[float]]:
    """Embed one string, or return None when the model cannot be loaded.

    Args:
        text: The text to embed.
        prefix: Task prefix placed before the text as ``"<prefix>: <text>"``;
            "search_document" for storing, "search_query" for searching.
    """
    if not _load_model():
        return None
    vectors = _embed_raw([f"{prefix}: {text}"])
    if not vectors:
        return None
    return vectors[0]
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def embed_batch(texts: list[str], prefix: str = "search_document") -> list[Optional[list[float]]]:
    """Embed several texts at once.

    Returns one vector per input text, or a list of ``None`` of the same
    length when the model cannot be loaded.
    """
    if _load_model():
        return _embed_raw([f"{prefix}: {item}" for item in texts])
    return [None] * len(texts)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _serialize_vec(vec: list[float]) -> bytes:
|
|
127
|
+
"""Serialize a float vector to bytes for SQLite BLOB storage."""
|
|
128
|
+
return struct.pack(f"{len(vec)}f", *vec)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _deserialize_vec(blob: bytes) -> list[float]:
|
|
132
|
+
"""Deserialize bytes back to float vector."""
|
|
133
|
+
n = len(blob) // 4
|
|
134
|
+
return list(struct.unpack(f"{n}f", blob))
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def setup_embeddings_table(conn) -> bool:
    """Ensure the ``memory_embeddings`` table exists (plain SQLite, no extensions).

    Returns True once the table has been created (or already existed) and
    the change committed; logs a warning and returns False on any error.
    """
    ddl = """
        CREATE TABLE IF NOT EXISTS memory_embeddings (
            memory_id INTEGER PRIMARY KEY,
            embedding BLOB NOT NULL
        )
    """
    try:
        conn.execute(ddl)
        conn.commit()
    except Exception as e:
        log.warning(f"Failed to create embeddings table: {e}")
        return False
    return True
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def store_embedding(conn, memory_id: int, vec: list[float]):
    """Insert or replace the embedding row for ``memory_id``.

    Best-effort: any failure is logged at debug level and swallowed. The
    statement is not committed here.
    """
    sql = "INSERT OR REPLACE INTO memory_embeddings (memory_id, embedding) VALUES (?, ?)"
    try:
        blob = _serialize_vec(vec)
        conn.execute(sql, (memory_id, blob))
    except Exception as e:
        log.debug(f"Failed to store embedding for {memory_id}: {e}")
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def cosine_search(conn, query_vec: list[float], limit: int = 20,
                  threshold: float = 0.5) -> list[tuple[int, float]]:
    """Search for similar memories by embedding.

    Computes the dot product between the query and every stored vector in
    Python; for pre-normalized vectors this equals cosine similarity.

    Args:
        conn: SQLite connection holding the ``memory_embeddings`` table.
        query_vec: Query embedding.
        limit: Maximum number of results to return.
        threshold: Minimum similarity a row needs to be included.

    Returns:
        [(memory_id, similarity)] sorted by similarity descending. An empty
        list is returned when numpy is missing, the table cannot be read,
        or nothing matches.
    """
    try:
        import numpy as np
    except ImportError:
        log.warning("numpy not available for cosine search")
        return []

    try:
        rows = conn.execute("SELECT memory_id, embedding FROM memory_embeddings").fetchall()
    except Exception:
        return []

    if not rows:
        return []

    q = np.array(query_vec, dtype=np.float32)
    expected_bytes = q.size * q.itemsize

    results = []
    for mem_id, blob in rows:
        # A row stored with a different vector length (e.g. written by
        # another model, or truncated) would make np.dot raise and abort
        # the whole search; skip such rows instead.
        if blob is None or len(blob) != expected_bytes:
            continue
        vec = np.frombuffer(blob, dtype=np.float32)
        # Dot product of normalized vectors = cosine similarity
        sim = float(np.dot(q, vec))
        if sim >= threshold:
            results.append((mem_id, sim))

    results.sort(key=lambda x: -x[1])
    return results[:limit]
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
"""Orchestrate memory extraction from all browser sources."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import time
|
|
5
|
+
from typing import Optional, Set
|
|
6
|
+
|
|
7
|
+
from ai_browser_profile.db import MemoryDB
|
|
8
|
+
from ai_browser_profile.embeddings import setup_embeddings_table
|
|
9
|
+
from ai_browser_profile.ingestors.browser_detect import detect_browsers
|
|
10
|
+
from ai_browser_profile.ingestors.webdata import ingest_webdata
|
|
11
|
+
from ai_browser_profile.ingestors.history import ingest_history
|
|
12
|
+
from ai_browser_profile.ingestors.logins import ingest_logins
|
|
13
|
+
from ai_browser_profile.ingestors.bookmarks import ingest_bookmarks
|
|
14
|
+
|
|
15
|
+
log = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _timed(name, func, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and log how long it took.

    A "starting" line is logged before the call and a "done" line with the
    elapsed seconds after it returns. The callee's result is passed through.
    """
    log.info(f"[{name}] starting...")
    started = time.monotonic()
    outcome = func(*args, **kwargs)
    elapsed = time.monotonic() - started
    log.info(f"[{name}] done in {elapsed:.1f}s")
    return outcome
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def extract_memories(memories_db_path: str = "memories.db",
                     browsers: Optional[Set[str]] = None,
                     skip_indexeddb: bool = False,
                     skip_localstorage: bool = False,
                     skip_notion: bool = False) -> MemoryDB:
    """Build the memories database directly from browser files.

    Runs the ingestors in a fixed order, does an interim cleanup and logs an
    interim profile, then runs the slow steps (WhatsApp, embeddings).

    Args:
        memories_db_path: Output database path.
        browsers: Set of browser names to scan (None = all).
        skip_indexeddb: Skip IndexedDB extraction (requires ccl_chromium_reader).
        skip_localstorage: Skip Local Storage extraction (requires ccl_chromium_reader).
        skip_notion: Skip the Notion ingestor.

    Returns:
        The MemoryDB handle that was used for the final steps (still open).
    """
    total_start = time.monotonic()
    mem = MemoryDB(memories_db_path, defer_embeddings=True)
    profiles = detect_browsers(allowed=browsers)
    log.info(f"Extracting memories from {len(profiles)} profiles...")

    # 1. Autofill — saved form data, addresses, credit cards
    _timed("Autofill", ingest_webdata, mem)

    # 2. History — tools and services used
    _timed("History", ingest_history, mem, profiles)

    # 3. Bookmarks — interests and saved links
    _timed("Bookmarks", ingest_bookmarks, mem, profiles)

    # 4. Logins — saved accounts per site
    _timed("Logins", ingest_logins, mem, profiles)

    # 5. LinkedIn — connections from Local Storage
    if not skip_localstorage:
        try:
            from ai_browser_profile.ingestors.localstorage import ingest_localstorage
            _timed("LinkedIn", ingest_localstorage, mem, profiles)
        except ImportError:
            log.warning("ccl_chromium_reader not installed — skipping LinkedIn")

    # 6. Notion — workspace contacts, pages, meetings
    if not skip_notion:
        try:
            from ai_browser_profile.ingestors.notion import ingest_notion
            _timed("Notion", ingest_notion, mem)
        except Exception as e:
            log.warning(f"Notion ingestor failed: {e}")

    # -- Interim profile: core data is ready, show it before slow steps --
    mem.conn.commit()
    # Run cleanup before showing profile so noise is removed. ``clean`` is
    # imported as a top-level module rather than from this package, so it is
    # only importable when it happens to be on sys.path. Treat it like the
    # other optional steps instead of aborting after all the ingestion work.
    try:
        from clean import run_cleanup
    except ImportError as e:
        run_cleanup = None
        log.warning(f"clean module not importable — skipping interim cleanup: {e}")
    if run_cleanup is not None:
        log.info("Running interim cleanup...")
        # Cleanup is handed the database path, so release our handle first
        # and reopen afterwards.
        mem.close()
        run_cleanup(db_path=memories_db_path)
        mem = MemoryDB(memories_db_path, defer_embeddings=True)
    interim_profile = mem.profile_text()
    log.info(f"Interim profile ready (WhatsApp + embeddings still running):\n{interim_profile}")

    # 7. WhatsApp — contacts from IndexedDB (slow, runs last)
    if not skip_indexeddb:
        try:
            from ai_browser_profile.ingestors.indexeddb import ingest_indexeddb
            _timed("WhatsApp", ingest_indexeddb, mem, profiles)
        except ImportError:
            log.warning("ccl_chromium_reader not installed — skipping WhatsApp")

    mem.conn.commit()

    # 8. Embeddings — backfill all at once (loads ONNX model once, batches efficiently)
    mem._vec_ready = setup_embeddings_table(mem.conn)
    mem._defer_embeddings = False
    if mem._vec_ready:
        _timed("Embeddings", mem.backfill_embeddings)

    total_elapsed = time.monotonic() - total_start
    stats = mem.stats()
    log.info(
        f"Memories: {stats['total_memories']} total, "
        f"tags: {', '.join(f'{t}={c}' for t, c in list(stats['by_tag'].items())[:10])}"
    )
    log.info(f"Total extraction time: {total_elapsed:.1f}s")
    return mem
|
|
File without changes
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""Ingest bookmark memories from browser bookmark files."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import plistlib
|
|
5
|
+
import shutil
|
|
6
|
+
import sqlite3
|
|
7
|
+
import logging
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from ai_browser_profile.db import MemoryDB
|
|
11
|
+
from ai_browser_profile.ingestors.browser_detect import BrowserProfile, copy_db, domain
|
|
12
|
+
from ai_browser_profile.ingestors.constants import SERVICE_NAMES
|
|
13
|
+
|
|
14
|
+
log = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
# URLs to skip entirely
|
|
17
|
+
_SKIP_SCHEMES = ("chrome://", "chrome-extension://", "about:", "javascript:", "edge://", "brave://")
|
|
18
|
+
|
|
19
|
+
# Default bookmarks that are noise
|
|
20
|
+
_SKIP_DOMAINS = {"www.apple.com", "apple.com", "support.apple.com"}
|
|
21
|
+
_SKIP_URLS = {"https://www.google.com/", "http://www.google.com/", "https://google.com/", "http://google.com/"}
|
|
22
|
+
|
|
23
|
+
# Domain keywords for tag inference on unknown bookmarks
|
|
24
|
+
_TAG_KEYWORDS = {
|
|
25
|
+
"tool": {"github", "gitlab", "stackoverflow", "stackexchange", "dev", "codepen",
|
|
26
|
+
"codesandbox", "npm", "pypi", "crates", "packagist", "brew"},
|
|
27
|
+
"knowledge": {"docs", "wiki", "learn", "tutorial", "guide", "course", "edu",
|
|
28
|
+
"arxiv", "scholar", "paper", "blog", "medium", "substack"},
|
|
29
|
+
"social": {"twitter", "x.com", "linkedin", "facebook", "instagram", "reddit",
|
|
30
|
+
"mastodon", "threads", "tiktok", "youtube"},
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _should_skip(url: str) -> bool:
    """Tell whether a bookmark URL is noise and should be ignored.

    A URL is skipped when it uses one of the internal schemes, is one of the
    listed default URLs, belongs to a skipped domain, or has no domain.
    """
    if url.startswith(_SKIP_SCHEMES) or url in _SKIP_URLS:
        return True
    host = domain(url)
    return host in _SKIP_DOMAINS or not host
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _infer_tags(d: str) -> list[str]:
    """Infer tags for an unknown bookmark domain.

    Every domain gets "knowledge"; further tags are added when one of the
    tag's keywords matches the lower-cased domain.

    Keywords without a dot match as substrings. Keywords that are themselves
    a domain (such as "x.com") match only that domain or its subdomains, so
    hosts that merely end in the same letters (dropbox.com, netflix.com) are
    not mis-tagged.

    Returns:
        The tags in sorted order, so the result is deterministic.
    """
    tags = {"knowledge"}
    d_lower = d.lower()
    for tag, keywords in _TAG_KEYWORDS.items():
        for kw in keywords:
            if "." in kw:
                matched = d_lower == kw or d_lower.endswith("." + kw)
            else:
                matched = kw in d_lower
            if matched:
                tags.add(tag)
                break
    return sorted(tags)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def _walk_chromium_bookmarks(node: dict, out: list[dict]):
    """Depth-first walk of a Chromium bookmark node, collecting URL entries.

    Nodes of type "url" that pass ``_should_skip`` are appended to ``out``
    as ``{"url": ..., "title": ...}``; any "children" are then visited.
    """
    is_link = node.get("type") == "url"
    if is_link:
        link = node.get("url", "")
        label = node.get("name", "")
        if link and not _should_skip(link):
            out.append({"url": link, "title": label})
    children = node.get("children", [])
    for sub in children:
        _walk_chromium_bookmarks(sub, out)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _chromium_bookmarks(profile: BrowserProfile) -> list[dict]:
    """Load bookmarks from the profile's Chromium ``Bookmarks`` JSON file.

    Returns an empty list when the file is missing or cannot be read; read
    failures are logged as warnings.
    """
    source = profile.path / "Bookmarks"
    if not source.exists():
        return []
    try:
        with open(source, "r", encoding="utf-8") as fh:
            roots = json.load(fh).get("roots", {})
        collected: list[dict] = []
        # Walk the three standard roots, in this order, when present.
        for root_name in ("bookmark_bar", "other", "synced"):
            if root_name not in roots:
                continue
            _walk_chromium_bookmarks(roots[root_name], collected)
        return collected
    except Exception as e:
        log.warning(f"Failed to read Bookmarks for {profile.browser}/{profile.name}: {e}")
        return []
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def _walk_safari_bookmarks(node: dict, out: list[dict]):
    """Depth-first walk of a Safari bookmark plist node, collecting leaves.

    Leaf nodes that pass ``_should_skip`` are appended to ``out`` as
    ``{"url": ..., "title": ...}``; any "Children" are then visited.
    """
    if node.get("WebBookmarkType", "") == "WebBookmarkTypeLeaf":
        link = node.get("URLString", "")
        uri_info = node.get("URIDictionary", {})
        label = uri_info.get("title", "")
        if link and not _should_skip(link):
            out.append({"url": link, "title": label})
    children = node.get("Children", [])
    for sub in children:
        _walk_safari_bookmarks(sub, out)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _safari_bookmarks(profile: BrowserProfile) -> list[dict]:
    """Load bookmarks from the profile's Safari ``Bookmarks.plist``.

    Returns an empty list when the file is missing or cannot be read; read
    failures are logged as warnings.
    """
    source = profile.path / "Bookmarks.plist"
    if not source.exists():
        return []
    try:
        with open(source, "rb") as fh:
            tree = plistlib.load(fh)
        collected: list[dict] = []
        _walk_safari_bookmarks(tree, collected)
    except Exception as e:
        log.warning(f"Failed to read Safari Bookmarks.plist: {e}")
        return []
    return collected
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _firefox_bookmarks(profile: BrowserProfile) -> list[dict]:
    """Read bookmarks from Firefox places.sqlite.

    The database is read from a temporary copy made by ``copy_db``. The
    connection is closed and the temporary directory removed on every exit
    path, including when the query fails.

    Returns:
        A list of ``{"url": ..., "title": ...}`` dicts; empty when the
        database cannot be copied or read (a warning is logged on failure).
    """
    tmp = copy_db(profile.path / "places.sqlite")
    if not tmp:
        return []
    conn = None
    try:
        conn = sqlite3.connect(f"file:{tmp}?mode=ro", uri=True)
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            "SELECT b.title, p.url FROM moz_bookmarks b "
            "JOIN moz_places p ON b.fk = p.id "
            "WHERE b.type = 1"
        ).fetchall()
        out: list[dict] = []
        for row in rows:
            # NULL columns come back as None; normalise to empty strings.
            url = row["url"] or ""
            title = row["title"] or ""
            if url and not _should_skip(url):
                out.append({"url": url, "title": title})
        return out
    except Exception as e:
        log.warning(f"Failed to read Firefox bookmarks: {e}")
        return []
    finally:
        # Close before deleting the copy so the file handle is released.
        if conn is not None:
            conn.close()
        shutil.rmtree(tmp.parent, ignore_errors=True)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def ingest_bookmarks(mem: MemoryDB, profiles: list[BrowserProfile]):
    """Collect bookmarks from every profile and upsert them as memories.

    Bookmarks are de-duplicated by URL (first title wins). A bookmark whose
    domain is in ``SERVICE_NAMES`` is upserted under ``tool:<service>``;
    any other domain is upserted under ``bookmark:<domain>`` with inferred
    tags and confidence 0.6.
    """
    # Pick the reader by browser family; unknown browsers are ignored.
    readers = {name: _chromium_bookmarks for name in ("arc", "chrome", "brave", "edge")}
    readers["safari"] = _safari_bookmarks
    readers["firefox"] = _firefox_bookmarks

    collected: list[dict] = []
    for profile in profiles:
        reader = readers.get(profile.browser)
        if reader is not None:
            collected.extend(reader(profile))

    # Deduplicate by URL (keep first title seen)
    seen_urls: dict[str, str] = {}
    for entry in collected:
        link = entry["url"]
        if link in seen_urls:
            continue
        seen_urls[link] = entry["title"]

    known_count = 0
    unknown_count = 0

    for url, title in seen_urls.items():
        d = domain(url)
        if not d:
            continue

        if d not in SERVICE_NAMES:
            # Unknown domain: create bookmark entry
            mem.upsert(f"bookmark:{d}", title or d, _infer_tags(d), confidence=0.6, source=f"bookmark:{url}")
            unknown_count += 1
            continue

        # Known service: boost the tool entry
        service = SERVICE_NAMES[d]
        mem.upsert(f"tool:{service}", title or service, ["account", "tool"], source=f"bookmark:{d}")
        known_count += 1

    log.info(f" Bookmarks: {len(seen_urls)} unique, {known_count} known services, {unknown_count} new domains")
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""Detect installed browsers and their profiles."""
|
|
2
|
+
|
|
3
|
+
import shutil
|
|
4
|
+
import sqlite3
|
|
5
|
+
import tempfile
|
|
6
|
+
import logging
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Optional, Set
|
|
10
|
+
from urllib.parse import urlparse
|
|
11
|
+
|
|
12
|
+
log = logging.getLogger(__name__)
|
|
13
|
+
|
|
14
|
+
APP_SUPPORT = Path.home() / "Library" / "Application Support"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class BrowserProfile:
    """A single browser profile directory found on disk.

    Attributes:
        browser: "arc", "chrome", "safari", "firefox", "brave", "edge"
        name: "Default", "Profile 1", etc.
        path: Full path to the profile directory
    """

    browser: str
    name: str
    path: Path
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _chromium_profiles(browser: str, base: Path) -> list[BrowserProfile]:
    """List the Chromium-style profiles (Default, Profile 1, ...) under ``base``.

    A directory counts as a profile when it is named "Default" or starts
    with "Profile " and contains a ``History`` or ``IndexedDB`` entry. If
    none qualify, an existing ``Default`` directory is used as a fallback.
    """
    if not base.exists():
        return []

    found = [
        BrowserProfile(browser=browser, name=entry.name, path=entry)
        for entry in sorted(base.iterdir())
        if entry.is_dir()
        and (entry.name == "Default" or entry.name.startswith("Profile "))
        and ((entry / "History").exists() or (entry / "IndexedDB").exists())
    ]

    if not found:
        fallback = base / "Default"
        if fallback.exists():
            found.append(BrowserProfile(browser=browser, name="Default", path=fallback))

    return found
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def detect_browsers(allowed: Optional[Set[str]] = None) -> list[BrowserProfile]:
    """Find browser profiles on disk, optionally limited to ``allowed`` names.

    Chromium-family browsers are scanned first (arc, chrome, brave, edge),
    then Safari, then Firefox. A falsy ``allowed`` means no filtering.
    """
    def wanted(browser_name: str) -> bool:
        return not allowed or browser_name in allowed

    chromium_bases = {
        "arc": APP_SUPPORT / "Arc" / "User Data",
        "chrome": APP_SUPPORT / "Google" / "Chrome",
        "brave": APP_SUPPORT / "BraveSoftware" / "Brave-Browser",
        "edge": APP_SUPPORT / "Microsoft Edge",
    }

    found: list[BrowserProfile] = []
    for browser_name, base_dir in chromium_bases.items():
        if wanted(browser_name):
            found.extend(_chromium_profiles(browser_name, base_dir))

    # Safari keeps a single profile directory under ~/Library/Safari.
    if wanted("safari"):
        safari_dir = Path.home() / "Library" / "Safari"
        if safari_dir.exists():
            found.append(BrowserProfile(browser="safari", name="Default", path=safari_dir))

    # Firefox: every profile directory that contains places.sqlite.
    if wanted("firefox"):
        firefox_base = APP_SUPPORT / "Firefox" / "Profiles"
        if firefox_base.exists():
            for entry in sorted(firefox_base.iterdir()):
                if entry.is_dir() and (entry / "places.sqlite").exists():
                    found.append(BrowserProfile(browser="firefox", name=entry.name, path=entry))

    log.info(f"Detected {len(found)} browser profiles: {[(p.browser, p.name) for p in found]}")
    return found
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def copy_db(src: Path) -> Optional[Path]:
    """Copy a SQLite DB to temp dir to avoid browser locks.

    The ``-wal`` and ``-shm`` sidecar files are copied too when they exist.
    On success the caller owns the temporary directory (``result.parent``)
    and is responsible for removing it. If the copy fails for any reason,
    the temporary directory is removed here so nothing is left behind.

    Args:
        src: Path to the database file.

    Returns:
        Path to the copy, or None when ``src`` does not exist or reading it
        is not permitted. Other errors propagate to the caller.
    """
    if not src.exists():
        return None
    tmp: Optional[Path] = None
    copied = False
    try:
        tmp = Path(tempfile.mkdtemp(prefix="ai_browser_profile_"))
        dst = tmp / src.name
        shutil.copy2(src, dst)
        for suffix in ["-wal", "-shm"]:
            wal = src.parent / (src.name + suffix)
            if wal.exists():
                shutil.copy2(wal, tmp / (src.name + suffix))
        copied = True
        return dst
    except PermissionError:
        log.warning(f"Permission denied reading {src} — grant Full Disk Access or skip")
        return None
    finally:
        # Do not leak a (possibly partial) copy when anything went wrong.
        if not copied and tmp is not None:
            shutil.rmtree(tmp, ignore_errors=True)
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def domain(url: str) -> str:
    """Return the network-location part of ``url``, or "" if parsing raises."""
    try:
        parts = urlparse(url)
    except Exception:
        return ""
    return parts.netloc
|