evolutiondb-browser-sync 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- browser_sync/__init__.py +10 -0
- browser_sync/__main__.py +4 -0
- browser_sync/extract.py +212 -0
- browser_sync/scanner.py +223 -0
- browser_sync/state.py +130 -0
- browser_sync/sync.py +228 -0
- evolutiondb_browser_sync-0.1.0.dist-info/METADATA +29 -0
- evolutiondb_browser_sync-0.1.0.dist-info/RECORD +11 -0
- evolutiondb_browser_sync-0.1.0.dist-info/WHEEL +5 -0
- evolutiondb_browser_sync-0.1.0.dist-info/entry_points.txt +2 -0
- evolutiondb_browser_sync-0.1.0.dist-info/top_level.txt +1 -0
browser_sync/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""evolutiondb-browser-sync — read local browser history (Chrome,
|
|
2
|
+
Edge, Firefox, Brave, Arc, Vivaldi) directly from each browser's
|
|
3
|
+
on-disk SQLite store and pour the entries into EvolutionDB
|
|
4
|
+
long-term memory.
|
|
5
|
+
|
|
6
|
+
No OAuth, no extension install, no remote API. The browser's own
|
|
7
|
+
data lives in a local file the user already owns, so the sync
|
|
8
|
+
just opens it read-only and copies the rows out."""
|
|
9
|
+
|
|
10
|
+
__version__ = "0.1.0"
|
browser_sync/__main__.py
ADDED
browser_sync/extract.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Open a browser history SQLite store in read-only mode and yield
|
|
3
|
+
normalised visit records. Two on-disk schemas exist in the wild —
|
|
4
|
+
the Chromium family (Chrome, Edge, Brave, Arc, Vivaldi, Opera) and
|
|
5
|
+
Mozilla Firefox. Each gets its own SELECT; the records they
|
|
6
|
+
produce share a single shape so the sync loop downstream does not
|
|
7
|
+
care which browser they came from.
|
|
8
|
+
|
|
9
|
+
Locking
|
|
10
|
+
-------
|
|
11
|
+
A running browser holds an exclusive WAL lock on its History /
|
|
12
|
+
places.sqlite file. We snapshot to a temp file with `shutil.copy`
|
|
13
|
+
first; on Windows that step can race a busy browser and raise
|
|
14
|
+
`PermissionError`, in which case we fall back to SQLite's URI
|
|
15
|
+
`mode=ro&immutable=1` open which often succeeds on the original.
|
|
16
|
+
"""
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import hashlib
|
|
20
|
+
import shutil
|
|
21
|
+
import sqlite3
|
|
22
|
+
import sys
|
|
23
|
+
import tempfile
|
|
24
|
+
from datetime import datetime, timedelta, timezone
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import Dict, Iterator, Optional
|
|
27
|
+
|
|
28
|
+
from .scanner import CHROMIUM, FIREFOX, Profile
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
_CHROME_EPOCH = datetime(1601, 1, 1, tzinfo=timezone.utc)
|
|
32
|
+
_SKIP_PREFIXES = (
|
|
33
|
+
"chrome://", "chrome-extension://", "chrome-search://",
|
|
34
|
+
"chrome-untrusted://", "edge://", "extension://",
|
|
35
|
+
"moz-extension://", "about:", "view-source:",
|
|
36
|
+
"file://", "data:", "javascript:", "blob:",
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ---------------------------------------------------------------- #
|
|
41
|
+
# Snapshot + connect #
|
|
42
|
+
# ---------------------------------------------------------------- #
|
|
43
|
+
def _snapshot(src: Path) -> Path:
|
|
44
|
+
tmp = Path(tempfile.mkstemp(suffix=".db",
|
|
45
|
+
prefix="evosql-bs-")[1])
|
|
46
|
+
try:
|
|
47
|
+
shutil.copy(src, tmp)
|
|
48
|
+
# WAL sidecars carry uncommitted history; copying them too
|
|
49
|
+
# avoids "no such table: visits" on freshly-opened browsers.
|
|
50
|
+
for sfx in ("-wal", "-shm"):
|
|
51
|
+
side = src.parent / (src.name + sfx)
|
|
52
|
+
if side.is_file():
|
|
53
|
+
try:
|
|
54
|
+
shutil.copy(side, tmp.parent / (tmp.name + sfx))
|
|
55
|
+
except OSError:
|
|
56
|
+
pass
|
|
57
|
+
return tmp
|
|
58
|
+
except PermissionError:
|
|
59
|
+
# Windows + browser exclusively locked. Cleanup, signal
|
|
60
|
+
# caller to fall back to URI immutable open of the original.
|
|
61
|
+
try:
|
|
62
|
+
tmp.unlink(missing_ok=True)
|
|
63
|
+
except Exception:
|
|
64
|
+
pass
|
|
65
|
+
raise
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _connect(src: Path) -> sqlite3.Connection:
|
|
69
|
+
try:
|
|
70
|
+
snap = _snapshot(src)
|
|
71
|
+
return sqlite3.connect(snap)
|
|
72
|
+
except PermissionError:
|
|
73
|
+
# Last-resort: read the live file in immutable mode. Will
|
|
74
|
+
# surface uncommitted rows when -wal exists, but at least
|
|
75
|
+
# surfaces something on busy Windows hosts.
|
|
76
|
+
return sqlite3.connect(
|
|
77
|
+
f"file:{src}?mode=ro&immutable=1", uri=True)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
# ---------------------------------------------------------------- #
|
|
81
|
+
# Timestamp normalisation #
|
|
82
|
+
# ---------------------------------------------------------------- #
|
|
83
|
+
def _chrome_us_to_iso(us: int) -> Optional[str]:
|
|
84
|
+
if not us or us < 0:
|
|
85
|
+
return None
|
|
86
|
+
try:
|
|
87
|
+
dt = _CHROME_EPOCH + timedelta(microseconds=int(us))
|
|
88
|
+
return dt.isoformat(timespec="seconds").replace("+00:00", "Z")
|
|
89
|
+
except (OverflowError, OSError):
|
|
90
|
+
return None
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _firefox_us_to_iso(us: int) -> Optional[str]:
|
|
94
|
+
if not us or us < 0:
|
|
95
|
+
return None
|
|
96
|
+
try:
|
|
97
|
+
dt = datetime.fromtimestamp(int(us) / 1_000_000,
|
|
98
|
+
tz=timezone.utc)
|
|
99
|
+
return dt.isoformat(timespec="seconds").replace("+00:00", "Z")
|
|
100
|
+
except (OverflowError, OSError, ValueError):
|
|
101
|
+
return None
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# ---------------------------------------------------------------- #
|
|
105
|
+
# Filtering #
|
|
106
|
+
# ---------------------------------------------------------------- #
|
|
107
|
+
def _keep(url: str) -> bool:
|
|
108
|
+
if not url:
|
|
109
|
+
return False
|
|
110
|
+
lower = url.lower()
|
|
111
|
+
if lower.startswith(_SKIP_PREFIXES):
|
|
112
|
+
return False
|
|
113
|
+
return True
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# ---------------------------------------------------------------- #
|
|
117
|
+
# Record builder #
|
|
118
|
+
# ---------------------------------------------------------------- #
|
|
119
|
+
def _short_url(url: str, n: int = 80) -> str:
|
|
120
|
+
return url if len(url) <= n else url[: n - 1] + "…"
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _build_record(profile: Profile, url: str, title: str,
                  visit_count: int, last_iso: str) -> Dict:
    """Assemble the normalised visit record for one history row.

    `profile` supplies the `browser`, `family` and `profile_id`
    fields. A blank or missing `title` becomes "(no title)" and a
    missing `visit_count` becomes 0. `url_hash` is the first 16 hex
    digits of the SHA-1 of the UTF-8 URL.
    """
    title = (title or "").strip() or "(no title)"
    # Normalise once so the sentence and the `visit_count` field agree;
    # a NULL count used to render as "None visits" in the sentence.
    visits = int(visit_count or 0)
    plural = "" if visits == 1 else "s"
    fact = (f"Visited [{profile.browser}] \"{title}\" "
            f"({_short_url(url)}) — last on {last_iso}, "
            f"{visits} visit{plural}")
    url_hash = hashlib.sha1(url.encode("utf-8")).hexdigest()[:16]
    return {
        "fact": fact,
        "source": "browser",
        "kind": "visit",
        "browser": profile.browser,
        "browser_family": profile.family,
        "profile": profile.profile_id,
        "url": url,
        "title": title,
        "visit_count": visits,
        "last_visited_at": last_iso,
        "url_hash": url_hash,
        "tags": ["browser", "history", profile.browser],
    }
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# ---------------------------------------------------------------- #
|
|
147
|
+
# Per-family SELECT #
|
|
148
|
+
# ---------------------------------------------------------------- #
|
|
149
|
+
def _iter_chromium(profile: Profile,
                   since_iso: Optional[str]) -> Iterator[Dict]:
    """Yield visit records from a Chromium-family `History` store.

    Reads `urls(url, title, visit_count, last_visit_time)`, where
    `last_visit_time` is Chrome-epoch microseconds. Rows are dropped
    when `_keep` rejects the URL, the timestamp cannot be converted,
    or the converted timestamp sorts before `since_iso`. A
    `sqlite3.DatabaseError` raised by the SELECT is reported on stderr
    and ends the iteration without yielding anything.
    """
    query = ("SELECT url, title, visit_count, last_visit_time "
             "FROM urls WHERE last_visit_time > 0")
    db = _connect(profile.history_path)
    try:
        try:
            rows = db.execute(query)
        except sqlite3.DatabaseError as err:
            print(f"[browser-sync] {profile.profile_id} "
                  f"DB error: {err}", file=sys.stderr, flush=True)
            return
        for url, title, visit_count, raw_ts in rows:
            if not _keep(url):
                continue
            stamp = _chrome_us_to_iso(raw_ts)
            # Both strings share one fixed UTC layout, so plain string
            # comparison orders them chronologically.
            if stamp is None or (since_iso and stamp < since_iso):
                continue
            yield _build_record(profile, url, title, visit_count, stamp)
    finally:
        db.close()
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _iter_firefox(profile: Profile,
                  since_iso: Optional[str]) -> Iterator[Dict]:
    """Yield visit records from a Firefox `places.sqlite` store.

    Reads `moz_places(url, title, visit_count, last_visit_date)`,
    where `last_visit_date` is Unix microseconds. Rows are dropped
    when `_keep` rejects the URL, the timestamp cannot be converted,
    or the converted timestamp sorts before `since_iso`. A
    `sqlite3.DatabaseError` raised by the SELECT is reported on stderr
    and ends the iteration without yielding anything.
    """
    query = ("SELECT url, title, visit_count, last_visit_date "
             "FROM moz_places WHERE last_visit_date IS NOT NULL")
    db = _connect(profile.history_path)
    try:
        try:
            rows = db.execute(query)
        except sqlite3.DatabaseError as err:
            print(f"[browser-sync] {profile.profile_id} "
                  f"DB error: {err}", file=sys.stderr, flush=True)
            return
        for url, title, visit_count, raw_ts in rows:
            if not _keep(url):
                continue
            stamp = _firefox_us_to_iso(raw_ts)
            # Both strings share one fixed UTC layout, so plain string
            # comparison orders them chronologically.
            if stamp is None or (since_iso and stamp < since_iso):
                continue
            yield _build_record(profile, url, title, visit_count, stamp)
    finally:
        db.close()
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def iter_visits(profile: Profile,
                since_iso: Optional[str] = None) -> Iterator[Dict]:
    """Yield normalised visit records for `profile`.

    Dispatches on `profile.family` to the Chromium or Firefox reader
    and passes `since_iso` (a UTC ISO 8601 string, or None for no
    floor) through. A profile of any other family yields nothing.
    """
    family = profile.family
    if family == CHROMIUM:
        reader = _iter_chromium
    elif family == FIREFOX:
        reader = _iter_firefox
    else:
        return
    yield from reader(profile, since_iso)
|
browser_sync/scanner.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Enumerate every browser profile the current user owns across the
|
|
3
|
+
three desktop OSes. Yields one (browser_family, browser_label,
|
|
4
|
+
profile_id, history_path) tuple per discovered profile.
|
|
5
|
+
|
|
6
|
+
Supports
|
|
7
|
+
--------
|
|
8
|
+
Chromium family Chrome, Edge, Brave, Arc, Vivaldi, Chromium
|
|
9
|
+
Mozilla family Firefox (incl. Developer / Nightly / ESR)
|
|
10
|
+
|
|
11
|
+
Cross-OS path handling
|
|
12
|
+
----------------------
|
|
13
|
+
macOS ~/Library/Application Support/<vendor>/...
|
|
14
|
+
Linux ~/.config/<vendor>/... + snap + flatpak sandbox paths
|
|
15
|
+
Windows %LOCALAPPDATA% or %APPDATA%\\<vendor>\\...
|
|
16
|
+
|
|
17
|
+
Firefox is special: the profile directory has a random id; the
|
|
18
|
+
active profile names live in profiles.ini next to the directory.
|
|
19
|
+
We enumerate every *.default* directory plus whatever profiles.ini
|
|
20
|
+
lists so users with multiple profiles all flow through.
|
|
21
|
+
"""
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
import configparser
|
|
25
|
+
import os
|
|
26
|
+
import sys
|
|
27
|
+
from dataclasses import dataclass
|
|
28
|
+
from pathlib import Path
|
|
29
|
+
from typing import Iterator, List
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Family tags carried in `Profile.family`.
CHROMIUM, FIREFOX = "chromium", "firefox"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@dataclass(frozen=True)
class Profile:
    """One discovered browser profile and the location of its history
    store. Instances are immutable."""

    # "chromium" | "firefox"
    family: str
    # "chrome" | "edge" | "brave" | "arc" | "vivaldi" | "firefox"
    browser: str
    # "Default", "Profile 1", or firefox random id
    profile_id: str
    history_path: Path
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# ---------------------------------------------------------------- #
|
|
45
|
+
# Per-platform vendor roots #
|
|
46
|
+
# ---------------------------------------------------------------- #
|
|
47
|
+
def _chromium_roots() -> List[tuple]:
|
|
48
|
+
"""Return list of (browser_id, browser_label, user_data_root)
|
|
49
|
+
candidates for every Chromium-family browser known to the
|
|
50
|
+
current OS."""
|
|
51
|
+
home = Path.home()
|
|
52
|
+
p = sys.platform
|
|
53
|
+
out: List[tuple] = []
|
|
54
|
+
|
|
55
|
+
if p == "darwin":
|
|
56
|
+
base = home / "Library/Application Support"
|
|
57
|
+
out += [
|
|
58
|
+
("chrome", "Chrome", base / "Google/Chrome"),
|
|
59
|
+
("chrome", "Chrome Beta", base / "Google/Chrome Beta"),
|
|
60
|
+
("chrome", "Chromium", base / "Chromium"),
|
|
61
|
+
("edge", "Edge", base / "Microsoft Edge"),
|
|
62
|
+
("brave", "Brave", base / "BraveSoftware/Brave-Browser"),
|
|
63
|
+
("arc", "Arc", base / "Arc/User Data"),
|
|
64
|
+
("vivaldi", "Vivaldi", base / "Vivaldi"),
|
|
65
|
+
("opera", "Opera", base / "com.operasoftware.Opera"),
|
|
66
|
+
]
|
|
67
|
+
elif p.startswith("linux"):
|
|
68
|
+
cfg = home / ".config"
|
|
69
|
+
out += [
|
|
70
|
+
("chrome", "Chrome", cfg / "google-chrome"),
|
|
71
|
+
("chrome", "Chrome Beta", cfg / "google-chrome-beta"),
|
|
72
|
+
("chrome", "Chromium", cfg / "chromium"),
|
|
73
|
+
("edge", "Edge", cfg / "microsoft-edge"),
|
|
74
|
+
("brave", "Brave",
|
|
75
|
+
cfg / "BraveSoftware/Brave-Browser"),
|
|
76
|
+
("vivaldi", "Vivaldi", cfg / "vivaldi"),
|
|
77
|
+
("opera", "Opera", cfg / "opera"),
|
|
78
|
+
]
|
|
79
|
+
# snap + flatpak sandboxed paths
|
|
80
|
+
snap = home / "snap"
|
|
81
|
+
out += [
|
|
82
|
+
("chrome", "Chromium (snap)",
|
|
83
|
+
snap / "chromium/common/chromium"),
|
|
84
|
+
]
|
|
85
|
+
flat = home / ".var/app"
|
|
86
|
+
out += [
|
|
87
|
+
("chrome", "Chromium (flatpak)",
|
|
88
|
+
flat / "org.chromium.Chromium/config/chromium"),
|
|
89
|
+
("brave", "Brave (flatpak)",
|
|
90
|
+
flat / "com.brave.Browser/config/BraveSoftware/"
|
|
91
|
+
"Brave-Browser"),
|
|
92
|
+
]
|
|
93
|
+
elif p == "win32":
|
|
94
|
+
local = Path(os.environ.get("LOCALAPPDATA", str(home)))
|
|
95
|
+
roaming = Path(os.environ.get("APPDATA", str(home)))
|
|
96
|
+
out += [
|
|
97
|
+
("chrome", "Chrome",
|
|
98
|
+
local / "Google/Chrome/User Data"),
|
|
99
|
+
("chrome", "Chromium",
|
|
100
|
+
local / "Chromium/User Data"),
|
|
101
|
+
("edge", "Edge",
|
|
102
|
+
local / "Microsoft/Edge/User Data"),
|
|
103
|
+
("brave", "Brave",
|
|
104
|
+
local / "BraveSoftware/Brave-Browser/User Data"),
|
|
105
|
+
("vivaldi", "Vivaldi",
|
|
106
|
+
local / "Vivaldi/User Data"),
|
|
107
|
+
("opera", "Opera",
|
|
108
|
+
roaming / "Opera Software/Opera Stable"),
|
|
109
|
+
]
|
|
110
|
+
_ = roaming # quiet linter; reserved for future use
|
|
111
|
+
return out
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _firefox_roots() -> List[Path]:
|
|
115
|
+
"""Return list of Firefox-family base directories (Mozilla/Firefox
|
|
116
|
+
or equivalent) for the current OS. The actual profile lives one
|
|
117
|
+
or two levels deeper."""
|
|
118
|
+
home = Path.home()
|
|
119
|
+
p = sys.platform
|
|
120
|
+
if p == "darwin":
|
|
121
|
+
base = home / "Library/Application Support"
|
|
122
|
+
return [
|
|
123
|
+
base / "Firefox",
|
|
124
|
+
base / "Firefox Developer Edition",
|
|
125
|
+
base / "Firefox Nightly",
|
|
126
|
+
]
|
|
127
|
+
if p.startswith("linux"):
|
|
128
|
+
out = [
|
|
129
|
+
home / ".mozilla/firefox",
|
|
130
|
+
home / "snap/firefox/common/.mozilla/firefox",
|
|
131
|
+
home / ".var/app/org.mozilla.firefox/.mozilla/firefox",
|
|
132
|
+
]
|
|
133
|
+
return out
|
|
134
|
+
if p == "win32":
|
|
135
|
+
roaming = Path(os.environ.get("APPDATA", str(home)))
|
|
136
|
+
return [
|
|
137
|
+
roaming / "Mozilla/Firefox",
|
|
138
|
+
roaming / "Mozilla/Firefox Developer Edition",
|
|
139
|
+
]
|
|
140
|
+
return []
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
# ---------------------------------------------------------------- #
|
|
144
|
+
# Discovery #
|
|
145
|
+
# ---------------------------------------------------------------- #
|
|
146
|
+
def _iter_chromium() -> Iterator[Profile]:
    """Yield a `Profile` for every Chromium-family profile directory
    that contains a `History` file.

    Only sub-directories named "Default" or starting with "Profile"
    are considered. A vendor root that cannot be listed is reported on
    stderr and skipped so the remaining browsers are still discovered.
    """
    for browser_id, browser_label, root in _chromium_roots():
        if not root.is_dir():
            continue
        try:
            candidates = sorted(root.iterdir())
        except OSError as exc:
            # e.g. a sandbox directory we may not read; one unreadable
            # root must not abort discovery for every other browser.
            print(f"[browser-sync] cannot list {root}: {exc}",
                  file=sys.stderr, flush=True)
            continue
        for prof_dir in candidates:
            if not prof_dir.is_dir():
                continue
            name = prof_dir.name
            if name != "Default" and not name.startswith("Profile"):
                continue
            hist = prof_dir / "History"
            if hist.is_file():
                yield Profile(
                    family=CHROMIUM,
                    browser=browser_id,
                    profile_id=f"{browser_label}/{name}",
                    history_path=hist,
                )
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _iter_firefox() -> Iterator[Profile]:
    """Yield a `Profile` for every Firefox profile directory that
    contains a `places.sqlite` file.

    Candidate directories come from three places: the `Path` entries
    of `profiles.ini`, every `*.default*` directory directly under the
    root, and every directory under `<root>/Profiles`. Candidates are
    de-duplicated by resolved path and yielded in sorted order.
    """
    for root in _firefox_roots():
        if not root.is_dir():
            continue
        seen: set = set()

        # profiles.ini tells us which directories Firefox itself
        # treats as profiles.
        ini = root / "profiles.ini"
        if ini.is_file():
            try:
                # interpolation=None: a literal '%' in a profile path
                # must not be read as ConfigParser interpolation syntax.
                cfg = configparser.ConfigParser(interpolation=None)
                cfg.read(ini, encoding="utf-8")
                for section in cfg.sections():
                    if not section.startswith("Profile"):
                        continue
                    rel = cfg.get(section, "Path", fallback="")
                    if not rel:
                        continue
                    is_relative = cfg.get(section, "IsRelative",
                                          fallback="1") == "1"
                    p = (root / rel) if is_relative else Path(rel)
                    if p.is_dir():
                        seen.add(p.resolve())
            except (configparser.Error, OSError, UnicodeDecodeError):
                # Unreadable, malformed or non-UTF-8 ini: rely on the
                # directory scan below instead.
                pass

        # Directory scan, in case the ini is missing or stale.
        try:
            seen.update(d.resolve() for d in root.glob("*.default*")
                        if d.is_dir())
            profiles_root = root / "Profiles"
            if profiles_root.is_dir():
                seen.update(d.resolve() for d in profiles_root.iterdir()
                            if d.is_dir())
        except OSError:
            # Keep whatever was collected before the listing failed.
            pass

        for prof_dir in sorted(seen):
            hist = prof_dir / "places.sqlite"
            if hist.is_file():
                yield Profile(
                    family=FIREFOX,
                    browser="firefox",
                    profile_id=f"Firefox/{prof_dir.name}",
                    history_path=hist,
                )
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def discover_profiles(
        browser_filter: List[str] = None) -> List[Profile]:
    """Return every profile visible on this machine, Chromium-family
    profiles first and Firefox profiles after them.

    When `browser_filter` is non-empty, only profiles whose `browser`
    id matches one of its entries (stripped and lower-cased, e.g.
    ['chrome', 'firefox']) are kept.
    """
    found: List[Profile] = [*_iter_chromium(), *_iter_firefox()]
    if not browser_filter:
        return found
    wanted = {item.strip().lower()
              for item in browser_filter if item.strip()}
    return [prof for prof in found if prof.browser in wanted]
|
browser_sync/state.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""Memory backend for browser-sync. Same template as the sibling
|
|
2
|
+
connector packages — read-write to EvolutionDB via psycopg over the
|
|
3
|
+
PostgreSQL wire protocol. The browser sync stores one row per
|
|
4
|
+
unique URL per profile (keyed by sha1 of the URL) and one
|
|
5
|
+
per-profile watermark holding the ISO timestamp of the most recent
|
|
6
|
+
visit it has already imported."""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import sys
|
|
11
|
+
import time
|
|
12
|
+
from typing import Dict, Optional
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _e(s: str) -> str:
|
|
16
|
+
if not isinstance(s, str):
|
|
17
|
+
s = str(s)
|
|
18
|
+
s = s.replace("\r", " ").replace("\n", " ").replace("\t", " ")
|
|
19
|
+
return s.replace("'", "''")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _parse_value(raw):
|
|
23
|
+
if isinstance(raw, dict):
|
|
24
|
+
return raw
|
|
25
|
+
if not raw:
|
|
26
|
+
return {}
|
|
27
|
+
try:
|
|
28
|
+
return json.loads(raw)
|
|
29
|
+
except (TypeError, ValueError):
|
|
30
|
+
return {}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class MemoryStore:
    """Read/write access to one EvolutionDB memory store via psycopg.

    Statements are built as text, with values passed through `_e`.
    The store name is interpolated verbatim as an identifier.
    NOTE(review): `store` is not escaped; it is assumed to come from
    trusted configuration — confirm before accepting it from elsewhere.

    The object can be used as a context manager; leaving the block
    closes the connection.
    """

    _RECONNECT_ATTEMPTS = 3
    _RECONNECT_BACKOFF_SEC = 0.5

    def __init__(self, host: str, port: int, user: str, password: str,
                 database: str, store: str, namespace: str):
        """Connect with autocommit on and make sure the store exists.

        Raises RuntimeError when psycopg is not installed; connection
        failures from `psycopg.connect` propagate unchanged.
        """
        try:
            import psycopg
        except ImportError as exc:
            raise RuntimeError(
                "evolutiondb-browser-sync requires psycopg. Install "
                "with `pip install psycopg[binary]>=3.1`.") from exc

        self.psycopg = psycopg
        self._conn_kwargs = dict(
            host=host, port=port, user=user, password=password,
            dbname=database, autocommit=True,
        )
        self.store = store
        self.namespace = namespace
        self.conn = self._connect()

        try:
            with self.conn.cursor() as cur:
                cur.execute(f"CREATE MEMORY STORE {self.store}")
        except psycopg.Error:
            # Best effort: presumably the store already exists. Only
            # database errors are ignored; anything else is a bug and
            # should surface.
            pass

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        self.close()

    def close(self) -> None:
        """Close the connection. Safe to call more than once; errors
        raised while closing are ignored."""
        conn = getattr(self, "conn", None)
        if conn is None:
            return
        try:
            conn.close()
        except Exception:
            pass

    def _connect(self):
        """Open a new connection with the stored parameters."""
        return self.psycopg.connect(**self._conn_kwargs)

    def _is_dead(self, exc: BaseException) -> bool:
        """True when `exc` is one of the psycopg error types this class
        treats as a lost connection."""
        return isinstance(exc, (self.psycopg.OperationalError,
                                self.psycopg.InterfaceError))

    def _retry(self, fn):
        """Run `fn(cursor)` and return its result, reconnecting after a
        lost connection.

        `fn` is tried up to `_RECONNECT_ATTEMPTS` times. Errors that
        `_is_dead` does not recognise propagate immediately. When every
        attempt fails, the most recent error is raised.
        """
        last = None
        for attempt in range(self._RECONNECT_ATTEMPTS):
            try:
                with self.conn.cursor() as cur:
                    return fn(cur)
            except Exception as exc:
                if not self._is_dead(exc):
                    raise
                last = exc
                print(f"[browser-sync] db connection lost "
                      f"(attempt {attempt + 1}): {exc}",
                      file=sys.stderr, flush=True)
                try:
                    self.conn.close()
                except Exception:
                    pass
                # Linear backoff, skipped after the final attempt.
                if attempt + 1 < self._RECONNECT_ATTEMPTS:
                    time.sleep(self._RECONNECT_BACKOFF_SEC *
                               (attempt + 1))
                try:
                    self.conn = self._connect()
                except Exception as reconn:
                    # Keep the closed connection; the next attempt will
                    # fail on it and come back here to reconnect.
                    last = reconn
                    continue
        raise last  # type: ignore[misc]

    # ---------- watermark (per-profile) ----------
    @staticmethod
    def _wm_key(profile_id: str) -> str:
        """Memory key holding the watermark of `profile_id`; slashes
        and spaces are replaced with underscores."""
        safe = profile_id.replace("/", "_").replace(" ", "_")
        return f"browser_state_{safe}"

    def get_watermark_iso(self, profile_id: str) -> Optional[str]:
        """Return the stored `last_visited_at` value for `profile_id`,
        or None when nothing usable is stored."""
        key = self._wm_key(profile_id)

        def run(cur):
            cur.execute(
                f"SELECT mem_value FROM __mem_{self.store} "
                f"WHERE mem_namespace = '{_e(self.namespace)}' "
                f"AND mem_key = '{_e(key)}'")
            rows = cur.fetchall()
            if not rows:
                return None
            v = _parse_value(rows[0][0]).get("last_visited_at")
            return str(v) if v else None
        return self._retry(run)

    def set_watermark_iso(self, profile_id: str,
                          last_iso: str) -> None:
        """Store `last_iso` as the watermark of `profile_id`, together
        with the current time as `saved_at`."""
        key = self._wm_key(profile_id)
        value = json.dumps({"last_visited_at": last_iso,
                            "saved_at": time.time()})
        self._retry(lambda cur: cur.execute(
            f"MEMORY PUT INTO {self.store} VALUES "
            f"('{_e(self.namespace)}','{_e(key)}','{_e(value)}')"))

    # ---------- records ----------
    def put_record(self, key: str, record: Dict) -> None:
        """Write `record`, serialised as JSON, under `key` in this
        store's namespace."""
        value = json.dumps(record, ensure_ascii=False)
        self._retry(lambda cur: cur.execute(
            f"MEMORY PUT INTO {self.store} VALUES "
            f"('{_e(self.namespace)}','{_e(key)}','{_e(value)}')"))
|
browser_sync/sync.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
"""
|
|
2
|
+
evolutiondb-browser-sync — daemon that snapshots local browser
|
|
3
|
+
history and pours visit records into the evolutiondb-memory store.
|
|
4
|
+
|
|
5
|
+
Modes
|
|
6
|
+
-----
|
|
7
|
+
--once one sync pass over every discovered profile,
|
|
8
|
+
then exit.
|
|
9
|
+
--interval SECONDS daemon mode. Each pass is incremental.
|
|
10
|
+
--since "30d" first-run wall-clock floor when the profile
|
|
11
|
+
has no stored watermark.
|
|
12
|
+
--browser chrome,edge,firefox
|
|
13
|
+
restrict to a subset of browsers.
|
|
14
|
+
--dry-run do everything except write to memory.
|
|
15
|
+
|
|
16
|
+
No --auth subcommand exists. Browser history lives in files the
|
|
17
|
+
user already owns, so there is no OAuth flow to start.
|
|
18
|
+
"""
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
import argparse
|
|
22
|
+
import json
|
|
23
|
+
import os
|
|
24
|
+
import re
|
|
25
|
+
import signal
|
|
26
|
+
import sys
|
|
27
|
+
import time
|
|
28
|
+
from datetime import datetime, timedelta, timezone
|
|
29
|
+
from pathlib import Path
|
|
30
|
+
from typing import Dict, List, Optional
|
|
31
|
+
|
|
32
|
+
from . import scanner as scan_mod
|
|
33
|
+
from . import extract as ext_mod
|
|
34
|
+
from . import state as state_mod
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# ---------------------------------------------------------------- #
|
|
38
|
+
# Config + .env loader (same shape as sibling syncs) #
|
|
39
|
+
# ---------------------------------------------------------------- #
|
|
40
|
+
def _load_dotenv(path: Path) -> None:
|
|
41
|
+
if not path.exists():
|
|
42
|
+
return
|
|
43
|
+
for raw in path.read_text(encoding="utf-8").splitlines():
|
|
44
|
+
line = raw.strip()
|
|
45
|
+
if not line or line.startswith("#") or "=" not in line:
|
|
46
|
+
continue
|
|
47
|
+
k, _, v = line.partition("=")
|
|
48
|
+
k, v = k.strip(), v.strip().strip('"').strip("'")
|
|
49
|
+
if v:
|
|
50
|
+
os.environ.setdefault(k, v)
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class Config:
    """Runtime settings, read from environment variables when the
    object is created. Each setting falls back to a built-in default;
    the port and poll interval are converted with `int()`."""

    def __init__(self):
        env = os.environ.get
        self.evosql_host = env("EVOSQL_HOST", "127.0.0.1")
        self.evosql_port = int(env("EVOSQL_PORT", "5433"))
        self.evosql_user = env("EVOSQL_USER", "admin")
        self.evosql_pass = env("EVOSQL_PASSWORD", "admin")
        self.evosql_db = env("EVOSQL_DATABASE", "evosql")
        self.user_id = env("MCP_USER_ID", "default_user")
        self.store = env("BROWSER_MEMORY_STORE", "mcp_mem")
        self.poll_secs = int(env("BROWSER_POLL_INTERVAL", "1800"))
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
_SINCE_RE = re.compile(r"^\s*(\d+)\s*([smhdw])\s*$", re.I)

# Unit letter -> timedelta keyword argument.
_SINCE_UNITS = {
    "s": "seconds",
    "m": "minutes",
    "h": "hours",
    "d": "days",
    "w": "weeks",
}


def parse_since(text: str) -> str:
    """Turn a relative window such as '1h' or '7d' into the UTC
    ISO 8601 timestamp (second resolution, "Z" suffix) that lies that
    far before now.

    Accepts a non-negative integer followed by one of s, m, h, d, w
    (case-insensitive, surrounding whitespace allowed). Raises
    ValueError for anything else, including windows too large to
    represent.
    """
    m = _SINCE_RE.match(text or "")
    if not m:
        raise ValueError(
            f"--since must be like '1h', '7d', got {text!r}")
    n, unit = int(m.group(1)), m.group(2).lower()
    try:
        # Build only the requested timedelta: constructing one per
        # unit overflowed on `weeks` for counts that are perfectly
        # valid as seconds.
        delta = timedelta(**{_SINCE_UNITS[unit]: n})
        floor = datetime.now(timezone.utc) - delta
    except OverflowError as exc:
        # Callers handle ValueError only; keep it a usage error.
        raise ValueError(
            f"--since value {text!r} is too far in the past") from exc
    return floor.isoformat(timespec="seconds").replace("+00:00", "Z")
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# ---------------------------------------------------------------- #
|
|
89
|
+
# Pass #
|
|
90
|
+
# ---------------------------------------------------------------- #
|
|
91
|
+
def _record_key(record: Dict) -> str:
|
|
92
|
+
"""Stable per (browser, profile, url) key so re-syncing the same
|
|
93
|
+
URL upserts the watermark / visit_count instead of multiplying
|
|
94
|
+
rows in the memory store."""
|
|
95
|
+
profile_slug = (record["profile"]
|
|
96
|
+
.replace("/", "_").replace(" ", "_"))
|
|
97
|
+
return (f"browser_visit_{record['browser']}_"
|
|
98
|
+
f"{profile_slug}_{record['url_hash']}")
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def sync_once(cfg: Config, *, since_iso: Optional[str],
              browser_filter: Optional[List[str]],
              dry_run: bool = False) -> Dict[str, int]:
    """Run one sync pass over every discovered profile.

    For each profile the floor is its stored watermark, or `since_iso`
    when there is none. Every record at or after the floor is written
    to the memory store, then the watermark is advanced to the newest
    `last_visited_at` seen. With `dry_run` nothing is read from or
    written to the store; records are only counted.

    A profile that fails mid-way is reported on stderr and counted
    under "errors"; its watermark is left untouched so the next pass
    retries it.

    Returns counters keyed "profiles", "visits", "skipped" and
    "errors". ("skipped" is initialised but never incremented here.)
    """
    profiles = scan_mod.discover_profiles(browser_filter)
    counters = {"profiles": 0, "visits": 0,
                "skipped": 0, "errors": 0}

    store: Optional[state_mod.MemoryStore] = None
    if not dry_run:
        store = state_mod.MemoryStore(
            host=cfg.evosql_host, port=cfg.evosql_port,
            user=cfg.evosql_user, password=cfg.evosql_pass,
            database=cfg.evosql_db, store=cfg.store,
            namespace=cfg.user_id,
        )

    try:
        for prof in profiles:
            counters["profiles"] += 1
            wm = (store.get_watermark_iso(prof.profile_id)
                  if store is not None else None)
            floor = wm or since_iso
            latest = floor or ""
            try:
                for record in ext_mod.iter_visits(prof, since_iso=floor):
                    key = _record_key(record)
                    if store is not None:
                        store.put_record(key, record)
                    counters["visits"] += 1
                    if record["last_visited_at"] > latest:
                        latest = record["last_visited_at"]
            except Exception as exc:  # noqa: BLE001
                print(f"[browser-sync] {prof.profile_id} failed: {exc}",
                      file=sys.stderr, flush=True)
                counters["errors"] += 1
                continue
            if store is not None and latest and latest != (wm or ""):
                store.set_watermark_iso(prof.profile_id, latest)
    finally:
        # A store is opened per pass; close its connection so daemon
        # mode does not leave one open connection behind per pass.
        if store is not None:
            try:
                store.conn.close()
            except Exception:  # noqa: BLE001
                pass

    return counters
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# ---------------------------------------------------------------- #
|
|
145
|
+
# Signals #
|
|
146
|
+
# ---------------------------------------------------------------- #
|
|
147
|
+
_stop = False
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _install_signal_handlers() -> None:
|
|
151
|
+
def _handler(_signum, _frame):
|
|
152
|
+
global _stop
|
|
153
|
+
_stop = True
|
|
154
|
+
print("[browser-sync] stop requested, finishing current pass",
|
|
155
|
+
file=sys.stderr, flush=True)
|
|
156
|
+
for s in (signal.SIGTERM, signal.SIGINT):
|
|
157
|
+
try:
|
|
158
|
+
signal.signal(s, _handler)
|
|
159
|
+
except (ValueError, OSError):
|
|
160
|
+
pass
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
# ---------------------------------------------------------------- #
|
|
164
|
+
# CLI #
|
|
165
|
+
# ---------------------------------------------------------------- #
|
|
166
|
+
def main(argv: Optional[List[str]] = None) -> int:
    """Command-line entry point; returns the process exit code.

    Exit codes: 0 on success (including `--list`), 2 for a usage or
    configuration error (bad `--since`, unreadable env file,
    non-numeric port or poll interval), 4 when a sync pass fails.

    Without `--interval` (or with `--once`) a single pass is run.
    Otherwise passes repeat every `--interval` seconds (minimum 60)
    until SIGTERM or SIGINT is received; the code of the last pass is
    returned.
    """
    parser = argparse.ArgumentParser(prog="evolutiondb-browser-sync",
        description="Sync browser history into EvolutionDB memory.")
    parser.add_argument("--once", action="store_true")
    parser.add_argument("--interval", type=int)
    parser.add_argument("--since", default="7d")
    parser.add_argument("--browser", default="",
        help="Comma list (chrome,edge,firefox,brave,arc,vivaldi,"
             "opera). Empty means every detected browser.")
    parser.add_argument("--list", action="store_true",
        help="Print every detected profile and exit.")
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument("--env-file", default=".env")
    args = parser.parse_args(argv)

    try:
        _load_dotenv(Path(args.env_file).expanduser())
    except (OSError, UnicodeError) as exc:
        # Unreadable or undecodable env file: a usage error, not a
        # traceback.
        print(f"error: cannot read {args.env_file}: {exc}",
              file=sys.stderr)
        return 2

    browser_filter = (
        [b for b in args.browser.split(",") if b.strip()]
        if args.browser else None)

    if args.list:
        profiles = scan_mod.discover_profiles(browser_filter)
        for p in profiles:
            print(f"{p.browser:10s} {p.profile_id:40s} "
                  f"{p.history_path}")
        if not profiles:
            print("(no profiles detected)")
        return 0

    try:
        since_iso = parse_since(args.since)
    except ValueError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 2

    try:
        cfg = Config()
    except ValueError as exc:
        # Config converts the port and poll interval with int().
        print(f"error: invalid configuration: {exc}", file=sys.stderr)
        return 2
    _install_signal_handlers()

    def run_pass() -> int:
        try:
            counts = sync_once(cfg, since_iso=since_iso,
                               browser_filter=browser_filter,
                               dry_run=args.dry_run)
            print(json.dumps({"ok": True, **counts}), flush=True)
            return 0
        except Exception as exc:  # noqa: BLE001
            print(json.dumps({"ok": False, "error": str(exc)}),
                  flush=True)
            return 4

    if args.once or not args.interval:
        return run_pass()

    interval = max(60, int(args.interval))
    rc = 0
    while not _stop:
        rc = run_pass()
        # Sleep in one-second slices so a stop request is honoured
        # promptly instead of after a full interval.
        for _ in range(interval):
            if _stop:
                break
            time.sleep(1)
    return rc
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: evolutiondb-browser-sync
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Sync browser history (Chrome, Edge, Firefox) into EvolutionDB long-term memory.
|
|
5
|
+
Author-email: alptekin topal <topal.alptekin@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/alptekin/evolutiondb
|
|
8
|
+
Project-URL: Repository, https://github.com/alptekin/evolutiondb
|
|
9
|
+
Project-URL: Source, https://github.com/alptekin/evolutiondb/tree/main/client/browser-sync
|
|
10
|
+
Project-URL: Issues, https://github.com/alptekin/evolutiondb/issues
|
|
11
|
+
Keywords: evolutiondb,browser,history,chrome,edge,firefox,long-term-memory,agent-memory,mcp
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Operating System :: MacOS
|
|
18
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
19
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
20
|
+
Classifier: Programming Language :: Python :: 3
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
25
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
26
|
+
Classifier: Topic :: Database
|
|
27
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Browsers
|
|
28
|
+
Requires-Python: >=3.9
|
|
29
|
+
Requires-Dist: psycopg[binary]>=3.1
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
browser_sync/__init__.py,sha256=Q_VVL-1Ove9j8DowqW0AYIa2QQbROunVMW0xq6MPv6g,413
|
|
2
|
+
browser_sync/__main__.py,sha256=8Ap-X_Be9dePdOFaAWS6BNHL2kg3FQHkMgQ6Hufk7jg,80
|
|
3
|
+
browser_sync/extract.py,sha256=10xCWtSADqBuRmjfNJldstnAw3ih_t3DAzTxIEc4yBQ,7972
|
|
4
|
+
browser_sync/scanner.py,sha256=xj_quelH4sJL4uxiA83XEPIZy4xeBSuDuns4aYuwQvI,8328
|
|
5
|
+
browser_sync/state.py,sha256=5BnSiTKwENvEyIUunFhCfUT7_4Q5Mv7gnOZBslEXUOY,4553
|
|
6
|
+
browser_sync/sync.py,sha256=JESNnblQSEB7IKwKma9AfH5Bi9th6FedZLJciEXPzBQ,8215
|
|
7
|
+
evolutiondb_browser_sync-0.1.0.dist-info/METADATA,sha256=k4lxKp2up8lSfdqu7KLs-62f7wyuSHWuP9KxsELP23Q,1408
|
|
8
|
+
evolutiondb_browser_sync-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
9
|
+
evolutiondb_browser_sync-0.1.0.dist-info/entry_points.txt,sha256=8CbFmjK0Ls5Usp86mI_VNCT2PHL3bmtJifBFZpLfc2M,72
|
|
10
|
+
evolutiondb_browser_sync-0.1.0.dist-info/top_level.txt,sha256=ZOT504CJKp9k-UKUagrC2S2-YZkSVCiGOfMzwbkcgFg,13
|
|
11
|
+
evolutiondb_browser_sync-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
browser_sync
|