evolutiondb-browser-sync 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/PKG-INFO +1 -1
- {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/browser_sync/extract.py +169 -2
- {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/evolutiondb_browser_sync.egg-info/PKG-INFO +1 -1
- {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/pyproject.toml +1 -1
- {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/browser_sync/__init__.py +0 -0
- {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/browser_sync/__main__.py +0 -0
- {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/browser_sync/scanner.py +0 -0
- {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/browser_sync/state.py +0 -0
- {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/browser_sync/sync.py +0 -0
- {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/evolutiondb_browser_sync.egg-info/SOURCES.txt +0 -0
- {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/evolutiondb_browser_sync.egg-info/dependency_links.txt +0 -0
- {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/evolutiondb_browser_sync.egg-info/entry_points.txt +0 -0
- {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/evolutiondb_browser_sync.egg-info/requires.txt +0 -0
- {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/evolutiondb_browser_sync.egg-info/top_level.txt +0 -0
- {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/setup.cfg +0 -0
|
@@ -23,11 +23,177 @@ import sys
|
|
|
23
23
|
import tempfile
|
|
24
24
|
from datetime import datetime, timedelta, timezone
|
|
25
25
|
from pathlib import Path
|
|
26
|
-
from typing import Dict, Iterator, Optional
|
|
26
|
+
from typing import Dict, Iterator, List, Optional
|
|
27
|
+
from urllib.parse import urlparse
|
|
27
28
|
|
|
28
29
|
from .scanner import CHROMIUM, FIREFOX, Profile
|
|
29
30
|
|
|
30
31
|
|
|
32
|
+
# ---------------------------------------------------------------- #
|
|
33
|
+
# Content tagging #
|
|
34
|
+
# ---------------------------------------------------------------- #
|
|
35
|
+
# A visit's `tags` field starts out generic ("browser", "history",
|
|
36
|
+
# "chrome"). _content_tags() inspects the URL host (and a little of
|
|
37
|
+
# the path, for YouTube) and adds topical labels so the catalog can
|
|
38
|
+
# answer questions like "what did I read about coding this week" or
|
|
39
|
+
# "which videos did I watch" without scanning every URL string.
|
|
40
|
+
#
|
|
41
|
+
# Lookups try successively shorter dot-suffixes of the host; this is
|
|
42
|
+
# cheaper than parsing tld registries and catches the long tail of
|
|
43
|
+
# subdomain variants (e.g. m.youtube.com under youtube.com) without false
|
|
44
|
+
# positives.
|
|
45
|
+
|
|
46
|
+
# Maps a bare hostname to the topical tags attached to visits on that host.
#
# Keys MUST be plain hostnames (no scheme, port or path): _content_tags()
# looks entries up using dot-separated suffixes of the URL's host only, so
# a key such as "google.com/maps" could never match and has been dropped.
# Path-specific rules belong in _content_tags(), not in this table.
_DOMAIN_TAGS: Dict[str, List[str]] = {
    # Video
    "youtube.com": ["youtube", "video"],
    "youtu.be": ["youtube", "video"],
    "vimeo.com": ["video"],
    "netflix.com": ["video", "streaming"],
    "disneyplus.com": ["video", "streaming"],
    "twitch.tv": ["video", "streaming"],
    # Code / dev
    "github.com": ["github", "code"],
    "gist.github.com": ["github", "code"],
    "gitlab.com": ["gitlab", "code"],
    "bitbucket.org": ["bitbucket", "code"],
    "stackoverflow.com": ["stackoverflow", "code"],
    "stackexchange.com": ["stackoverflow", "code"],
    "pypi.org": ["code", "docs"],
    "npmjs.com": ["code", "docs"],
    "crates.io": ["code", "docs"],
    # Docs
    "developer.mozilla.org": ["docs"],
    "docs.python.org": ["docs"],
    "react.dev": ["docs"],
    "nodejs.org": ["docs"],
    "kubernetes.io": ["docs"],
    "postgresql.org": ["docs"],
    # Search engines
    "google.com": ["search"],
    "bing.com": ["search"],
    "duckduckgo.com": ["search"],
    # AI
    "chatgpt.com": ["ai"],
    "openai.com": ["ai"],
    "claude.ai": ["ai"],
    "anthropic.com": ["ai"],
    "gemini.google.com": ["ai"],
    "perplexity.ai": ["ai"],
    "huggingface.co": ["ai"],
    # Social
    "twitter.com": ["twitter", "social"],
    "x.com": ["twitter", "social"],
    "linkedin.com": ["linkedin", "social"],
    "reddit.com": ["reddit", "social"],
    "facebook.com": ["facebook", "social"],
    "instagram.com": ["instagram", "social"],
    "threads.net": ["social"],
    "mastodon.social": ["social"],
    "bsky.app": ["social"],
    # Reading / blogs
    "medium.com": ["blog", "reading"],
    "dev.to": ["blog", "reading"],
    "substack.com": ["blog", "reading"],
    "hashnode.com": ["blog", "reading"],
    "news.ycombinator.com": ["hn", "news"],
    # News (intl)
    "bbc.com": ["news"],
    "bbc.co.uk": ["news"],
    "cnn.com": ["news"],
    "nytimes.com": ["news"],
    "reuters.com": ["news"],
    "theguardian.com": ["news"],
    "wsj.com": ["news"],
    # News (TR)
    "hurriyet.com.tr": ["news"],
    "milliyet.com.tr": ["news"],
    "sabah.com.tr": ["news"],
    "sozcu.com.tr": ["news"],
    "haberturk.com": ["news"],
    "ntv.com.tr": ["news"],
    "cnnturk.com": ["news"],
    "t24.com.tr": ["news"],
    # Shopping
    "amazon.com": ["shopping"],
    "amazon.com.tr": ["shopping"],
    "ebay.com": ["shopping"],
    "trendyol.com": ["shopping"],
    "hepsiburada.com": ["shopping"],
    "n11.com": ["shopping"],
    "gittigidiyor.com": ["shopping"],
    # Email / productivity
    "mail.google.com": ["email"],
    "outlook.live.com": ["email"],
    "outlook.office.com": ["email"],
    "calendar.google.com": ["calendar"],
    "drive.google.com": ["docs", "storage"],
    "docs.google.com": ["docs"],
    "notion.so": ["docs", "notes"],
    "slack.com": ["chat", "work"],
    "teams.microsoft.com": ["chat", "work"],
    "discord.com": ["chat"],
    # Maps / travel
    "maps.google.com": ["maps"],
    "booking.com": ["travel"],
    "airbnb.com": ["travel"],
    # Banking (TR)
    "garanti.com.tr": ["banking", "finance"],
    "garantibbva.com.tr": ["banking", "finance"],
    "akbank.com": ["banking", "finance"],
    "isbank.com.tr": ["banking", "finance"],
    "yapikredi.com.tr": ["banking", "finance"],
    "ziraatbank.com.tr": ["banking", "finance"],
    "denizbank.com": ["banking", "finance"],
    "finansbank.com.tr": ["banking", "finance"],
}
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _host(url: str) -> str:
    """Return the lower-cased hostname of *url*, or "" if there is none.

    Leading and trailing dots are removed so that a fully-qualified host
    written with a root dot (``youtube.com.``) normalises to the same
    string as ``youtube.com``.

    Args:
        url: The URL to inspect.

    Returns:
        The normalised hostname, or an empty string when the URL cannot
        be parsed or carries no host component.
    """
    try:
        hostname = urlparse(url).hostname
    except (ValueError, TypeError, AttributeError):
        # urlparse raises ValueError on malformed netlocs (e.g. an
        # unbalanced IPv6 bracket); the other two cover non-string input.
        # Tagging is best-effort, so treat all of these as "no host".
        return ""
    if not hostname:
        return ""
    return hostname.lower().strip(".")
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _content_tags(url: str) -> List[str]:
    """Return the topical tags for *url*, most specific host rule first.

    The host is tried against ``_DOMAIN_TAGS`` as a series of dot-suffixes,
    longest first (``m.youtube.com``, then ``youtube.com``); the bare
    top-level label is never tried. The first suffix that yields any tags
    wins. URLs tagged ``youtube`` additionally get one path-derived tag
    (``shorts``, ``watch``, ``channel`` or ``playlist``) when the path
    starts with a recognised prefix.

    Returns an empty list when the URL has no usable host or no rule
    matches.
    """
    hostname = _host(url)
    if not hostname:
        return []

    labels = hostname.split(".")
    found: List[str] = []
    for start in range(len(labels) - 1):
        suffix = ".".join(labels[start:])
        # dict.fromkeys drops repeated tags while keeping first-seen order.
        found = list(dict.fromkeys(_DOMAIN_TAGS.get(suffix, ())))
        if found:
            break

    # Path nuances only apply to YouTube hosts.
    if "youtube" not in found:
        return found

    try:
        path = urlparse(url).path or ""
    except Exception:
        path = ""

    # Ordered (prefixes, tag) rules; only the first matching rule applies.
    path_rules = (
        (("/shorts/",), "shorts"),
        (("/watch",), "watch"),
        (("/@", "/channel/", "/c/", "/user/"), "channel"),
        (("/playlist",), "playlist"),
    )
    for prefixes, label in path_rules:
        if path.startswith(prefixes):
            found.append(label)
            break
    return found
|
|
195
|
+
|
|
196
|
+
|
|
31
197
|
_CHROME_EPOCH = datetime(1601, 1, 1, tzinfo=timezone.utc)
|
|
32
198
|
_SKIP_PREFIXES = (
|
|
33
199
|
"chrome://", "chrome-extension://", "chrome-search://",
|
|
@@ -139,7 +305,8 @@ def _build_record(profile: Profile, url: str, title: str,
|
|
|
139
305
|
"visit_count": int(visit_count or 0),
|
|
140
306
|
"last_visited_at": last_iso,
|
|
141
307
|
"url_hash": url_hash,
|
|
142
|
-
"tags": ["browser", "history", profile.browser
|
|
308
|
+
"tags": ["browser", "history", profile.browser,
|
|
309
|
+
*_content_tags(url)],
|
|
143
310
|
}
|
|
144
311
|
|
|
145
312
|
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "evolutiondb-browser-sync"
|
|
7
|
-
version = "0.1.0"
|
|
7
|
+
version = "0.1.1"
|
|
8
8
|
description = "Sync browser history (Chrome, Edge, Firefox) into EvolutionDB long-term memory."
|
|
9
9
|
requires-python = ">=3.9"
|
|
10
10
|
license = {text = "MIT"}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|