evolutiondb-browser-sync 0.1.0__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (15)
  1. {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/PKG-INFO +1 -1
  2. {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/browser_sync/extract.py +169 -2
  3. {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/evolutiondb_browser_sync.egg-info/PKG-INFO +1 -1
  4. {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/pyproject.toml +1 -1
  5. {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/browser_sync/__init__.py +0 -0
  6. {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/browser_sync/__main__.py +0 -0
  7. {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/browser_sync/scanner.py +0 -0
  8. {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/browser_sync/state.py +0 -0
  9. {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/browser_sync/sync.py +0 -0
  10. {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/evolutiondb_browser_sync.egg-info/SOURCES.txt +0 -0
  11. {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/evolutiondb_browser_sync.egg-info/dependency_links.txt +0 -0
  12. {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/evolutiondb_browser_sync.egg-info/entry_points.txt +0 -0
  13. {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/evolutiondb_browser_sync.egg-info/requires.txt +0 -0
  14. {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/evolutiondb_browser_sync.egg-info/top_level.txt +0 -0
  15. {evolutiondb_browser_sync-0.1.0 → evolutiondb_browser_sync-0.1.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: evolutiondb-browser-sync
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: Sync browser history (Chrome, Edge, Firefox) into EvolutionDB long-term memory.
5
5
  Author-email: alptekin topal <topal.alptekin@gmail.com>
6
6
  License: MIT
@@ -23,11 +23,177 @@ import sys
23
23
  import tempfile
24
24
  from datetime import datetime, timedelta, timezone
25
25
  from pathlib import Path
26
- from typing import Dict, Iterator, Optional
26
+ from typing import Dict, Iterator, List, Optional
27
+ from urllib.parse import urlparse
27
28
 
28
29
  from .scanner import CHROMIUM, FIREFOX, Profile
29
30
 
30
31
 
32
+ # ---------------------------------------------------------------- #
33
+ # Content tagging #
34
+ # ---------------------------------------------------------------- #
35
+ # A visit's `tags` field starts out generic ("browser", "history",
36
+ # "chrome"). _content_tags() inspects the URL host (and a little of
37
+ # the path, for YouTube) and adds topical labels so the catalog can
38
+ # answer questions like "what did I read about coding this week" or
39
+ # "which videos did I watch" without scanning every URL string.
40
+ #
41
+ # Lookups match the host against progressively shorter dot-separated
42
+ # suffixes; this is cheaper than consulting a public-suffix list and
43
+ # still covers subdomain variants (e.g. m.youtube.com -> youtube.com)
44
+ # without matching lookalikes such as notyoutube.com.
45
+
46
+ _DOMAIN_TAGS: Dict[str, List[str]] = {
47
+ # Video
48
+ "youtube.com": ["youtube", "video"],
49
+ "youtu.be": ["youtube", "video"],
50
+ "vimeo.com": ["video"],
51
+ "netflix.com": ["video", "streaming"],
52
+ "disneyplus.com": ["video", "streaming"],
53
+ "twitch.tv": ["video", "streaming"],
54
+ # Code / dev
55
+ "github.com": ["github", "code"],
56
+ "gist.github.com": ["github", "code"],
57
+ "gitlab.com": ["gitlab", "code"],
58
+ "bitbucket.org": ["bitbucket", "code"],
59
+ "stackoverflow.com": ["stackoverflow", "code"],
60
+ "stackexchange.com": ["stackoverflow", "code"],
61
+ "pypi.org": ["code", "docs"],
62
+ "npmjs.com": ["code", "docs"],
63
+ "crates.io": ["code", "docs"],
64
+ # Docs
65
+ "developer.mozilla.org": ["docs"],
66
+ "docs.python.org": ["docs"],
67
+ "react.dev": ["docs"],
68
+ "nodejs.org": ["docs"],
69
+ "kubernetes.io": ["docs"],
70
+ "postgresql.org": ["docs"],
71
+ # Search engines
72
+ "google.com": ["search"],
73
+ "bing.com": ["search"],
74
+ "duckduckgo.com": ["search"],
75
+ # AI
76
+ "chatgpt.com": ["ai"],
77
+ "openai.com": ["ai"],
78
+ "claude.ai": ["ai"],
79
+ "anthropic.com": ["ai"],
80
+ "gemini.google.com": ["ai"],
81
+ "perplexity.ai": ["ai"],
82
+ "huggingface.co": ["ai"],
83
+ # Social
84
+ "twitter.com": ["twitter", "social"],
85
+ "x.com": ["twitter", "social"],
86
+ "linkedin.com": ["linkedin", "social"],
87
+ "reddit.com": ["reddit", "social"],
88
+ "facebook.com": ["facebook", "social"],
89
+ "instagram.com": ["instagram", "social"],
90
+ "threads.net": ["social"],
91
+ "mastodon.social": ["social"],
92
+ "bsky.app": ["social"],
93
+ # Reading / blogs
94
+ "medium.com": ["blog", "reading"],
95
+ "dev.to": ["blog", "reading"],
96
+ "substack.com": ["blog", "reading"],
97
+ "hashnode.com": ["blog", "reading"],
98
+ "news.ycombinator.com": ["hn", "news"],
99
+ # News (intl)
100
+ "bbc.com": ["news"],
101
+ "bbc.co.uk": ["news"],
102
+ "cnn.com": ["news"],
103
+ "nytimes.com": ["news"],
104
+ "reuters.com": ["news"],
105
+ "theguardian.com": ["news"],
106
+ "wsj.com": ["news"],
107
+ # News (TR)
108
+ "hurriyet.com.tr": ["news"],
109
+ "milliyet.com.tr": ["news"],
110
+ "sabah.com.tr": ["news"],
111
+ "sozcu.com.tr": ["news"],
112
+ "haberturk.com": ["news"],
113
+ "ntv.com.tr": ["news"],
114
+ "cnnturk.com": ["news"],
115
+ "t24.com.tr": ["news"],
116
+ # Shopping
117
+ "amazon.com": ["shopping"],
118
+ "amazon.com.tr": ["shopping"],
119
+ "ebay.com": ["shopping"],
120
+ "trendyol.com": ["shopping"],
121
+ "hepsiburada.com": ["shopping"],
122
+ "n11.com": ["shopping"],
123
+ "gittigidiyor.com": ["shopping"],
124
+ # Email / productivity
125
+ "mail.google.com": ["email"],
126
+ "outlook.live.com": ["email"],
127
+ "outlook.office.com": ["email"],
128
+ "calendar.google.com": ["calendar"],
129
+ "drive.google.com": ["docs", "storage"],
130
+ "docs.google.com": ["docs"],
131
+ "notion.so": ["docs", "notes"],
132
+ "slack.com": ["chat", "work"],
133
+ "teams.microsoft.com": ["chat", "work"],
134
+ "discord.com": ["chat"],
135
+ # Maps / travel
136
+ "maps.google.com": ["maps"],
137
+ "google.com/maps": ["maps"],
138
+ "booking.com": ["travel"],
139
+ "airbnb.com": ["travel"],
140
+ # Banking (TR)
141
+ "garanti.com.tr": ["banking", "finance"],
142
+ "garantibbva.com.tr": ["banking", "finance"],
143
+ "akbank.com": ["banking", "finance"],
144
+ "isbank.com.tr": ["banking", "finance"],
145
+ "yapikredi.com.tr": ["banking", "finance"],
146
+ "ziraatbank.com.tr": ["banking", "finance"],
147
+ "denizbank.com": ["banking", "finance"],
148
+ "finansbank.com.tr": ["banking", "finance"],
149
+ }
150
+
151
+
152
+ def _host(url: str) -> str:
153
+ try:
154
+ h = urlparse(url).hostname or ""
155
+ except Exception:
156
+ return ""
157
+ return h.lower().lstrip(".")
158
+
159
+
160
+ def _content_tags(url: str) -> List[str]:
161
+ """Derive topical tags from the URL. The host is matched as a
162
+ progressively shortened suffix so `m.youtube.com` and
163
+ `music.youtube.com` both hit the `youtube.com` rule.
164
+ """
165
+ host = _host(url)
166
+ if not host:
167
+ return []
168
+ parts = host.split(".")
169
+ tags: List[str] = []
170
+ seen = set()
171
+ for i in range(len(parts) - 1):
172
+ candidate = ".".join(parts[i:])
173
+ for t in _DOMAIN_TAGS.get(candidate, ()):
174
+ if t not in seen:
175
+ seen.add(t)
176
+ tags.append(t)
177
+ if tags:
178
+ break
179
+ # YouTube path nuances — only meaningful for youtube hosts.
180
+ if "youtube" in tags:
181
+ try:
182
+ path = urlparse(url).path or ""
183
+ except Exception:
184
+ path = ""
185
+ if path.startswith("/shorts/"):
186
+ tags.append("shorts")
187
+ elif path.startswith("/watch"):
188
+ tags.append("watch")
189
+ elif path.startswith("/@") or path.startswith("/channel/") \
190
+ or path.startswith("/c/") or path.startswith("/user/"):
191
+ tags.append("channel")
192
+ elif path.startswith("/playlist"):
193
+ tags.append("playlist")
194
+ return tags
195
+
196
+
31
197
  _CHROME_EPOCH = datetime(1601, 1, 1, tzinfo=timezone.utc)
32
198
  _SKIP_PREFIXES = (
33
199
  "chrome://", "chrome-extension://", "chrome-search://",
@@ -139,7 +305,8 @@ def _build_record(profile: Profile, url: str, title: str,
139
305
  "visit_count": int(visit_count or 0),
140
306
  "last_visited_at": last_iso,
141
307
  "url_hash": url_hash,
142
- "tags": ["browser", "history", profile.browser],
308
+ "tags": ["browser", "history", profile.browser,
309
+ *_content_tags(url)],
143
310
  }
144
311
 
145
312
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: evolutiondb-browser-sync
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: Sync browser history (Chrome, Edge, Firefox) into EvolutionDB long-term memory.
5
5
  Author-email: alptekin topal <topal.alptekin@gmail.com>
6
6
  License: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "evolutiondb-browser-sync"
7
- version = "0.1.0"
7
+ version = "0.1.1"
8
8
  description = "Sync browser history (Chrome, Edge, Firefox) into EvolutionDB long-term memory."
9
9
  requires-python = ">=3.9"
10
10
  license = {text = "MIT"}