weft-graph 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
weft/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """Weft - A local-first knowledge graph for your browsing."""
2
+
3
+ __version__ = "1.0.0"
weft/__main__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Entry point for python -m weft."""
2
+
3
+ from weft.cli import main
4
+
5
+ if __name__ == "__main__":
6
+ main()
weft/cli.py ADDED
@@ -0,0 +1,372 @@
1
+ """CLI dispatcher for weft."""
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+ import sys
7
+
8
+ from weft import __version__
9
+
10
+
11
def cmd_weave(args):
    """Build a knowledge graph from browser tabs and write it to ``args.out``.

    The command exports windows from the selected browser(s), flattens them
    into tabs, builds the graph with options copied from ``args``, writes the
    graph as JSON and prints a short summary.

    Args:
        args: Parsed namespace of the ``weave`` sub-command (see ``main``).

    Exits with status 1 when no browser produced any window.
    """
    from weft.export.browser import export_chrome, export_firefox
    from weft.export.graph import GraphOptions, build_tab_graph, load_tabs_from_windows

    # Export from browsers. A failure in one browser is reported as a warning
    # and does not abort the run, so the other browser can still contribute.
    windows = []
    had_error = False

    if args.browser in ("chrome", "all"):
        try:
            chrome_windows = export_chrome()
            windows.extend(chrome_windows)
            if args.verbose:
                tab_count = sum(len(w.get("tabs", [])) for w in chrome_windows)
                print(f"[INFO] Exported {tab_count} tabs from Chrome", file=sys.stderr)
        except Exception as e:
            had_error = True
            print(f"[WARN] Chrome export failed: {e}", file=sys.stderr)

    if args.browser in ("firefox", "all"):
        try:
            firefox_windows = export_firefox(args.firefox_profile, args.verbose)
            windows.extend(firefox_windows)
            if args.verbose:
                tab_count = sum(len(w.get("tabs", [])) for w in firefox_windows)
                print(f"[INFO] Exported {tab_count} tabs from Firefox", file=sys.stderr)
        except Exception as e:
            had_error = True
            print(f"[WARN] Firefox export failed: {e}", file=sys.stderr)

    if not windows:
        print("[ERROR] No tabs exported from any browser.", file=sys.stderr)
        sys.exit(1)

    # Flatten to tabs
    tabs = load_tabs_from_windows(windows)

    if args.verbose:
        print(f"[INFO] Processing {len(tabs)} tabs...", file=sys.stderr)

    # Build graph options: every field is taken verbatim from the CLI flags.
    options = GraphOptions(
        out=args.out,
        cache=args.cache,
        refresh=args.refresh,
        no_crawl=args.no_crawl,
        max_chars=args.max_chars,
        embed_max_chars=args.embed_max_chars,
        user_agent=args.user_agent,
        timeout=args.timeout,
        js=args.js,
        summarize=args.summarize,
        llm_backend=args.llm_backend,
        ollama_model=args.ollama_model,
        ollama_url=args.ollama_url,
        ollama_timeout=args.ollama_timeout,
        embed_model=args.embed_model,
        embed_url=args.embed_url,
        embed_timeout=args.embed_timeout,
        no_embeddings=args.no_embeddings,
        store_embeddings=args.store_embeddings,
        gguf=args.gguf,
        llama_n_ctx=args.llama_n_ctx,
        llama_n_threads=args.llama_n_threads,
        llama_n_gpu_layers=args.llama_n_gpu_layers,
        edge_threshold=args.edge_threshold,
        group_threshold=args.group_threshold,
        domain_bonus=args.domain_bonus,
        no_domain_group=args.no_domain_group,
        domain_group_min=args.domain_group_min,
        knn_k=args.knn_k,
        no_mutual_knn=args.no_mutual_knn,
        dedupe_hamming=args.dedupe_hamming,
        keyword_count=args.keyword_count,
        verbose=args.verbose,
    )

    # Create the output directory up front: discovering that it is missing
    # only after the graph has been built would throw the whole run away.
    out_dir = os.path.dirname(args.out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Build graph
    graph = build_tab_graph(tabs, options)

    # Write output
    with open(args.out, "w", encoding="utf-8") as f:
        json.dump(graph, f, ensure_ascii=False, indent=2)

    stats = graph["stats"]
    mode = "with LLM summaries" if args.summarize else "lightweight (keywords only)"
    print(
        f"[OK] Wrote {args.out} {mode}\n"
        f"     {stats['tab_count']} tabs, {stats['group_count']} groups, "
        f"{stats['edge_count']} edges, {stats['duplicates']} duplicates"
    )
    if stats["errors"]:
        print(f"     {stats['errors']} errors during processing", file=sys.stderr)
    if had_error:
        print("[NOTE] Some browsers failed to export; see warnings above.", file=sys.stderr)
107
+
108
+
109
def cmd_explore(args):
    """Open the terminal UI on the graph file named by ``args.graph_json``.

    Prints an error to stderr and exits with status 1 when the file does not
    exist.
    """
    from weft.tui.app import TabGraphApp, load_graph

    graph_path = args.graph_json
    if os.path.exists(graph_path):
        TabGraphApp(load_graph(graph_path)).run()
        return

    # Missing graph: explain how to produce one, then fail.
    for message in (
        f"[ERROR] Graph file not found: {graph_path}",
        "Run 'weft weave' first to generate the graph.",
    ):
        print(message, file=sys.stderr)
    sys.exit(1)
121
+
122
+
123
def _build_parser():
    """Create the ``weft`` argument parser with its ``weave`` and ``explore`` sub-commands."""
    parser = argparse.ArgumentParser(
        prog="weft",
        description="A local-first knowledge graph for your browsing",
    )
    parser.add_argument("--version", action="version", version=f"weft {__version__}")

    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    # Weave command
    weave_parser = subparsers.add_parser(
        "weave",
        help="Build knowledge graph from browser tabs",
        description="Extract tabs from Chrome/Firefox and weave them into a knowledge graph with clustering.",
    )

    # Browser options
    weave_parser.add_argument(
        "--browser",
        choices=["chrome", "firefox", "all"],
        default="all",
        help="Which browser(s) to export from (default: all)",
    )
    weave_parser.add_argument(
        "--firefox-profile",
        help="Override: path to a specific Firefox profile directory",
    )

    # Output options
    weave_parser.add_argument(
        "--out",
        default="weft_graph.json",
        help="Output graph JSON path (default: weft_graph.json)",
    )

    # LLM options
    weave_parser.add_argument(
        "--summarize",
        action="store_true",
        help="Enable LLM summarization (requires Ollama or GGUF model)",
    )
    weave_parser.add_argument(
        "--llm-backend",
        choices=["ollama", "gguf"],
        default="ollama",
        help="LLM backend (default: ollama)",
    )
    weave_parser.add_argument(
        "--ollama-model",
        default="llama3.1:8b",
        help="Ollama model name (default: llama3.1:8b)",
    )
    weave_parser.add_argument(
        "--ollama-url",
        default="http://localhost:11434",
        help="Ollama base URL",
    )
    weave_parser.add_argument(
        "--ollama-timeout",
        type=int,
        default=120,
        help="Ollama request timeout in seconds",
    )
    weave_parser.add_argument(
        "--gguf",
        help="Path to a GGUF model file (for --llm-backend=gguf)",
    )
    weave_parser.add_argument(
        "--llama-n-ctx",
        type=int,
        default=4096,
        help="Context size for llama-cpp",
    )
    weave_parser.add_argument(
        "--llama-n-threads",
        type=int,
        default=max(1, os.cpu_count() or 1),
        help="Threads for llama-cpp",
    )
    weave_parser.add_argument(
        "--llama-n-gpu-layers",
        type=int,
        default=0,
        help="GPU layers for llama-cpp",
    )

    # Embedding options
    weave_parser.add_argument(
        "--embed-model",
        default="nomic-embed-text",
        help="Ollama embedding model name",
    )
    weave_parser.add_argument(
        "--embed-url",
        help="Ollama base URL for embeddings (defaults to --ollama-url)",
    )
    weave_parser.add_argument(
        "--embed-timeout",
        type=int,
        default=60,
        help="Embedding request timeout",
    )
    weave_parser.add_argument(
        "--no-embeddings",
        action="store_true",
        help="Disable embeddings (use keyword similarity only)",
    )
    weave_parser.add_argument(
        "--store-embeddings",
        action="store_true",
        help="Include embeddings in output JSON",
    )

    # Crawling options
    weave_parser.add_argument(
        "--no-crawl",
        action="store_true",
        help="Skip URL crawling (use metadata only)",
    )
    weave_parser.add_argument(
        "--max-chars",
        type=int,
        default=6000,
        help="Max characters sent to LLM",
    )
    weave_parser.add_argument(
        "--embed-max-chars",
        type=int,
        default=2000,
        help="Max characters sent to embed model",
    )
    weave_parser.add_argument(
        "--user-agent",
        default="Mozilla/5.0",
        help="User-Agent for HTTP requests",
    )
    weave_parser.add_argument(
        "--timeout",
        type=int,
        default=20,
        help="HTTP timeout in seconds",
    )
    weave_parser.add_argument(
        "--js",
        action="store_true",
        help="Use Playwright to render JS-heavy pages",
    )

    # Caching options
    weave_parser.add_argument(
        "--cache",
        default=os.path.join("data", "weft_cache.json"),
        help="Cache file path",
    )
    weave_parser.add_argument(
        "--refresh",
        action="store_true",
        help="Ignore cache and re-fetch everything",
    )

    # Clustering options
    weave_parser.add_argument(
        "--edge-threshold",
        type=float,
        default=0.2,
        help="Edge weight threshold",
    )
    weave_parser.add_argument(
        "--group-threshold",
        type=float,
        default=0.25,
        help="Grouping similarity threshold",
    )
    weave_parser.add_argument(
        "--domain-bonus",
        type=float,
        default=0.25,
        help="Similarity bonus for same-domain tabs",
    )
    weave_parser.add_argument(
        "--no-domain-group",
        action="store_true",
        help="Disable auto-grouping by domain",
    )
    weave_parser.add_argument(
        "--domain-group-min",
        type=int,
        default=2,
        help="Min tabs per domain to auto-group",
    )
    weave_parser.add_argument(
        "--knn-k",
        type=int,
        default=6,
        help="Mutual KNN size for grouping",
    )
    weave_parser.add_argument(
        "--no-mutual-knn",
        action="store_true",
        help="Disable mutual-KNN grouping filter",
    )
    weave_parser.add_argument(
        "--dedupe-hamming",
        type=int,
        default=3,
        help="Simhash Hamming distance for dedupe",
    )
    weave_parser.add_argument(
        "--keyword-count",
        type=int,
        default=8,
        help="Number of keywords per tab",
    )

    # General options
    weave_parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Verbose logging",
    )

    # Explore command
    explore_parser = subparsers.add_parser(
        "explore",
        help="Launch terminal UI to explore the graph",
        description="Interactive terminal UI for browsing and searching grouped tabs.",
    )
    explore_parser.add_argument(
        "graph_json",
        nargs="?",
        default="weft_graph.json",
        help="Path to graph JSON file (default: weft_graph.json)",
    )

    return parser


def main(argv=None):
    """Main CLI entry point.

    Args:
        argv: Optional list of command-line arguments, without the program
            name. When ``None`` (the default) argparse reads ``sys.argv[1:]``,
            so existing zero-argument callers behave as before.

    Exits with status 0 after printing help when no sub-command is given.
    """
    parser = _build_parser()

    # Parse and dispatch
    args = parser.parse_args(argv)

    if args.command is None:
        parser.print_help()
        sys.exit(0)

    if args.command == "weave":
        cmd_weave(args)
    elif args.command == "explore":
        cmd_explore(args)
369
+
370
+
371
+ if __name__ == "__main__":
372
+ main()
@@ -0,0 +1 @@
1
+ """Export module for tab extraction and graph building."""
weft/export/browser.py ADDED
@@ -0,0 +1,156 @@
1
+ """Browser tab export functions for Chrome and Firefox on macOS."""
2
+
3
+ import json
4
+ import subprocess
5
+ import sys
6
+ import time
7
+ from pathlib import Path
8
+ from typing import Dict, List, Optional, Tuple
9
+
10
# Directory that holds the Firefox profile folders. The path is the macOS
# location (~/Library/Application Support/Firefox/Profiles), matching this
# module's macOS-only scope.
FF_BASE = Path.home() / "Library" / "Application Support" / "Firefox" / "Profiles"
11
+
12
+
13
def run_jxa(script: str) -> str:
    """Run a JavaScript for Automation (JXA) script via ``osascript``.

    Args:
        script: JXA source handed to ``osascript -l JavaScript -e``.

    Returns:
        The script's standard output with surrounding whitespace stripped.

    Raises:
        RuntimeError: If the ``osascript`` executable cannot be found, or if
            it exits with a non-zero status (the message is its stderr when
            that is non-empty).
    """
    try:
        p = subprocess.run(
            ["osascript", "-l", "JavaScript", "-e", script],
            check=True,
            capture_output=True,
            text=True,
        )
    except FileNotFoundError as e:
        # No osascript binary on PATH: report it the same way as any other
        # JXA failure instead of leaking a bare FileNotFoundError.
        raise RuntimeError(
            "osascript not found; JavaScript for Automation requires macOS."
        ) from e
    except subprocess.CalledProcessError as e:
        raise RuntimeError(e.stderr.strip() or str(e)) from e
    return p.stdout.strip()
25
+
26
+
27
def export_chrome() -> List[Dict]:
    """Collect the open tabs of every Google Chrome window through a JXA script.

    Returns:
        One dict per window that has at least one tab, shaped as
        ``{"browser": "chrome", "windowId": ..., "tabs": [{"title", "url"}]}``.
        An empty list is returned when the script prints nothing; errors
        inside the script itself are swallowed there and yield ``[]`` too.
    """
    script = """
    function run() {
        var app = Application.currentApplication();
        app.includeStandardAdditions = true;
        var output = [];
        try {
            var chrome = Application('Google Chrome');
            chrome.windows().forEach(function(w){
                var tabs = w.tabs().map(function(t){ return {title: t.title(), url: t.url()}; });
                if (tabs.length > 0) {
                    output.push({browser: 'chrome', windowId: w.id(), tabs: tabs});
                }
            });
        } catch (e) {
            // Chrome not running or AppleScript disabled
        }
        return JSON.stringify(output);
    }
    """
    raw = run_jxa(script)
    if not raw:
        return []
    return json.loads(raw)
50
+
51
+
52
def find_all_firefox_profiles() -> List[Path]:
    """List every sub-directory of ``FF_BASE``; empty when ``FF_BASE`` is absent."""
    if FF_BASE.exists():
        return list(filter(Path.is_dir, FF_BASE.iterdir()))
    return []
57
+
58
+
59
def best_sessionstore_for_profile(prof: Path) -> Optional[Tuple[Path, float]]:
    """Return (path, mtime) of the freshest sessionstore candidate or None.

    Candidates, in lookup order, are ``recovery.jsonlz4`` and
    ``previous.jsonlz4`` under ``sessionstore-backups``, any
    ``upgrade.jsonlz4*`` file in that directory, and ``sessionstore.jsonlz4``
    in the profile root. Only regular files are considered.

    Args:
        prof: Firefox profile directory.

    Returns:
        The candidate with the greatest modification time together with that
        time, or ``None`` when no candidate file exists. On equal mtimes the
        earlier candidate in lookup order wins.
    """
    backups = prof / "sessionstore-backups"
    candidates = [backups / "recovery.jsonlz4", backups / "previous.jsonlz4"]
    candidates.extend(backups.glob("upgrade.jsonlz4*"))
    candidates.append(prof / "sessionstore.jsonlz4")

    dated = []
    for path in candidates:
        try:
            if path.is_file():
                dated.append((path, path.stat().st_mtime))
        except OSError:
            # The file vanished (or became unreadable) between the check and
            # the stat call; skip it rather than abort the whole lookup.
            continue

    if not dated:
        return None
    # max() returns the first maximal item, which matches a stable
    # descending sort followed by taking element 0.
    return max(dated, key=lambda item: item[1])
78
+
79
+
80
def choose_firefox_profile_with_fresh_session() -> Optional[Tuple[Path, Path]]:
    """Pick the profile whose best sessionstore file was modified most recently.

    Returns:
        ``(profile_dir, sessionstore_path)`` for the winning profile, or
        ``None`` when no profile has a sessionstore file. On equal mtimes the
        profile encountered first is kept.
    """
    freshest = None  # (profile_dir, sessionstore_path, mtime) of the current leader
    for profile in find_all_firefox_profiles():
        found = best_sessionstore_for_profile(profile)
        if not found:
            continue
        session_path, mtime = found
        if freshest is None or mtime > freshest[2]:
            freshest = (profile, session_path, mtime)

    if freshest is None:
        return None
    return freshest[0], freshest[1]
92
+
93
+
94
def _firefox_window_tabs(window: Dict) -> List[Dict]:
    """Return ``{"title", "url"}`` dicts for the current history entry of each tab in *window*."""
    tabs = []
    for tab in window.get("tabs", []):
        entries = tab.get("entries", [])
        if not entries:
            continue
        # "index" is 1-based. Clamp it into range so a tab with a stale or
        # out-of-range index is still exported instead of silently dropped.
        # NOTE(review): believed to match how Firefox restores such tabs;
        # confirm against the SessionStore implementation.
        idx = min(max(tab.get("index", 1), 1), len(entries))
        entry = entries[idx - 1]
        url = entry.get("url", "")
        if url:
            tabs.append({"title": entry.get("title", ""), "url": url})
    return tabs


def export_firefox(
    profile_override: Optional[str] = None, verbose: bool = False
) -> List[Dict]:
    """Export all open tabs from a Firefox sessionstore backup.

    Args:
        profile_override: Path to a specific profile directory. When omitted,
            the profile with the most recently modified sessionstore file is
            used.
        verbose: Print the chosen profile and sessionstore file to stderr.

    Returns:
        One dict per window that has at least one tab with a URL, shaped as
        ``{"browser": "firefox", "windowId": None, "tabs": [{"title", "url"}]}``.

    Raises:
        RuntimeError: If the ``lz4`` package is missing or the file does not
            start with the ``mozLz40`` header.
        FileNotFoundError: If the given profile does not exist or no
            sessionstore file can be found.
    """
    try:
        from lz4.block import decompress as lz4_decompress
    except ImportError as e:
        raise RuntimeError(
            "Python package 'lz4' is required for Firefox export. Install with: pip install lz4"
        ) from e

    session_path: Optional[Path] = None
    selected_profile: Optional[Path] = None

    if profile_override:
        p = Path(profile_override).expanduser()
        if not p.exists():
            raise FileNotFoundError(f"Provided --firefox-profile does not exist: {p}")
        best = best_sessionstore_for_profile(p)
        if not best:
            raise FileNotFoundError(
                f"No sessionstore files found in provided profile: {p}"
            )
        selected_profile, session_path = p, best[0]
    else:
        chosen = choose_firefox_profile_with_fresh_session()
        if not chosen:
            raise FileNotFoundError(
                "No Firefox sessionstore *.jsonlz4 files found in ANY profile. "
                "Open Firefox (non-private window), ensure tabs are open, wait 10s, and try again."
            )
        selected_profile, session_path = chosen

    if verbose:
        print(f"[INFO] Using Firefox profile: {selected_profile}", file=sys.stderr)
        print(
            f"[INFO] Sessionstore file: {session_path} (mtime={time.ctime(session_path.stat().st_mtime)})",
            file=sys.stderr,
        )

    raw = session_path.read_bytes()
    # The file starts with an 8-byte magic header; the remainder is handed
    # to the lz4 block decompressor.
    if raw[:8] != b"mozLz40\x00":
        raise RuntimeError("Unexpected Firefox sessionstore header; not mozlz4.")

    decomp = lz4_decompress(raw[8:])
    session = json.loads(decomp.decode("utf-8"))

    payload = []
    for w in session.get("windows", []):
        tabs = _firefox_window_tabs(w)
        if tabs:
            payload.append({"browser": "firefox", "windowId": None, "tabs": tabs})
    return payload
weft/export/cache.py ADDED
@@ -0,0 +1,38 @@
1
+ """Cache management for tab graph building."""
2
+
3
+ import json
4
+ import os
5
+ from typing import Dict, Optional, Tuple
6
+
7
+ from weft.utils.url import canonicalize_url
8
+
9
+
10
def load_cache(path: Optional[str]) -> Dict[str, Dict]:
    """Load the cache mapping from a JSON file.

    Args:
        path: Cache file path; ``None`` or ``""`` disables the cache.

    Returns:
        The decoded mapping. An empty dict is returned when *path* is falsy,
        when the file does not exist, or when the file cannot be decoded as a
        JSON object. In the last case a warning is emitted, because the cache
        is disposable and a damaged file should not block every later run.
    """
    if not path:
        return {}

    import warnings

    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        return {}
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        warnings.warn(f"Ignoring unreadable cache file {path}: {e}")
        return {}

    if not isinstance(data, dict):
        warnings.warn(f"Ignoring cache file {path}: top-level JSON value is not an object")
        return {}
    return data
16
+
17
+
18
def get_cache_entry(
    cache: Dict[str, Dict], url: str
) -> Tuple[Optional[Dict], Optional[str]]:
    """Look up *url* in *cache*, first verbatim and then in canonical form.

    Returns:
        ``(entry, key)`` where *key* is the cache key that matched, or
        ``(None, None)`` when the cache is empty or neither form is present.
    """
    if cache:
        canonical = canonicalize_url(url)
        if url in cache:
            return cache[url], url
        if canonical in cache:
            return cache[canonical], canonical
    return None, None
28
+
29
+
30
def save_cache(path: Optional[str], cache: Dict[str, Dict]) -> None:
    """Save the cache mapping to a JSON file.

    The data is first written to ``<path>.tmp`` and then moved over *path*,
    so an interrupted or failing dump never leaves a half-written cache file
    behind.

    Args:
        path: Destination file; ``None`` or ``""`` makes this a no-op. The
            parent directory is created when missing.
        cache: Mapping to serialise.

    Raises:
        TypeError: If *cache* holds values that JSON cannot encode (the
            existing cache file is left untouched in that case).
    """
    if not path:
        return
    dir_name = os.path.dirname(path)
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)

    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(cache, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file is gone; on failure this
        # removes the partial file.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
weft/export/crawl.py ADDED
@@ -0,0 +1,55 @@
1
+ """HTML fetching and text extraction functions."""
2
+
3
+ import re
4
+ from typing import Optional
5
+
6
+ import requests
7
+
8
+ try:
9
+ import trafilatura
10
+ except ImportError:
11
+ trafilatura = None
12
+
13
+
14
def fetch_html_requests(url: str, timeout: int, user_agent: Optional[str] = None) -> str:
    """Fetch HTML content from a URL using the requests library.

    Args:
        url: Address to GET.
        timeout: Timeout in seconds passed to ``requests.get``.
        user_agent: Optional ``User-Agent`` header value.

    Returns:
        The response body as text, or ``""`` when the response's
        Content-Type is neither HTML nor XHTML.

    Raises:
        requests.HTTPError: For 4xx/5xx responses (via ``raise_for_status``).
    """
    headers = {"User-Agent": user_agent} if user_agent else {}
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    # Media types are case-insensitive, so normalise before matching;
    # otherwise a server sending "Text/HTML" would be treated as non-HTML.
    content_type = resp.headers.get("content-type", "").lower()
    if "text/html" not in content_type and "application/xhtml+xml" not in content_type:
        return ""
    return resp.text
25
+
26
+
27
def extract_text(html: str, url: str) -> str:
    """Extract readable text from HTML using trafilatura.

    Args:
        html: Raw HTML document.
        url: Source URL, forwarded to ``trafilatura.extract``.

    Returns:
        The text trafilatura extracts. When it yields nothing, a regex-based
        fallback is used: ``<script>``/``<style>`` blocks and all tags are
        removed, HTML entities are decoded and whitespace is collapsed.

    Raises:
        ImportError: If trafilatura is not installed.
    """
    if trafilatura is None:
        raise ImportError("trafilatura is required. Install with: pip install trafilatura")

    text = trafilatura.extract(
        html,
        url=url,
        include_comments=False,
        include_tables=False,
        include_links=False,
    )
    if text:
        return text

    # Fallback for edge cases where trafilatura fails
    from html import unescape

    # Drop script/style elements including their bodies first, so code and
    # CSS do not end up in the "readable" text.
    text = re.sub(r"(?is)<(script|style)\b.*?</\1\s*>", " ", html)
    text = re.sub(r"<[^>]+>", " ", text)
    # Decode entities only after tags are gone, so "&lt;b&gt;" stays literal
    # text rather than being mistaken for markup.
    text = unescape(text)
    return re.sub(r"\s+", " ", text).strip()
45
+
46
+
47
def truncate_text(text: str, max_chars: int) -> str:
    """Truncate text to max_chars, breaking at a word boundary when possible.

    Args:
        text: Input text.
        max_chars: Maximum length. Values of zero or below yield ``""``.

    Returns:
        *text* unchanged when it already fits. Otherwise the first
        ``max_chars - 1`` characters, further cut back to the last space when
        that space lies beyond position 200 (so short clips are not shrunk
        drastically just to end on a word).
    """
    if len(text) <= max_chars:
        return text
    if max_chars <= 0:
        # Without this guard the slice bound below would be negative and
        # keep almost the entire text instead of nothing.
        return ""
    clipped = text[: max_chars - 1]
    last_space = clipped.rfind(" ")
    if last_space > 200:
        clipped = clipped[:last_space]
    return clipped
55
+ return clipped