weft-graph 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- weft/__init__.py +3 -0
- weft/__main__.py +6 -0
- weft/cli.py +372 -0
- weft/export/__init__.py +1 -0
- weft/export/browser.py +156 -0
- weft/export/cache.py +38 -0
- weft/export/crawl.py +55 -0
- weft/export/graph.py +413 -0
- weft/export/llm.py +86 -0
- weft/export/similarity.py +261 -0
- weft/tui/__init__.py +1 -0
- weft/tui/app.py +674 -0
- weft/utils/__init__.py +1 -0
- weft/utils/text.py +78 -0
- weft/utils/url.py +92 -0
- weft_graph-1.0.0.dist-info/METADATA +175 -0
- weft_graph-1.0.0.dist-info/RECORD +20 -0
- weft_graph-1.0.0.dist-info/WHEEL +5 -0
- weft_graph-1.0.0.dist-info/entry_points.txt +2 -0
- weft_graph-1.0.0.dist-info/top_level.txt +1 -0
weft/__init__.py
ADDED
weft/__main__.py
ADDED
weft/cli.py
ADDED
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
"""CLI dispatcher for weft."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
from weft import __version__
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def cmd_weave(args):
    """Export open browser tabs and write the resulting graph as JSON.

    ``args`` is the parsed namespace of the ``weave`` sub-command. Windows
    are collected from Chrome and/or Firefox according to ``args.browser``;
    a failing exporter only produces a warning. If no windows were collected
    at all, the process exits with status 1. Otherwise the graph is built,
    dumped to ``args.out`` and a short summary is printed.
    """
    from weft.export.browser import export_chrome, export_firefox
    from weft.export.graph import GraphOptions, build_tab_graph, load_tabs_from_windows

    # (--browser value, display label, zero-argument exporter), in export order.
    # The lambdas keep argument lookup inside the try block below.
    sources = (
        ("chrome", "Chrome", lambda: export_chrome()),
        ("firefox", "Firefox", lambda: export_firefox(args.firefox_profile, args.verbose)),
    )

    windows = []
    had_error = False
    for choice, label, run_export in sources:
        if args.browser not in (choice, "all"):
            continue
        try:
            exported = run_export()
            windows.extend(exported)
            if args.verbose:
                tab_count = sum(len(win.get("tabs", [])) for win in exported)
                print(f"[INFO] Exported {tab_count} tabs from {label}", file=sys.stderr)
        except Exception as exc:
            # Best effort: remember the failure and continue with the next browser.
            had_error = True
            print(f"[WARN] {label} export failed: {exc}", file=sys.stderr)

    if not windows:
        print("[ERROR] No tabs exported from any browser.", file=sys.stderr)
        sys.exit(1)

    # Flatten the per-window structure into a list of tabs.
    tabs = load_tabs_from_windows(windows)
    if args.verbose:
        print(f"[INFO] Processing {len(tabs)} tabs...", file=sys.stderr)

    # Every GraphOptions field is copied from the CLI attribute of the same name.
    option_names = (
        "out",
        "cache",
        "refresh",
        "no_crawl",
        "max_chars",
        "embed_max_chars",
        "user_agent",
        "timeout",
        "js",
        "summarize",
        "llm_backend",
        "ollama_model",
        "ollama_url",
        "ollama_timeout",
        "embed_model",
        "embed_url",
        "embed_timeout",
        "no_embeddings",
        "store_embeddings",
        "gguf",
        "llama_n_ctx",
        "llama_n_threads",
        "llama_n_gpu_layers",
        "edge_threshold",
        "group_threshold",
        "domain_bonus",
        "no_domain_group",
        "domain_group_min",
        "knn_k",
        "no_mutual_knn",
        "dedupe_hamming",
        "keyword_count",
        "verbose",
    )
    options = GraphOptions(**{name: getattr(args, name) for name in option_names})

    graph = build_tab_graph(tabs, options)

    with open(args.out, "w", encoding="utf-8") as out_file:
        json.dump(graph, out_file, ensure_ascii=False, indent=2)

    stats = graph["stats"]
    if args.summarize:
        mode = "with LLM summaries"
    else:
        mode = "lightweight (keywords only)"
    summary = (
        f"[OK] Wrote {args.out} {mode}\n"
        f" {stats['tab_count']} tabs, {stats['group_count']} groups, "
        f"{stats['edge_count']} edges, {stats['duplicates']} duplicates"
    )
    print(summary)
    if stats["errors"]:
        print(f" {stats['errors']} errors during processing", file=sys.stderr)
    if had_error:
        print("[NOTE] Some browsers failed to export; see warnings above.", file=sys.stderr)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def cmd_explore(args):
    """Start the terminal UI on the graph file named by ``args.graph_json``.

    Prints an error and exits with status 1 when the file does not exist.
    """
    from weft.tui.app import TabGraphApp, load_graph

    graph_path = args.graph_json
    if os.path.exists(graph_path):
        TabGraphApp(load_graph(graph_path)).run()
        return

    print(f"[ERROR] Graph file not found: {graph_path}", file=sys.stderr)
    print("Run 'weft weave' first to generate the graph.", file=sys.stderr)
    sys.exit(1)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def main():
    """Parse the command line and run the selected sub-command.

    Defines the ``weave`` and ``explore`` sub-commands. With no sub-command
    the top-level help is printed and the process exits with status 0.
    """
    parser = argparse.ArgumentParser(
        prog="weft",
        description="A local-first knowledge graph for your browsing",
    )
    parser.add_argument("--version", action="version", version=f"weft {__version__}")
    subparsers = parser.add_subparsers(dest="command", help="Available commands")

    weave_parser = subparsers.add_parser(
        "weave",
        help="Build knowledge graph from browser tabs",
        description="Extract tabs from Chrome/Firefox and weave them into a knowledge graph with clustering.",
    )

    # (flags, add_argument keyword arguments), registered in this order so the
    # generated help lists the options in the same sequence.
    weave_options = (
        # Browser options
        (("--browser",), {
            "choices": ["chrome", "firefox", "all"],
            "default": "all",
            "help": "Which browser(s) to export from (default: all)",
        }),
        (("--firefox-profile",), {
            "help": "Override: path to a specific Firefox profile directory",
        }),
        # Output options
        (("--out",), {
            "default": "weft_graph.json",
            "help": "Output graph JSON path (default: weft_graph.json)",
        }),
        # LLM options
        (("--summarize",), {
            "action": "store_true",
            "help": "Enable LLM summarization (requires Ollama or GGUF model)",
        }),
        (("--llm-backend",), {
            "choices": ["ollama", "gguf"],
            "default": "ollama",
            "help": "LLM backend (default: ollama)",
        }),
        (("--ollama-model",), {
            "default": "llama3.1:8b",
            "help": "Ollama model name (default: llama3.1:8b)",
        }),
        (("--ollama-url",), {
            "default": "http://localhost:11434",
            "help": "Ollama base URL",
        }),
        (("--ollama-timeout",), {
            "type": int,
            "default": 120,
            "help": "Ollama request timeout in seconds",
        }),
        (("--gguf",), {
            "help": "Path to a GGUF model file (for --llm-backend=gguf)",
        }),
        (("--llama-n-ctx",), {
            "type": int,
            "default": 4096,
            "help": "Context size for llama-cpp",
        }),
        (("--llama-n-threads",), {
            "type": int,
            "default": max(1, os.cpu_count() or 1),
            "help": "Threads for llama-cpp",
        }),
        (("--llama-n-gpu-layers",), {
            "type": int,
            "default": 0,
            "help": "GPU layers for llama-cpp",
        }),
        # Embedding options
        (("--embed-model",), {
            "default": "nomic-embed-text",
            "help": "Ollama embedding model name",
        }),
        (("--embed-url",), {
            "help": "Ollama base URL for embeddings (defaults to --ollama-url)",
        }),
        (("--embed-timeout",), {
            "type": int,
            "default": 60,
            "help": "Embedding request timeout",
        }),
        (("--no-embeddings",), {
            "action": "store_true",
            "help": "Disable embeddings (use keyword similarity only)",
        }),
        (("--store-embeddings",), {
            "action": "store_true",
            "help": "Include embeddings in output JSON",
        }),
        # Crawling options
        (("--no-crawl",), {
            "action": "store_true",
            "help": "Skip URL crawling (use metadata only)",
        }),
        (("--max-chars",), {
            "type": int,
            "default": 6000,
            "help": "Max characters sent to LLM",
        }),
        (("--embed-max-chars",), {
            "type": int,
            "default": 2000,
            "help": "Max characters sent to embed model",
        }),
        (("--user-agent",), {
            "default": "Mozilla/5.0",
            "help": "User-Agent for HTTP requests",
        }),
        (("--timeout",), {
            "type": int,
            "default": 20,
            "help": "HTTP timeout in seconds",
        }),
        (("--js",), {
            "action": "store_true",
            "help": "Use Playwright to render JS-heavy pages",
        }),
        # Caching options
        (("--cache",), {
            "default": os.path.join("data", "weft_cache.json"),
            "help": "Cache file path",
        }),
        (("--refresh",), {
            "action": "store_true",
            "help": "Ignore cache and re-fetch everything",
        }),
        # Clustering options
        (("--edge-threshold",), {
            "type": float,
            "default": 0.2,
            "help": "Edge weight threshold",
        }),
        (("--group-threshold",), {
            "type": float,
            "default": 0.25,
            "help": "Grouping similarity threshold",
        }),
        (("--domain-bonus",), {
            "type": float,
            "default": 0.25,
            "help": "Similarity bonus for same-domain tabs",
        }),
        (("--no-domain-group",), {
            "action": "store_true",
            "help": "Disable auto-grouping by domain",
        }),
        (("--domain-group-min",), {
            "type": int,
            "default": 2,
            "help": "Min tabs per domain to auto-group",
        }),
        (("--knn-k",), {
            "type": int,
            "default": 6,
            "help": "Mutual KNN size for grouping",
        }),
        (("--no-mutual-knn",), {
            "action": "store_true",
            "help": "Disable mutual-KNN grouping filter",
        }),
        (("--dedupe-hamming",), {
            "type": int,
            "default": 3,
            "help": "Simhash Hamming distance for dedupe",
        }),
        (("--keyword-count",), {
            "type": int,
            "default": 8,
            "help": "Number of keywords per tab",
        }),
        # General options
        (("-v", "--verbose"), {
            "action": "store_true",
            "help": "Verbose logging",
        }),
    )
    for flags, spec in weave_options:
        weave_parser.add_argument(*flags, **spec)

    explore_parser = subparsers.add_parser(
        "explore",
        help="Launch terminal UI to explore the graph",
        description="Interactive terminal UI for browsing and searching grouped tabs.",
    )
    explore_parser.add_argument(
        "graph_json",
        nargs="?",
        default="weft_graph.json",
        help="Path to graph JSON file (default: weft_graph.json)",
    )

    args = parser.parse_args()

    if args.command is None:
        parser.print_help()
        sys.exit(0)

    # Dispatch on the sub-command name recorded by argparse.
    handlers = {"weave": cmd_weave, "explore": cmd_explore}
    handler = handlers.get(args.command)
    if handler is not None:
        handler(args)
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
if __name__ == "__main__":
|
|
372
|
+
main()
|
weft/export/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Export module for tab extraction and graph building."""
|
weft/export/browser.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
"""Browser tab export functions for Chrome and Firefox on macOS."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import subprocess
|
|
5
|
+
import sys
|
|
6
|
+
import time
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Dict, List, Optional, Tuple
|
|
9
|
+
|
|
10
|
+
FF_BASE = Path.home() / "Library" / "Application Support" / "Firefox" / "Profiles"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def run_jxa(script: str) -> str:
    """Run a JavaScript for Automation script via ``osascript``.

    Args:
        script: JXA source passed to ``osascript -l JavaScript -e``.

    Returns:
        The script's standard output with surrounding whitespace stripped.

    Raises:
        RuntimeError: If ``osascript`` cannot be started (for example when it
            is not installed) or exits with a non-zero status. The original
            exception is attached as ``__cause__``.
    """
    try:
        p = subprocess.run(
            ["osascript", "-l", "JavaScript", "-e", script],
            check=True,
            capture_output=True,
            text=True,
        )
    except subprocess.CalledProcessError as e:
        # Prefer osascript's own error text; fall back to the generic message.
        raise RuntimeError(e.stderr.strip() or str(e)) from e
    except OSError as e:
        # The binary is missing or cannot be launched; report it the same way
        # as a failed script so callers handle a single exception type.
        raise RuntimeError(f"Could not run osascript: {e}") from e
    return p.stdout.strip()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def export_chrome() -> List[Dict]:
    """Export all open Google Chrome tabs by running a JXA script.

    Returns:
        One dict per Chrome window that has at least one tab, with the keys
        ``browser``, ``windowId`` and ``tabs`` (each tab has ``title`` and
        ``url``). Returns an empty list when the script prints nothing; the
        script itself swallows errors while talking to Chrome and then
        reports no windows.
    """
    script = """
    function run() {
      var app = Application.currentApplication();
      app.includeStandardAdditions = true;
      var output = [];
      try {
        var chrome = Application('Google Chrome');
        chrome.windows().forEach(function(w){
          var tabs = w.tabs().map(function(t){ return {title: t.title(), url: t.url()}; });
          if (tabs.length > 0) {
            output.push({browser: 'chrome', windowId: w.id(), tabs: tabs});
          }
        });
      } catch (e) {
        // Chrome not running or AppleScript disabled
      }
      return JSON.stringify(output);
    }
    """
    raw_output = run_jxa(script)
    if not raw_output:
        return []
    return json.loads(raw_output)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def find_all_firefox_profiles() -> List[Path]:
    """List the directories directly under ``FF_BASE``.

    Returns an empty list when ``FF_BASE`` does not exist.
    """
    profiles: List[Path] = []
    if FF_BASE.exists():
        for entry in FF_BASE.iterdir():
            if entry.is_dir():
                profiles.append(entry)
    return profiles
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def best_sessionstore_for_profile(prof: Path) -> Optional[Tuple[Path, float]]:
    """Pick the most recently modified sessionstore file of a profile.

    Candidates are ``recovery.jsonlz4``, ``previous.jsonlz4`` and any
    ``upgrade.jsonlz4*`` inside ``sessionstore-backups``, plus
    ``sessionstore.jsonlz4`` in the profile root.

    Returns:
        ``(path, mtime)`` of the newest regular file, or ``None`` when no
        candidate exists.
    """
    backups = prof / "sessionstore-backups"

    found = []
    for name in ("recovery.jsonlz4", "previous.jsonlz4"):
        backup_file = backups / name
        if backup_file.exists():
            found.append(backup_file)
    found += list(backups.glob("upgrade.jsonlz4*"))

    root_file = prof / "sessionstore.jsonlz4"
    if root_file.exists():
        found.append(root_file)

    stamped = [(path, path.stat().st_mtime) for path in found if path.is_file()]
    if not stamped:
        return None
    # max() keeps the first entry among equal mtimes, like a stable descending sort.
    return max(stamped, key=lambda pair: pair[1])
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def choose_firefox_profile_with_fresh_session() -> Optional[Tuple[Path, Path]]:
    """Find the profile whose sessionstore file was modified most recently.

    Returns:
        ``(profile_dir, sessionstore_path)``, or ``None`` when no profile has
        a sessionstore file.
    """
    freshest = None  # (profile, session file, mtime) of the best match so far
    for profile in find_all_firefox_profiles():
        best = best_sessionstore_for_profile(profile)
        if not best:
            continue
        session_file, mtime = best
        # Strict comparison keeps the earliest profile when mtimes are equal.
        if freshest is None or mtime > freshest[2]:
            freshest = (profile, session_file, mtime)

    if freshest is None:
        return None
    return freshest[0], freshest[1]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def export_firefox(
    profile_override: Optional[str] = None, verbose: bool = False
) -> List[Dict]:
    """Read the open tabs recorded in a Firefox sessionstore file.

    Args:
        profile_override: Path to a specific profile directory. When falsy,
            the profile with the most recently modified sessionstore file is
            used.
        verbose: Print the chosen profile and session file to stderr.

    Returns:
        One dict per window that has at least one tab with a URL, with the
        keys ``browser``, ``windowId`` (always ``None``) and ``tabs``.

    Raises:
        RuntimeError: If ``lz4`` is not installed or the file does not start
            with the expected mozlz4 header.
        FileNotFoundError: If the profile or a sessionstore file is missing.
    """
    try:
        from lz4.block import decompress as lz4_decompress
    except ImportError as e:
        raise RuntimeError(
            "Python package 'lz4' is required for Firefox export. Install with: pip install lz4"
        ) from e

    selected_profile: Optional[Path] = None
    session_path: Optional[Path] = None

    if not profile_override:
        chosen = choose_firefox_profile_with_fresh_session()
        if not chosen:
            raise FileNotFoundError(
                "No Firefox sessionstore *.jsonlz4 files found in ANY profile. "
                "Open Firefox (non-private window), ensure tabs are open, wait 10s, and try again."
            )
        selected_profile, session_path = chosen
    else:
        override_dir = Path(profile_override).expanduser()
        if not override_dir.exists():
            raise FileNotFoundError(f"Provided --firefox-profile does not exist: {override_dir}")
        best = best_sessionstore_for_profile(override_dir)
        if not best:
            raise FileNotFoundError(
                f"No sessionstore files found in provided profile: {override_dir}"
            )
        selected_profile = override_dir
        session_path = best[0]

    if verbose:
        print(f"[INFO] Using Firefox profile: {selected_profile}", file=sys.stderr)
        print(
            f"[INFO] Sessionstore file: {session_path} (mtime={time.ctime(session_path.stat().st_mtime)})",
            file=sys.stderr,
        )

    raw = session_path.read_bytes()
    # The file must start with this 8-byte magic; the LZ4 block follows it.
    if not raw.startswith(b"mozLz40\x00"):
        raise RuntimeError("Unexpected Firefox sessionstore header; not mozlz4.")

    session = json.loads(lz4_decompress(raw[8:]).decode("utf-8"))

    payload = []
    for window in session.get("windows", []):
        tabs = []
        for tab in window.get("tabs", []):
            position = tab.get("index", 1)
            entries = tab.get("entries", [])
            # "index" is 1-based; skip tabs whose index is outside the entries list.
            if not 1 <= position <= len(entries):
                continue
            current = entries[position - 1]
            url = current.get("url", "")
            if url:
                tabs.append({"title": current.get("title", ""), "url": url})
        if tabs:
            payload.append({"browser": "firefox", "windowId": None, "tabs": tabs})
    return payload
|
weft/export/cache.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Cache management for tab graph building."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
from typing import Dict, Optional, Tuple
|
|
6
|
+
|
|
7
|
+
from weft.utils.url import canonicalize_url
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def load_cache(path: Optional[str]) -> Dict[str, Dict]:
    """Load the cache mapping from a JSON file.

    The cache is disposable, so an unusable file must not abort the run: a
    file that is not valid UTF-8 JSON, or whose top-level value is not an
    object, is reported on stderr and treated as an empty cache.

    Args:
        path: Cache file path; falsy or non-existent paths yield ``{}``.

    Returns:
        The parsed cache dict, or ``{}`` when nothing usable could be loaded.
    """
    import sys  # local import: only needed for the warning path

    if not path or not os.path.exists(path):
        return {}
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except ValueError as e:
        # Covers json.JSONDecodeError and UnicodeDecodeError, e.g. a cache
        # file truncated by an interrupted write.
        print(f"[WARN] Ignoring unreadable cache file {path}: {e}", file=sys.stderr)
        return {}
    if not isinstance(data, dict):
        print(f"[WARN] Ignoring cache file {path}: expected a JSON object", file=sys.stderr)
        return {}
    return data
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def get_cache_entry(
    cache: Dict[str, Dict], url: str
) -> Tuple[Optional[Dict], Optional[str]]:
    """Look up ``url`` in the cache, first as given and then canonicalized.

    Returns:
        ``(entry, key)`` where ``key`` is the cache key that matched, or
        ``(None, None)`` when the cache is empty or neither form is present.
    """
    if not cache:
        return None, None

    # Computed up front so canonicalization always runs for a non-empty cache.
    canonical = canonicalize_url(url)
    if url in cache:
        return cache[url], url
    if canonical in cache:
        return cache[canonical], canonical
    return None, None
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def save_cache(path: Optional[str], cache: Dict[str, Dict]) -> None:
    """Write the cache to ``path`` as JSON without clobbering it on failure.

    The data is serialized to a temporary file in the same directory and
    then moved over ``path`` with ``os.replace``. An error or interruption
    during serialization therefore leaves the existing cache file untouched
    instead of truncated.

    Args:
        path: Destination file; a falsy value makes this a no-op. Missing
            parent directories are created.
        cache: Mapping to serialize.

    Raises:
        TypeError, ValueError, OSError: Propagated from serialization or file
            handling, after the temporary file has been removed.
    """
    if not path:
        return
    dir_name = os.path.dirname(path)
    if dir_name:
        os.makedirs(dir_name, exist_ok=True)

    # Same directory as the target so os.replace stays on one filesystem.
    tmp_path = f"{path}.{os.getpid()}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(cache, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, path)
    except BaseException:
        # Do not leave a half-written temporary file behind.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
|
weft/export/crawl.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""HTML fetching and text extraction functions."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
import requests
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
import trafilatura
|
|
10
|
+
except ImportError:
|
|
11
|
+
trafilatura = None
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def fetch_html_requests(url: str, timeout: int, user_agent: Optional[str] = None) -> str:
    """Fetch a URL with ``requests`` and return its body if it is HTML.

    Args:
        url: Address to fetch.
        timeout: Timeout in seconds passed to ``requests.get``.
        user_agent: Optional ``User-Agent`` header value.

    Returns:
        The response text, or ``""`` when the Content-Type is neither
        ``text/html`` nor ``application/xhtml+xml``.

    Raises:
        requests.HTTPError: For 4xx/5xx responses (via ``raise_for_status``).
    """
    headers = {}
    if user_agent:
        headers["User-Agent"] = user_agent
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    # Media types are case-insensitive, so normalise before comparing.
    content_type = resp.headers.get("content-type", "").lower()
    if "text/html" not in content_type and "application/xhtml+xml" not in content_type:
        return ""
    return resp.text
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def extract_text(html: str, url: str) -> str:
    """Return readable text for an HTML document.

    Uses ``trafilatura.extract`` with comments, tables and links excluded.
    When that yields nothing, falls back to replacing tags with spaces and
    collapsing whitespace.

    Raises:
        ImportError: If trafilatura is not installed.
    """
    if trafilatura is None:
        raise ImportError("trafilatura is required. Install with: pip install trafilatura")

    extracted = trafilatura.extract(
        html,
        url=url,
        include_comments=False,
        include_tables=False,
        include_links=False,
    )
    if not extracted:
        # Crude fallback: drop anything that looks like a tag, then squeeze whitespace.
        without_tags = re.sub(r"<[^>]+>", " ", html)
        extracted = re.sub(r"\s+", " ", without_tags).strip()
    return extracted
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def truncate_text(text: str, max_chars: int) -> str:
    """Shorten ``text`` to fewer than ``max_chars`` characters.

    Text that already fits is returned unchanged. Otherwise it is cut to
    ``max_chars - 1`` characters and, if a space occurs after position 200
    of the cut text, trimmed back to that last space so the result does not
    end in a partial word.

    Args:
        text: Text to shorten.
        max_chars: Character budget. A zero or negative budget yields ``""``
            for any text that does not already fit.

    Returns:
        The possibly shortened text.
    """
    if len(text) <= max_chars:
        return text
    if max_chars <= 0:
        # Without this guard the slice below would use a negative end index
        # and return almost the entire text.
        return ""
    clipped = text[: max_chars - 1]
    last_space = clipped.rfind(" ")
    # Only back up to a word boundary when enough text remains.
    if last_space > 200:
        clipped = clipped[:last_space]
    return clipped
|