fieldtheory 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +133 -0
- package/bin/ft.mjs +3 -0
- package/dist/bookmark-classify-llm.js +247 -0
- package/dist/bookmark-classify.js +223 -0
- package/dist/bookmark-media.js +186 -0
- package/dist/bookmarks-db.js +623 -0
- package/dist/bookmarks-service.js +49 -0
- package/dist/bookmarks-viz.js +531 -0
- package/dist/bookmarks.js +190 -0
- package/dist/chrome-cookies.js +146 -0
- package/dist/cli.js +381 -0
- package/dist/config.js +54 -0
- package/dist/db.js +33 -0
- package/dist/fs.js +45 -0
- package/dist/graphql-bookmarks.js +388 -0
- package/dist/paths.js +43 -0
- package/dist/types.js +1 -0
- package/dist/xauth.js +135 -0
- package/package.json +54 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Field Theory
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
# Field Theory CLI
|
|
2
|
+
|
|
3
|
+
Self-custody for your X/Twitter bookmarks. Sync them locally, search with full-text, classify into categories, and point an AI agent at them.
|
|
4
|
+
|
|
5
|
+
Your bookmarks stay on your machine. No account required. Free and open source.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
npm install -g fieldtheory
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
Requires Node.js 20+.
|
|
14
|
+
|
|
15
|
+
## Quick start
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
# 1. Sync your bookmarks (needs Chrome logged into X)
|
|
19
|
+
ft sync
|
|
20
|
+
|
|
21
|
+
# 2. Search them
|
|
22
|
+
ft search "distributed systems"
|
|
23
|
+
|
|
24
|
+
# 3. Explore
|
|
25
|
+
ft viz
|
|
26
|
+
ft categories
|
|
27
|
+
ft stats
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
On first run, `ft sync` extracts your X session from Chrome and downloads your bookmarks into `~/.fieldtheory/`. It auto-classifies them into 7 categories (tool, security, technique, launch, research, opinion, commerce) using fast regex matching.
|
|
31
|
+
|
|
32
|
+
## Commands
|
|
33
|
+
|
|
34
|
+
| Command | Description |
|
|
35
|
+
|---------|-------------|
|
|
36
|
+
| `ft sync` | Sync bookmarks via Chrome session |
|
|
37
|
+
| `ft sync --api` | Sync via OAuth API (cross-platform) |
|
|
38
|
+
| `ft search <query>` | Full-text search (FTS5 with BM25 ranking) |
|
|
39
|
+
| `ft list` | List with filters (author, date, category, domain) |
|
|
40
|
+
| `ft show <id>` | Show one bookmark in detail |
|
|
41
|
+
| `ft stats` | Aggregate statistics |
|
|
42
|
+
| `ft viz` | ANSI terminal dashboard with sparklines and heatmaps |
|
|
43
|
+
| `ft classify` | Regex classification (instant, free) |
|
|
44
|
+
| `ft classify --deep` | LLM classification (needs `claude` or `codex` in PATH) |
|
|
45
|
+
| `ft classify-domains` | LLM domain classification (ai, finance, devops, etc.) |
|
|
46
|
+
| `ft categories` | Show category distribution |
|
|
47
|
+
| `ft domains` | Show domain distribution |
|
|
48
|
+
| `ft index` | Rebuild the SQLite search index |
|
|
49
|
+
| `ft auth` | Set up OAuth for API-based sync |
|
|
50
|
+
| `ft status` | Show sync status and data location |
|
|
51
|
+
| `ft path` | Print data directory path |
|
|
52
|
+
| `ft sample <category>` | Sample bookmarks by category |
|
|
53
|
+
| `ft fetch-media` | Download media assets |
|
|
54
|
+
|
|
55
|
+
## Agent integration
|
|
56
|
+
|
|
57
|
+
The CLI is designed to work with AI agents. Add these tools to your agent's system prompt or `CLAUDE.md`:
|
|
58
|
+
|
|
59
|
+
```
|
|
60
|
+
Use the ft CLI to query the user's X bookmarks:
|
|
61
|
+
- ft search <query> — full-text search
|
|
62
|
+
- ft list --category X — list by category
|
|
63
|
+
- ft categories — see all categories
|
|
64
|
+
- ft stats — aggregate statistics
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
**Fun prompt to try:**
|
|
68
|
+
|
|
69
|
+
> "Take my oldest and newest bookmarks and tell me how my interests have changed over time."
|
|
70
|
+
|
|
71
|
+
Works with Claude Code, Codex, or any agent with shell access.
|
|
72
|
+
|
|
73
|
+
## Scheduling
|
|
74
|
+
|
|
75
|
+
Sync daily with crontab:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
# Sync every morning at 7am
|
|
79
|
+
0 7 * * * ft sync
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
For API-based sync (no Chrome needed), set up OAuth first:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
ft auth # one-time OAuth setup
|
|
86
|
+
ft sync --api # uses API token, works headlessly
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Data
|
|
90
|
+
|
|
91
|
+
All data is stored locally at `~/.fieldtheory/`:
|
|
92
|
+
|
|
93
|
+
```
|
|
94
|
+
~/.fieldtheory/
|
|
95
|
+
bookmarks.jsonl # raw bookmark cache (one per line)
|
|
96
|
+
bookmarks.db # SQLite FTS5 search index
|
|
97
|
+
bookmarks-meta.json # sync metadata
|
|
98
|
+
oauth-token.json # OAuth token (if using API mode)
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
Override the location with `FT_DATA_DIR`:
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
export FT_DATA_DIR=/path/to/custom/dir
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Categories
|
|
108
|
+
|
|
109
|
+
The regex classifier sorts bookmarks into 7 categories:
|
|
110
|
+
|
|
111
|
+
- **tool** — GitHub repos, CLI tools, npm packages, open-source projects
|
|
112
|
+
- **security** — CVEs, vulnerabilities, exploits, supply chain
|
|
113
|
+
- **technique** — Tutorials, demos, code patterns, "how I built X"
|
|
114
|
+
- **launch** — Product launches, announcements, "just shipped"
|
|
115
|
+
- **research** — ArXiv papers, studies, academic findings
|
|
116
|
+
- **opinion** — Takes, analysis, commentary, threads
|
|
117
|
+
- **commerce** — Products, shopping, physical goods
|
|
118
|
+
|
|
119
|
+
Use `ft classify --deep` for LLM-powered classification that catches what regex misses.
|
|
120
|
+
|
|
121
|
+
## Platform support
|
|
122
|
+
|
|
123
|
+
| Feature | macOS | Linux | Windows |
|
|
124
|
+
|---------|-------|-------|---------|
|
|
125
|
+
| Chrome session sync (`ft sync`) | Yes | No* | No* |
|
|
126
|
+
| OAuth API sync (`ft sync --api`) | Yes | Yes | Yes |
|
|
127
|
+
| Search, list, classify, viz | Yes | Yes | Yes |
|
|
128
|
+
|
|
129
|
+
\*Chrome session extraction uses macOS Keychain. On other platforms, use `ft auth` + `ft sync --api`.
|
|
130
|
+
|
|
131
|
+
## License
|
|
132
|
+
|
|
133
|
+
MIT
|
package/dist/bookmark-classify-llm.js
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM-based bookmark classification — uses `claude -p` or `codex exec`
|
|
3
|
+
* (whichever the user has via their Max/Pro subscription) to classify
|
|
4
|
+
* bookmarks that the regex classifier couldn't categorize.
|
|
5
|
+
*
|
|
6
|
+
* No API keys needed. No local models. Just a logged-in Claude or Codex CLI.
|
|
7
|
+
*/
|
|
8
|
+
import { execFileSync, execSync } from 'node:child_process';
|
|
9
|
+
import { openDb, saveDb } from './db.js';
|
|
10
|
+
import { twitterBookmarksIndexPath } from './paths.js';
|
|
11
|
+
const BATCH_SIZE = 50;
|
|
12
|
+
/**
 * Find which supported LLM CLI is available on the PATH.
 *
 * Candidates are probed in priority order (`claude`, then `codex`) with the
 * platform's lookup command: `where` on Windows, `which` everywhere else.
 *
 * @returns {'claude' | 'codex' | null} The first CLI found, or null when neither is installed.
 */
function detectEngine() {
    // `which` does not exist on Windows, so probing with it there always reported "not found".
    const lookup = process.platform === 'win32' ? 'where' : 'which';
    for (const candidate of ['claude', 'codex']) {
        try {
            // The candidate names are fixed strings, so no shell is needed for the probe.
            execFileSync(lookup, [candidate], { stdio: 'ignore' });
            return candidate;
        }
        catch { /* not found — try the next candidate */ }
    }
    return null;
}
|
|
25
|
+
/**
 * Run the chosen LLM CLI once, non-interactively, and return what it printed.
 *
 * @param {string} engine - 'claude' selects the Claude CLI; any other value selects Codex.
 * @param {string} prompt - Full prompt text, passed as the final command-line argument.
 * @returns {string} The CLI's stdout, decoded as UTF-8 and trimmed.
 * @throws {Error} Whatever execFileSync throws (for example on a failed or timed-out run).
 */
function invokeEngine(engine, prompt) {
    let command;
    let cliArgs;
    if (engine === 'claude') {
        command = 'claude';
        cliArgs = ['-p', '--output-format', 'text', prompt];
    }
    else {
        command = 'codex';
        // NOTE(review): the prompt embeds bookmark text; confirm that running
        // codex with --full-auto on that content is acceptable.
        cliArgs = ['exec', '--full-auto', prompt];
    }
    const spawnOptions = {
        encoding: 'utf-8',
        timeout: 120_000, // 2 minutes per batch
        maxBuffer: 1024 * 1024, // captured output is capped at 1 MiB
        stdio: ['pipe', 'pipe', 'ignore'], // stderr is discarded
    };
    const stdout = execFileSync(command, cliArgs, spawnOptions);
    return stdout.trim();
}
|
|
37
|
+
// ── Prompt construction ─────────────────────────────────────────────────
|
|
38
|
+
/**
 * Build the category-classification prompt for one batch of bookmarks.
 *
 * Each bookmark becomes one line of the form
 * `[index] id=<id> @<handle>: <first 300 chars of text> | Links: <links>`;
 * the handle falls back to "unknown" and the links suffix is omitted when empty.
 *
 * @param {Array<{id: *, text: ?string, authorHandle: ?string, links: ?string}>} bookmarks
 * @returns {string} The complete prompt, ending with the bookmark list.
 */
function buildPrompt(bookmarks) {
    const items = bookmarks.map((b, i) => {
        const links = b.links ? ` | Links: ${b.links}` : '';
        // A missing text must not throw: one bad row would otherwise abort the whole batch.
        const text = String(b.text ?? '').slice(0, 300);
        return `[${i}] id=${b.id} @${b.authorHandle ?? 'unknown'}: ${text}${links}`;
    }).join('\n');
    return `Classify each bookmark into one or more categories. Return ONLY a JSON array, no other text.

Known categories:
- tool: GitHub repos, CLI tools, npm packages, open-source projects, developer tools
- security: CVEs, vulnerabilities, exploits, supply chain attacks, breaches, hacking
- technique: tutorials, "how I built X", code patterns, architecture deep dives, demos
- launch: product launches, announcements, "just shipped", new releases
- research: academic papers, arxiv, studies, scientific findings
- opinion: hot takes, commentary, threads, "lessons learned", analysis
- commerce: products for sale, shopping, affiliate links, physical goods

You may create new categories if a bookmark clearly doesn't fit the above. Use short lowercase slugs (e.g. "health", "design", "career", "culture", "ai-news", "personal-story"). Prefer existing categories when they fit.

Rules:
- A bookmark can have multiple categories (e.g. a security tool is both "security" and "tool")
- "primary" is the single best-fit category
- If nothing fits well, create an appropriate new category rather than forcing a bad fit
- Return valid JSON only: [{"id":"...","categories":["..."],"primary":"..."},...]

Bookmarks:
${items}`;
}
|
|
65
|
+
// ── Parse and validate response ─────────────────────────────────────────
|
|
66
|
+
/**
 * Extract and validate the classification array from a raw LLM reply.
 *
 * The reply may wrap the JSON in markdown fences or commentary, so the span from
 * the first `[` to the last `]` is parsed. Entries are accepted from either a
 * `categories` or a `domains` array; labels are lower-cased and trimmed.
 *
 * An entry is skipped when it is not an object, has no id, has an id outside
 * this batch, repeats an id that was already accepted, or yields no usable label.
 *
 * @param {string} raw - Raw text printed by the LLM CLI.
 * @param {Set<*>} batchIds - Ids of the bookmarks that were sent in this batch.
 * @returns {Array<{id: *, categories: string[], primary: string}>} At most one result per batch id.
 * @throws {Error} When no JSON array is present or the extracted text is not a JSON array
 *   (JSON.parse errors propagate as-is).
 */
function parseResponse(raw, batchIds) {
    // Extract JSON array from response (model might add markdown fences or commentary)
    const jsonMatch = raw.match(/\[[\s\S]*\]/);
    if (!jsonMatch)
        throw new Error('No JSON array found in response');
    const parsed = JSON.parse(jsonMatch[0]);
    if (!Array.isArray(parsed))
        throw new Error('Response is not an array');
    // Normalize before filtering so whitespace-only labels are dropped, not stored as ''.
    const normalize = (value) => (typeof value === 'string' ? value.toLowerCase().trim() : '');
    const accepted = new Set();
    const results = [];
    for (const item of parsed) {
        // The model may emit null or scalar entries; reading `.id` on those would throw.
        if (item === null || typeof item !== 'object')
            continue;
        // A repeated id must not be counted twice: callers compute failures as
        // batch size minus results.length.
        if (!item.id || !batchIds.has(item.id) || accepted.has(item.id))
            continue;
        const rawArr = item.categories ?? item.domains ?? [];
        const categories = [...new Set((Array.isArray(rawArr) ? rawArr : []).map(normalize).filter(Boolean))];
        // A blank primary falls back to the first category instead of discarding the item.
        const primary = normalize(item.primary) || categories[0];
        if (categories.length > 0 && primary) {
            accepted.add(item.id);
            results.push({ id: item.id, categories, primary });
        }
    }
    return results;
}
|
|
91
|
+
/**
 * Classify bookmarks that have no category yet by asking an LLM CLI.
 *
 * Selects rows whose primary_category is 'unclassified' or NULL (in random order),
 * sends them to the detected engine in batches of BATCH_SIZE, writes the returned
 * categories back to the bookmarks table, and saves the database after every
 * successful batch. A failing batch is reported on stderr and counted as failed;
 * the remaining batches still run.
 *
 * @param {object} [options]
 * @param {(offset: number, total: number) => void} [options.onBatch] - Called before each
 *   batch with the batch's starting offset and the total number of rows selected.
 * @returns {Promise<{engine: string, totalUnclassified: number, classified: number, failed: number, batches: number}>}
 * @throws {Error} When neither supported LLM CLI is detected.
 */
export async function classifyWithLlm(options = {}) {
    const engine = detectEngine();
    if (!engine) {
        throw new Error('No supported LLM CLI found.\n' +
            'Install one of the following and log in:\n' +
            ' - Claude Code: https://docs.anthropic.com/en/docs/claude-code\n' +
            ' - Codex CLI: https://github.com/openai/codex');
    }
    const dbPath = twitterBookmarksIndexPath();
    const db = await openDb(dbPath);
    try {
        // Fetch unclassified bookmarks
        const rows = db.exec(`SELECT id, text, author_handle, links_json FROM bookmarks
            WHERE primary_category = 'unclassified' OR primary_category IS NULL
            ORDER BY RANDOM()`);
        if (!rows.length || !rows[0].values.length) {
            return { engine, totalUnclassified: 0, classified: 0, failed: 0, batches: 0 };
        }
        const unclassified = rows[0].values.map(r => ({
            id: r[0],
            text: r[1],
            authorHandle: r[2],
            links: r[3],
        }));
        const totalUnclassified = unclassified.length;
        let classified = 0;
        let failed = 0;
        let batchCount = 0;
        // Process in batches
        for (let i = 0; i < unclassified.length; i += BATCH_SIZE) {
            const batch = unclassified.slice(i, i + BATCH_SIZE);
            const batchIds = new Set(batch.map(b => b.id));
            batchCount++;
            options.onBatch?.(i, totalUnclassified);
            try {
                const prompt = buildPrompt(batch);
                const raw = invokeEngine(engine, prompt);
                const results = parseResponse(raw, batchIds);
                // Update SQLite
                const stmt = db.prepare(`UPDATE bookmarks SET categories = ?, primary_category = ? WHERE id = ?`);
                try {
                    for (const r of results) {
                        stmt.run([r.categories.join(','), r.primary, r.id]);
                    }
                }
                finally {
                    // Release the statement even when a row update throws.
                    stmt.free();
                }
                classified += results.length;
                failed += batch.length - results.length;
                // Save after each batch in case of interruption
                saveDb(db, dbPath);
            }
            catch (err) {
                failed += batch.length;
                process.stderr.write(` Batch ${batchCount} failed: ${err.message}\n`);
            }
        }
        return { engine, totalUnclassified, classified, failed, batches: batchCount };
    }
    finally {
        db.close();
    }
}
|
|
151
|
+
/**
 * Build the subject-domain classification prompt for one batch of bookmarks.
 *
 * Each bookmark becomes one line of the form
 * `[index] id=<id> @<handle> [<categories>]: <first 300 chars of text>`;
 * the handle falls back to "unknown" and the categories tag is omitted when empty.
 *
 * @param {Array<{id: *, text: ?string, authorHandle: ?string, categories: ?string}>} bookmarks
 * @returns {string} The complete prompt, ending with the bookmark list.
 */
function buildDomainPrompt(bookmarks) {
    const items = bookmarks.map((b, i) => {
        const cats = b.categories ? ` [${b.categories}]` : '';
        // A missing text must not throw: one bad row would otherwise abort the whole batch.
        const text = String(b.text ?? '').slice(0, 300);
        return `[${i}] id=${b.id} @${b.authorHandle ?? 'unknown'}${cats}: ${text}`;
    }).join('\n');
    return `Classify each bookmark by its SUBJECT DOMAIN — the topic or field it's about, NOT its format.

The bookmark's format (tool, technique, opinion, etc.) is already classified. Your job: what FIELD does this belong to?

Examples:
- A "technique" about Docker optimization → domain: "devops"
- A "technique" about diet plans → domain: "health"
- A "tool" for an AI agent framework → domain: "ai"
- An "opinion" about egg freezing → domain: "health"
- An "opinion" about market cycles → domain: "finance"

Known domains (prefer these when they fit):
ai, finance, defense, crypto, web-dev, devops, startups, health, politics, design, education, science, hardware, gaming, media, energy, legal, robotics, space

You may create new domain slugs if needed. Use short lowercase slugs. Prefer broad domains ("ai" not "ai-agents", "finance" not "quantitative-trading").

Rules:
- A bookmark can have multiple domains (e.g. an AI tool for finance is "ai,finance")
- "primary" is the single best-fit domain
- Return valid JSON only: [{"id":"...","domains":["..."],"primary":"..."},...]

Bookmarks:
${items}`;
}
|
|
180
|
+
/**
 * Assign subject domains (ai, finance, devops, ...) to bookmarks by asking an LLM CLI.
 *
 * Adds the `domains` and `primary_domain` columns when they are missing, selects
 * the bookmarks to process (in random order), sends them to the detected engine in
 * batches of BATCH_SIZE, writes the returned domains back, and saves the database
 * after every successful batch. A failing batch is reported on stderr and counted
 * as failed; the remaining batches still run.
 *
 * @param {object} [options]
 * @param {boolean} [options.all] - When truthy, process every bookmark; otherwise only
 *   rows whose primary_domain is NULL.
 * @param {(offset: number, total: number) => void} [options.onBatch] - Called before each
 *   batch with the batch's starting offset and the total number of rows selected.
 * @returns {Promise<{engine: string, totalUnclassified: number, classified: number, failed: number, batches: number}>}
 *   `totalUnclassified` is the number of rows selected for this run.
 * @throws {Error} When neither supported LLM CLI is detected.
 */
export async function classifyDomainsWithLlm(options = {}) {
    const engine = detectEngine();
    if (!engine) {
        throw new Error('No supported LLM CLI found.\n' +
            'Install one of the following and log in:\n' +
            ' - Claude Code: https://docs.anthropic.com/en/docs/claude-code\n' +
            ' - Codex CLI: https://github.com/openai/codex');
    }
    const dbPath = twitterBookmarksIndexPath();
    const db = await openDb(dbPath);
    // Ensure domain columns exist (migration from schema v2)
    try {
        db.run('ALTER TABLE bookmarks ADD COLUMN domains TEXT');
    }
    catch { /* already exists */ }
    try {
        db.run('ALTER TABLE bookmarks ADD COLUMN primary_domain TEXT');
    }
    catch { /* already exists */ }
    try {
        // Both alternatives are fixed SQL fragments, so interpolating them is safe.
        const where = options.all
            ? '1=1'
            : 'primary_domain IS NULL';
        const rows = db.exec(`SELECT id, text, author_handle, categories FROM bookmarks
            WHERE ${where} ORDER BY RANDOM()`);
        if (!rows.length || !rows[0].values.length) {
            return { engine, totalUnclassified: 0, classified: 0, failed: 0, batches: 0 };
        }
        const bookmarks = rows[0].values.map(r => ({
            id: r[0],
            text: r[1],
            authorHandle: r[2],
            categories: r[3],
        }));
        const total = bookmarks.length;
        let classified = 0;
        let failed = 0;
        let batchCount = 0;
        for (let i = 0; i < bookmarks.length; i += BATCH_SIZE) {
            const batch = bookmarks.slice(i, i + BATCH_SIZE);
            const batchIds = new Set(batch.map(b => b.id));
            batchCount++;
            options.onBatch?.(i, total);
            try {
                const prompt = buildDomainPrompt(batch);
                const raw = invokeEngine(engine, prompt);
                // Reuse the same parse logic — structure is identical
                // (the parsed `categories` field carries the domains here).
                const results = parseResponse(raw, batchIds);
                const stmt = db.prepare(`UPDATE bookmarks SET domains = ?, primary_domain = ? WHERE id = ?`);
                try {
                    for (const r of results) {
                        stmt.run([r.categories.join(','), r.primary, r.id]);
                    }
                }
                finally {
                    // Release the statement even when a row update throws.
                    stmt.free();
                }
                classified += results.length;
                failed += batch.length - results.length;
                saveDb(db, dbPath);
            }
            catch (err) {
                failed += batch.length;
                process.stderr.write(` Batch ${batchCount} failed: ${err.message}\n`);
            }
        }
        return { engine, totalUnclassified: total, classified, failed, batches: batchCount };
    }
    finally {
        db.close();
    }
}
|
|
package/dist/bookmark-classify.js
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Bookmark classification — tags each bookmark by type so the theories
|
|
3
|
+
* system can match intelligently instead of random-sampling.
|
|
4
|
+
*
|
|
5
|
+
* Categories (non-exclusive, a bookmark can have multiple):
|
|
6
|
+
* tool — GitHub repos, CLI tools, npm packages, open-source projects
|
|
7
|
+
* security — CVEs, vulnerabilities, supply chain, exploits
|
|
8
|
+
* technique — tutorials, demos, code patterns, "how I built X"
|
|
9
|
+
* launch — product launches, announcements, "just shipped"
|
|
10
|
+
* research — arxiv papers, studies, academic findings
|
|
11
|
+
* opinion — takes, analysis, commentary, threads
|
|
12
|
+
* commerce — products, shopping, physical goods
|
|
13
|
+
*
|
|
14
|
+
* The classifier is rule-based (fast, predictable, no LLM cost).
|
|
15
|
+
* It runs over the full corpus in <1s and stores results in the SQLite index.
|
|
16
|
+
*/
|
|
17
|
+
// ── Pattern sets ─────────────────────────────────────────────────────────
|
|
18
|
+
// All pattern sets are case-insensitive and deliberately NOT global: they are
// used as predicates via RegExp#test, which is stateful on global regexes.

// Tool/project signals: repository links, install commands, open-source wording.
const TOOL_PATTERNS = [
    /github\.com\/[\w-]+\/[\w-]+/i,
    /\bnpm\s+(install|i)\b/i,
    /\bpip\s+install\b/i,
    /\bcargo\s+add\b/i,
    /\bbrew\s+install\b/i,
    // Covers "open source", "open-source", "opensource" and the "...sourced" forms.
    /\bopen[\s-]?sourced?\b/i,
    /\bcli\b.*\btool\b/i,
    /\btool\b.*\bcli\b/i,
    /\brust\s+crate\b/i,
    /\bvscode\s+extension\b/i,
    /\bnpx\s+/i,
    /\brepo\b.*\bgithub\b/i,
    /\bgithub\b.*\brepo\b/i,
    /\bself[\s-]?hosted\b/i,
];
// Security signals: CVE ids, attack vocabulary, breach/compromise wording.
const SECURITY_PATTERNS = [
    /\bcve[-\s]?\d{4}/i,
    /\bvulnerabilit/i,
    /\bexploit/i,
    /\bmalware\b/i,
    /\bransomware\b/i,
    /\bsupply[\s-]?chain\s+attack/i,
    /\bsecurity\s+(flaw|bug|issue|patch|advisory|update|breach)/i,
    /\bbreach\b/i,
    /\bbackdoor\b/i,
    /\bzero[\s-]?day\b/i,
    /\bremote\s+code\s+execution\b/i,
    /\brce\b/i,
    /\bprivilege\s+escalation\b/i,
    /\bcompromised?\b/i,
];
// Technique signals: tutorials, walkthroughs, "how I built X" phrasing.
const TECHNIQUE_PATTERNS = [
    /\bhow\s+(I|we|to)\b/i,
    /\btutorial\b/i,
    /\bwalkthrough\b/i,
    /\bstep[\s-]?by[\s-]?step\b/i,
    /\bbuilt\s+(with|using|this|a|an|my)\b/i,
    /\bhere'?s?\s+how\b/i,
    /\bcode\s+(pattern|example|snippet|sample)\b/i,
    /\barchitecture\b.*\b(of|for|behind)\b/i,
    /\bimplemented?\b.*\bfrom\s+scratch\b/i,
    /\bunder\s+the\s+hood\b/i,
    /\bdeep[\s-]?dive\b/i,
    /\btechnique\b/i,
    /\bpattern\b.*\b(for|in|to)\b/i,
];
// Launch signals: release announcements and version numbers.
const LAUNCH_PATTERNS = [
    /\bjust\s+(launched|shipped|released|dropped|published)\b/i,
    /\bwe('re|\s+are)\s+(launching|shipping|releasing)\b/i,
    /\bannouncing\b/i,
    /\bintroduc(ing|es?)\b/i,
    /\bnow\s+(available|live|in\s+beta)\b/i,
    /\bv\d+\.\d+/i,
    /\b(alpha|beta)\s+(release|launch|is\s+here)\b/i,
    /\bproduct\s+hunt\b/i,
    /🚀.*\b(launch|ship|live)\b/i,
    /\bcheck\s+it\s+out\b/i,
];
// Research signals: papers, studies, journals.
const RESEARCH_PATTERNS = [
    /arxiv\.org/i,
    /\bpaper\b.*\b(new|our|this|the)\b/i,
    /\b(new|our|this)\b.*\bpaper\b/i,
    /\bstudy\b.*\b(finds?|shows?|reveals?)\b/i,
    /\bfindings?\b/i,
    /\bpeer[\s-]?review/i,
    /\bpreprint\b/i,
    /\bresearch\b.*\b(from|by|at|shows?)\b/i,
    /\bpublished\s+in\b/i,
    /\bjournal\b/i,
    /\bstate[\s-]?of[\s-]?the[\s-]?art\b/i,
];
// Opinion signals: takes, threads, lessons learned.
const OPINION_PATTERNS = [
    /\bthread\b.*👇/i,
    /\bunpopular\s+opinion\b/i,
    /\bhot\s+take\b/i,
    /\bhere'?s?\s+(why|what|my\s+take)\b/i,
    /\bi\s+think\b.*\b(about|that)\b/i,
    /\bcontroversial\b/i,
    /\boverrated\b/i,
    /\bunderrated\b/i,
    /\blessons?\s+(learned|from)\b/i,
    /\bmistakes?\s+(I|we)\b/i,
];
// Commerce signals: shop links, discounts, affiliate shorteners.
const COMMERCE_PATTERNS = [
    /\bamazon\.com\b/i,
    /\bshop\s+(here|now)\b/i,
    /\bbuy\s+(now|here|this)\b/i,
    /\bdiscount\b/i,
    /\bcoupon\b/i,
    /\baffiliate\b/i,
    /\bgeni\.us\b/i,
    /\ba\.co\//i,
    /\$\d+(\.\d{2})?\s*(off|USD|discount)/i,
];
|
|
114
|
+
// Finds every GitHub "owner/repo" reference in a text. Global on purpose:
// it is used with String#match to collect all occurrences.
const GITHUB_URL_RE = /github\.com\/[\w.-]+\/[\w.-]+/gi;
// Finds every http(s) URL in a text, stopping at whitespace or a closing ) > ].
// Global on purpose, for the same reason.
const URL_RE = /https?:\/\/[^\s)>\]]+/gi;
// Recognizes a t.co shortlink. Deliberately NOT global: it is used as a predicate
// via RegExp#test, and a global regex resumes from lastIndex after a hit, which
// made a following shortlink test as "no match".
const TCO_RE = /https?:\/\/t\.co\/\w+/i;
|
|
117
|
+
// ── Domains that indicate tool/project bookmarks ─────────────────────────
|
|
118
|
+
// Hostnames of code-hosting sites and package registries.
const TOOL_DOMAINS = new Set([
    'github.com', 'gitlab.com', 'huggingface.co',
    'npmjs.com', 'pypi.org', 'crates.io', 'pkg.go.dev',
]);
// Hostnames of paper archives, scholarly search engines and journals.
const RESEARCH_DOMAINS = new Set([
    'arxiv.org', 'scholar.google.com', 'semanticscholar.org',
    'biorxiv.org', 'medrxiv.org',
    'nature.com', 'science.org',
]);
// Hostnames of online shops and shopping-link shorteners.
const COMMERCE_DOMAINS = new Set([
    'amazon.com', 'www.amazon.com', 'a.co',
    'store.steampowered.com', 'geni.us', 'ebay.com',
]);
|
|
144
|
+
// ── Classify a single bookmark ───────────────────────────────────────────
|
|
145
|
+
/**
 * Classify one bookmark with the rule-based pattern sets and domain tables.
 *
 * URLs are gathered from `bookmark.links` plus any URL found in the text
 * (t.co shortlinks in the text are ignored). A category is added when one of
 * its text patterns matches or, for tool/research/commerce, when one of the
 * gathered URLs points at a known domain (a leading "www." is ignored).
 *
 * @param {{text?: ?string, links?: ?string[]}} bookmark
 * @returns {{categories: string[], primary: string, extractedUrls: string[], githubUrls: string[]}}
 *   `categories` is in priority order (security, tool, technique, launch, research,
 *   opinion, commerce); `primary` is its first entry, or 'unclassified' when empty.
 */
export function classifyBookmark(bookmark) {
    const text = bookmark.text ?? '';
    const allLinks = [...(bookmark.links ?? [])];
    // Extract URLs from tweet text (excluding t.co shortlinks).
    // TCO_RE may be a global regex, and RegExp#test on a global regex resumes from
    // lastIndex after a hit, so the next shortlink would slip through the filter.
    // String#search always scans from the start and leaves lastIndex untouched.
    const textUrls = (text.match(URL_RE) ?? []).filter((u) => u.search(TCO_RE) === -1);
    const extractedUrls = [...new Set([...allLinks, ...textUrls])];
    // Extract GitHub URLs (text matches lack a scheme, so one is prepended)
    const githubMatches = text.match(GITHUB_URL_RE) ?? [];
    const githubFromLinks = allLinks.filter((l) => /github\.com/i.test(l));
    const githubUrls = [...new Set([...githubMatches.map((m) => `https://${m}`), ...githubFromLinks])];
    // Get domains from all URLs; unparseable URLs are dropped
    const domains = extractedUrls
        .map((u) => {
        try {
            return new URL(u).hostname.replace(/^www\./, '');
        }
        catch {
            return '';
        }
    })
        .filter(Boolean);
    const categories = [];
    // Pattern matching
    const matchesAny = (patterns) => patterns.some((p) => p.test(text));
    if (matchesAny(SECURITY_PATTERNS))
        categories.push('security');
    if (matchesAny(TOOL_PATTERNS) || githubUrls.length > 0 || domains.some((d) => TOOL_DOMAINS.has(d)))
        categories.push('tool');
    if (matchesAny(TECHNIQUE_PATTERNS))
        categories.push('technique');
    if (matchesAny(LAUNCH_PATTERNS))
        categories.push('launch');
    if (matchesAny(RESEARCH_PATTERNS) || domains.some((d) => RESEARCH_DOMAINS.has(d)))
        categories.push('research');
    if (matchesAny(OPINION_PATTERNS))
        categories.push('opinion');
    if (matchesAny(COMMERCE_PATTERNS) || domains.some((d) => COMMERCE_DOMAINS.has(d)))
        categories.push('commerce');
    // Primary = first match (ordered by priority above: security > tool > technique > ...)
    const primary = categories[0] ?? 'unclassified';
    return { categories, primary, extractedUrls, githubUrls };
}
|
|
187
|
+
/**
 * Run the rule-based classifier over a list of bookmarks and tally the outcome.
 *
 * @param {Array<{id: *}>} bookmarks - Bookmarks to classify; each is passed to classifyBookmark.
 * @returns {{results: Map<*, object>, summary: {total: number, classified: number, unclassified: number, byCategoryCount: Object<string, number>}}}
 *   `results` maps bookmark id to its classification. A bookmark counts as
 *   unclassified when it received no category at all; one with several
 *   categories is counted once under each of them.
 */
export function classifyCorpus(bookmarks) {
    const results = new Map();
    const byCategoryCount = {};
    let unclassified = 0;
    for (const bookmark of bookmarks) {
        const classification = classifyBookmark(bookmark);
        results.set(bookmark.id, classification);
        const labels = classification.categories;
        if (labels.length === 0) {
            unclassified += 1;
        }
        for (const label of labels) {
            const seenSoFar = byCategoryCount[label] ?? 0;
            byCategoryCount[label] = seenSoFar + 1;
        }
    }
    const total = bookmarks.length;
    const summary = {
        total,
        classified: total - unclassified,
        unclassified,
        byCategoryCount,
    };
    return { results, summary };
}
|
|
211
|
+
// ── Format summary for CLI output ────────────────────────────────────────
|
|
212
|
+
/**
 * Render a classification summary as plain text for the CLI.
 *
 * The first line gives the classified/total/unclassified counts, followed by a
 * blank line and one row per category, ordered by descending count. Each row
 * shows the count and its percentage of `summary.total` to one decimal place.
 *
 * @param {{total: number, classified: number, unclassified: number, byCategoryCount: Object<string, number>}} summary
 * @returns {string} The report, lines joined with "\n".
 */
export function formatClassificationSummary(summary) {
    const { total, classified, unclassified, byCategoryCount } = summary;
    const header = `Classified ${classified}/${total} bookmarks (${unclassified} unclassified)`;
    const rows = Object.entries(byCategoryCount)
        .sort((left, right) => right[1] - left[1])
        .map(([cat, count]) => {
        const pct = ((count / total) * 100).toFixed(1);
        return ` ${cat.padEnd(12)} ${String(count).padStart(5)} (${pct}%)`;
    });
    return [header, '', ...rows].join('\n');
}
|