arcfetch 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,215 @@
1
+ # arcfetch
2
+
3
+ Fetch URLs, extract clean article content, and cache as markdown. Supports automatic JavaScript rendering fallback via Playwright (local or Docker).
4
+
5
+ ## Features
6
+
7
+ - **Smart Fetching**: Simple HTTP first, automatic Playwright fallback for JS-heavy sites
8
+ - **Quality Gates**: Configurable quality thresholds with automatic retry
9
+ - **Docker Support**: Auto-launches Playwright in Docker when available
10
+ - **Clean Markdown**: Mozilla Readability + Turndown for 90-95% token reduction
11
+ - **Temp → Docs Workflow**: Cache to temp folder, promote to docs when ready
12
+ - **CLI & MCP**: Available as command-line tool and MCP server
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ bun install
18
+ ```
19
+
20
+ For Docker Playwright support (recommended):
21
+ ```bash
22
+ docker pull mcr.microsoft.com/playwright:v1.40.0-jammy
23
+ ```
24
+
25
+ ## Quick Start
26
+
27
+ ### CLI
28
+
29
+ ```bash
30
+ # Fetch a URL
31
+ arcfetch fetch https://example.com/article
32
+
33
+ # List cached references
34
+ arcfetch list
35
+
36
+ # Promote to docs folder
37
+ arcfetch promote REF-001
38
+
39
+ # Delete a reference
40
+ arcfetch delete REF-001
41
+ ```
42
+
43
+ ### MCP Server
44
+
45
+ Add to your Claude Code MCP configuration:
46
+
47
+ ```json
48
+ {
49
+ "mcpServers": {
50
+ "arcfetch": {
51
+ "command": "bun",
52
+ "args": ["run", "/path/to/arcfetch/index.ts"]
53
+ }
54
+ }
55
+ }
56
+ ```
57
+
58
+ ## CLI Commands
59
+
60
+ ### fetch
61
+
62
+ Fetch URL and save to temp folder.
63
+
64
+ ```bash
65
+ arcfetch fetch <url> [options]
66
+
67
+ Options:
68
+ -q, --query <text> Search query (saved as metadata)
69
+ -o, --output <format> Output: text, json, path, summary (default: text)
70
+ -v, --verbose Show detailed output
71
+ --min-quality <n> Minimum quality score 0-100 (default: 60)
72
+ --temp-dir <path> Temp folder (default: .tmp)
73
+ --docs-dir <path> Docs folder (default: docs/ai/references)
74
+ --playwright <mode> Playwright mode: auto, local, docker
75
+ ```
76
+
77
+ ### list
78
+
79
+ List all cached references.
80
+
81
+ ```bash
82
+ arcfetch list [-o json]
83
+ ```
84
+
85
+ ### promote
86
+
87
+ Move reference from temp to docs folder.
88
+
89
+ ```bash
90
+ arcfetch promote <ref-id>
91
+ ```
92
+
93
+ ### delete
94
+
95
+ Delete a cached reference.
96
+
97
+ ```bash
98
+ arcfetch delete <ref-id>
99
+ ```
100
+
101
+ ### config
102
+
103
+ Show current configuration.
104
+
105
+ ```bash
106
+ arcfetch config
107
+ ```
108
+
109
+ ## MCP Tools
110
+
111
+ | Tool | Description |
112
+ |------|-------------|
113
+ | `fetch_url` | Fetch URL with auto JS fallback, save to temp |
114
+ | `list_cached` | List all cached references |
115
+ | `promote_reference` | Move from temp to docs folder |
116
+ | `delete_cached` | Delete a cached reference |
117
+
118
+ ## Configuration
119
+
120
+ ### Config File
121
+
122
+ Create `arcfetch.config.json` in your project root:
123
+
124
+ ```json
125
+ {
126
+ "quality": {
127
+ "minScore": 60,
128
+ "jsRetryThreshold": 85
129
+ },
130
+ "paths": {
131
+ "tempDir": ".tmp",
132
+ "docsDir": "docs/ai/references"
133
+ },
134
+ "playwright": {
135
+ "mode": "auto",
136
+ "dockerImage": "mcr.microsoft.com/playwright:v1.40.0-jammy",
137
+ "timeout": 30000
138
+ }
139
+ }
140
+ ```
141
+
142
+ ### Environment Variables
143
+
144
+ ```bash
145
+ SOFETCH_MIN_SCORE=60
146
+ SOFETCH_TEMP_DIR=.tmp
147
+ SOFETCH_DOCS_DIR=docs/ai/references
148
+ SOFETCH_PLAYWRIGHT_MODE=auto
149
+ SOFETCH_DOCKER_IMAGE=mcr.microsoft.com/playwright:v1.40.0-jammy
150
+ ```
151
+
152
+ ## Quality Pipeline
153
+
154
+ ```
155
+ URL → Simple Fetch → Quality Check
156
+
157
+            ┌───────────────┼───────────────┐
158
+            ▼               ▼               ▼
159
+       Score ≥ 85         60-84           < 60
160
+            │               │               │
161
+            ▼               ▼               ▼
162
+          Save       Try Playwright  Try Playwright
163
+                       (if better)     (required)
164
+                            │               │
165
+                            ▼               ▼
166
+                        Compare &      Score ≥ 60?
167
+                        use best       Yes → Save
168
+                                       No → Error
169
+ ```
170
+
171
+ ## Playwright Modes
172
+
173
+ | Mode | Description |
174
+ |------|-------------|
175
+ | `auto` | Use Docker if available, fall back to local |
176
+ | `docker` | Docker only (fails if Docker unavailable) |
177
+ | `local` | Local Playwright only (requires `bun install`) |
178
+
179
+ ## File Structure
180
+
181
+ ```
182
+ .tmp/ # Temporary cache (default)
183
+ REF-001-article-title.md
184
+ REF-002-another-article.md
185
+
186
+ docs/ai/references/ # Permanent docs (after promote)
187
+ REF-001-article-title.md
188
+ ```
189
+
190
+ ## Examples
191
+
192
+ ### Fetch with custom quality threshold
193
+
194
+ ```bash
195
+ arcfetch fetch https://spa-heavy-site.com --min-quality 70 --playwright docker
196
+ ```
197
+
198
+ ### Fetch and get JSON output
199
+
200
+ ```bash
201
+ arcfetch fetch https://example.com -o json
202
+ ```
203
+
204
+ ### Use in scripts
205
+
206
+ ```bash
207
+ # Get just the ref ID and path
208
+ result=$(arcfetch fetch https://example.com -o summary)
209
+ ref_id=$(echo $result | cut -d'|' -f1)
210
+ filepath=$(echo $result | cut -d'|' -f2)
211
+ ```
212
+
213
+ ## License
214
+
215
+ MIT
package/cli.ts ADDED
@@ -0,0 +1,455 @@
1
+ #!/usr/bin/env bun
2
+
3
+ import { loadConfig } from './src/config/index.js';
4
+ import { fetchUrl, closeBrowser } from './src/core/pipeline.js';
5
+ import { saveToTemp, listCached, promoteReference, deleteCached } from './src/core/cache.js';
6
+
7
+ // ============================================================================
8
+ // HELP - showHelp() prints the usage text (commands, options, examples) to stdout
9
+ // ============================================================================
10
+
11
+ function showHelp(): void {
12
+ console.log(`
13
+ arcfetch v3.0 - Fetch URLs and cache as clean markdown
14
+
15
+ USAGE:
16
+ arcfetch <command> [options]
17
+
18
+ COMMANDS:
19
+ fetch <url> Fetch URL and save to temp folder
20
+ list List all cached references
21
+ promote <ref-id> Move reference from temp to docs folder
22
+ delete <ref-id> Delete a cached reference
23
+ config Show current configuration
24
+ help Show this help message
25
+
26
+ OPTIONS:
27
+ -q, --query <text> Search query (saved as metadata)
28
+ -o, --output <format> Output format (default: text)
29
+ - text: Plain text (LLM-friendly)
30
+ - json: Structured JSON
31
+ - path: Just the filepath
32
+ - summary: REF-ID|filepath
33
+ --pretty Human-friendly output with emojis
34
+ -v, --verbose Show detailed output
35
+ --min-quality <n> Minimum quality score 0-100 (default: 60)
36
+ --temp-dir <path> Temp folder (default: .tmp)
37
+ --docs-dir <path> Docs folder (default: docs/ai/references)
38
+ --playwright <mode> Playwright mode: auto, local, docker
39
+
40
+ EXAMPLES:
41
+ # Fetch a URL (plain output for LLMs)
42
+ arcfetch fetch https://example.com/article
43
+
44
+ # Fetch and get just the filepath
45
+ arcfetch fetch https://example.com -o path
46
+
47
+ # Fetch with human-friendly output
48
+ arcfetch fetch https://example.com --pretty
49
+
50
+ # Fetch with JSON output
51
+ arcfetch fetch https://example.com -o json
52
+
53
+ # List cached references
54
+ arcfetch list
55
+
56
+ # Promote to docs folder
57
+ arcfetch promote REF-001
58
+
59
+ ENVIRONMENT VARIABLES:
60
+ SOFETCH_MIN_SCORE Minimum quality score
61
+ SOFETCH_TEMP_DIR Temp directory
62
+ SOFETCH_DOCS_DIR Docs directory
63
+ SOFETCH_PLAYWRIGHT_MODE Playwright mode (auto/local/docker)
64
+
65
+ CONFIG FILE:
66
+ Place arcfetch.config.json in project root for persistent settings.
67
+ `);
68
+ }
69
+
70
+ // ============================================================================
71
+ // FETCH COMMAND - FetchOptions is the argument object consumed by commandFetch()
72
+ // ============================================================================
73
+
74
+ interface FetchOptions {
75
+ url: string; // page to fetch; also stored with the saved reference
76
+ query?: string; // forwarded to saveToTemp and echoed in the JSON output
77
+ output: 'text' | 'json' | 'summary' | 'path'; // selects how the result is printed
78
+ verbose: boolean; // dumps the resolved config to stderr and is passed on to fetchUrl
79
+ pretty: boolean; // only consulted when output is 'text'
80
+ minQuality?: number; // override passed to loadConfig
81
+ tempDir?: string; // override passed to loadConfig
82
+ docsDir?: string; // override passed to loadConfig
83
+ playwrightMode?: 'auto' | 'local' | 'docker'; // override passed to loadConfig
84
+ }
85
+
86
+ async function commandFetch(options: FetchOptions): Promise<void> {
87
+ // Fetch options.url, save the markdown via saveToTemp and print the outcome in the requested format.
88
+ const config = loadConfig({
89
+ minQuality: options.minQuality,
90
+ tempDir: options.tempDir,
91
+ docsDir: options.docsDir,
92
+ playwrightMode: options.playwrightMode,
93
+ });
94
+ if (options.verbose) {
95
+ console.error('🔧 Config:', JSON.stringify(config, null, 2));
96
+ }
97
+
98
+ // Fetch URL. The browser is closed in finally so it is released even when fetchUrl throws.
99
+ let result: Awaited<ReturnType<typeof fetchUrl>>;
100
+ try {
101
+ result = await fetchUrl(options.url, config, options.verbose);
102
+ } finally {
103
+ await closeBrowser(); // Close browser if it was used
104
+ }
105
+
106
+ if (!result.success) {
107
+ if (options.output === 'json') {
108
+ console.log(
109
+ JSON.stringify(
110
+ {
111
+ success: false,
112
+ error: result.error,
113
+ suggestion: result.suggestion,
114
+ quality: result.quality,
115
+ },
116
+ null,
117
+ 2
118
+ )
119
+ );
120
+ } else {
121
+ console.error(`Error: ${result.error}`);
122
+ if (result.suggestion) {
123
+ console.error(`Suggestion: ${result.suggestion}`);
124
+ }
125
+ if (result.quality) {
126
+ console.error(`Quality: ${result.quality.score}/100`);
127
+ }
128
+ }
129
+ process.exit(1);
130
+ }
131
+
132
+ // Save to temp
133
+ const saveResult = await saveToTemp(config, result.title!, options.url, result.markdown!, options.query);
134
+ // Small delay to ensure file is flushed to disk (Bun-specific issue)
135
+ await new Promise((resolve) => setTimeout(resolve, 100));
136
+ if (saveResult.error) {
137
+ if (options.output === 'json') {
138
+ console.log(JSON.stringify({ success: false, error: saveResult.error }, null, 2));
139
+ } else {
140
+ console.error(`Error: Save failed: ${saveResult.error}`);
141
+ }
142
+ process.exit(1);
143
+ }
144
+
145
+ // Output result
146
+ if (options.output === 'json') {
147
+ console.log(
148
+ JSON.stringify(
149
+ {
150
+ success: true,
151
+ refId: saveResult.refId,
152
+ title: result.title,
153
+ byline: result.byline,
154
+ siteName: result.siteName,
155
+ excerpt: result.excerpt,
156
+ url: options.url,
157
+ filepath: saveResult.filepath,
158
+ size: result.markdown!.length,
159
+ tokens: Math.round(result.markdown!.length / 4), // rough estimate: ~4 characters per token
160
+ quality: result.quality?.score,
161
+ usedPlaywright: result.usedPlaywright,
162
+ playwrightReason: result.playwrightReason,
163
+ query: options.query,
164
+ },
165
+ null,
166
+ 2
167
+ )
168
+ );
169
+ } else if (options.output === 'summary') {
170
+ console.log(`${saveResult.refId}|${saveResult.filepath}`);
171
+ } else if (options.output === 'path') {
172
+ console.log(saveResult.filepath);
173
+ } else if (options.pretty) {
174
+ // Pretty output with emojis (human-friendly)
175
+ console.log(`✅ Cached: ${saveResult.refId}\n`);
176
+ console.log(`**Title**: ${result.title}`);
177
+ if (result.byline) console.log(`**Author**: ${result.byline}`);
178
+ if (result.siteName) console.log(`**Source**: ${result.siteName}`);
179
+ if (result.excerpt) {
180
+ const excerpt = result.excerpt.slice(0, 150);
181
+ console.log(`**Summary**: ${excerpt}${result.excerpt.length > 150 ? '...' : ''}`);
182
+ }
183
+ console.log(`\n**Saved to**: ${saveResult.filepath}`);
184
+ console.log(`**Size**: ${result.markdown!.length} chars (~${Math.round(result.markdown!.length / 4)} tokens)`);
185
+ console.log(`**Quality**: ${result.quality?.score}/100`);
186
+ if (result.usedPlaywright) {
187
+ console.log(`**Playwright**: Yes (${result.playwrightReason})`);
188
+ }
189
+ console.log(`\n💡 To promote to docs: arcfetch promote ${saveResult.refId}`);
190
+ } else {
191
+ // Plain output (LLM-friendly, default)
192
+ console.log(`Cached: ${saveResult.refId}`);
193
+ console.log(`Title: ${result.title}`);
194
+ if (result.byline) console.log(`Author: ${result.byline}`);
195
+ if (result.siteName) console.log(`Source: ${result.siteName}`);
196
+ if (result.excerpt) {
197
+ const excerpt = result.excerpt.slice(0, 150);
198
+ console.log(`Summary: ${excerpt}${result.excerpt.length > 150 ? '...' : ''}`);
199
+ }
200
+ console.log(`Filepath: ${saveResult.filepath}`);
201
+ console.log(`Size: ${result.markdown!.length} chars (~${Math.round(result.markdown!.length / 4)} tokens)`);
202
+ console.log(`Quality: ${result.quality?.score}/100`);
203
+ if (result.usedPlaywright) {
204
+ console.log(`Playwright: Yes (${result.playwrightReason})`);
205
+ }
206
+ }
207
+ }
208
+
209
+ // ============================================================================
210
+ // LIST COMMAND - commandList() prints the references returned by listCached()
211
+ // ============================================================================
212
+
213
+ async function commandList(output: 'text' | 'json', pretty: boolean): Promise<void> {
214
+ const config = loadConfig();
215
+ const result = listCached(config);
216
+
217
+ if (result.error) { // reported as plain text on stderr, even when output is 'json'
218
+ console.error(`Error: ${result.error}`);
219
+ process.exit(1);
220
+ }
221
+
222
+ if (output === 'json') { // checked before the empty case, so an empty list prints as a JSON array
223
+ console.log(JSON.stringify(result.references, null, 2));
224
+ return;
225
+ }
226
+
227
+ if (result.references.length === 0) {
228
+ console.log(`No cached references in ${config.paths.tempDir}/`);
229
+ return;
230
+ }
231
+
232
+ if (pretty) {
233
+ console.log(`📚 Cached references (${result.references.length}):\n`);
234
+ for (const ref of result.references) {
235
+ console.log(`${ref.refId} | ${ref.title.slice(0, 50)}${ref.title.length > 50 ? '...' : ''}`); // title cut to 50 chars
236
+ console.log(` 📅 ${ref.fetchedDate} | 📄 ${Math.round(ref.size / 1024)}KB`); // presumably ref.size is in bytes - TODO confirm
237
+ console.log(` 🔗 ${ref.url.slice(0, 60)}${ref.url.length > 60 ? '...' : ''}`); // URL cut to 60 chars
238
+ console.log('');
239
+ }
240
+ console.log(`💡 To promote: arcfetch promote <ref-id>`);
241
+ console.log(`💡 To delete: arcfetch delete <ref-id>`);
242
+ } else {
243
+ console.log(`Cached references (${result.references.length}):\n`);
244
+ for (const ref of result.references) {
245
+ console.log(`${ref.refId} | ${ref.title.slice(0, 50)}${ref.title.length > 50 ? '...' : ''}`); // title cut to 50 chars
246
+ console.log(` Date: ${ref.fetchedDate} | Size: ${Math.round(ref.size / 1024)}KB`);
247
+ console.log(` URL: ${ref.url.slice(0, 60)}${ref.url.length > 60 ? '...' : ''}`); // URL cut to 60 chars
248
+ console.log('');
249
+ }
250
+ }
251
+ }
252
+
253
+ // ============================================================================
254
+ // PROMOTE COMMAND - commandPromote() calls promoteReference() and reports the result
255
+ // ============================================================================
256
+
257
+ async function commandPromote(refId: string, output: 'text' | 'json', pretty: boolean): Promise<void> {
258
+ const config = loadConfig();
259
+ const result = promoteReference(config, refId);
260
+
261
+ if (output === 'json') { // JSON mode prints the raw result object for both success and failure
262
+ console.log(JSON.stringify(result, null, 2));
263
+ if (!result.success) process.exit(1);
264
+ return;
265
+ }
266
+
267
+ if (!result.success) {
268
+ console.error(`Error: ${result.error}`);
269
+ process.exit(1);
270
+ }
271
+
272
+ if (pretty) {
273
+ console.log(`✅ Promoted ${refId}`);
274
+ console.log(` From: ${result.fromPath}`);
275
+ console.log(` To: ${result.toPath}`);
276
+ } else {
277
+ console.log(`Promoted: ${refId}`);
278
+ console.log(`From: ${result.fromPath}`);
279
+ console.log(`To: ${result.toPath}`);
280
+ }
281
+ }
282
+
283
+ // ============================================================================
284
+ // DELETE COMMAND - commandDelete() calls deleteCached() and reports the result
285
+ // ============================================================================
286
+
287
+ async function commandDelete(refId: string, output: 'text' | 'json', pretty: boolean): Promise<void> {
288
+ const config = loadConfig();
289
+ const result = deleteCached(config, refId);
290
+
291
+ if (output === 'json') { // JSON mode prints the raw result object for both success and failure
292
+ console.log(JSON.stringify(result, null, 2));
293
+ if (!result.success) process.exit(1);
294
+ return;
295
+ }
296
+
297
+ if (!result.success) {
298
+ console.error(`Error: ${result.error}`);
299
+ process.exit(1);
300
+ }
301
+
302
+ if (pretty) {
303
+ console.log(`✅ Deleted ${refId}`);
304
+ console.log(` File: ${result.filepath}`);
305
+ } else {
306
+ console.log(`Deleted: ${refId}`);
307
+ console.log(`File: ${result.filepath}`);
308
+ }
309
+ }
310
+
311
+ // ============================================================================
312
+ // CONFIG COMMAND - commandConfig() prints the configuration returned by loadConfig() as JSON
313
+ // ============================================================================
314
+
315
+ async function commandConfig(): Promise<void> {
316
+ const config = loadConfig(); // called without CLI overrides
317
+ console.log('Current configuration:\n');
318
+ console.log(JSON.stringify(config, null, 2));
319
+ }
320
+
321
+ // ============================================================================
322
+ // ARGUMENT PARSING - ParsedOptions holds the option flags collected by parseArgs()
323
+ // ============================================================================
324
+
325
+ interface ParsedOptions {
326
+ output: 'text' | 'json' | 'summary' | 'path'; // -o / --output, defaults to 'text'
327
+ verbose: boolean; // -v / --verbose
328
+ pretty: boolean; // --pretty
329
+ query?: string; // -q / --query
330
+ minQuality?: number; // --min-quality
331
+ tempDir?: string; // --temp-dir
332
+ docsDir?: string; // --docs-dir
333
+ playwrightMode?: 'auto' | 'local' | 'docker'; // --playwright
334
+ }
335
+
336
+ function parseArgs(): { command: string; args: string[]; options: ParsedOptions } {
337
+ // Split process.argv into the command name, its positional arguments and the option flags.
338
+ const args = process.argv.slice(2);
339
+ if (args.length === 0) {
340
+ return { command: 'help', args: [], options: { output: 'text', verbose: false, pretty: false } };
341
+ }
342
+
343
+ const command = args[0];
344
+ const options: ParsedOptions = {
345
+ output: 'text',
346
+ verbose: false,
347
+ pretty: false,
348
+ };
349
+ const positionalArgs: string[] = [];
350
+
351
+ for (let i = 1; i < args.length; i++) {
352
+ const arg = args[i];
353
+ const next = args[i + 1];
354
+ if (arg === '-q' || arg === '--query') {
355
+ options.query = next;
356
+ i++;
357
+ } else if (arg === '-o' || arg === '--output') {
358
+ if (next === 'text' || next === 'json' || next === 'summary' || next === 'path') {
359
+ options.output = next;
360
+ }
361
+ i++;
362
+ } else if (arg === '-v' || arg === '--verbose') {
363
+ options.verbose = true;
364
+ } else if (arg === '--pretty') {
365
+ options.pretty = true;
366
+ } else if (arg === '--min-quality') {
367
+ const parsed = Number.parseInt(next, 10); // NaN when the value is missing or not a number
368
+ options.minQuality = Number.isNaN(parsed) ? undefined : parsed; // ignore invalid values, like -o and --playwright do
369
+ i++;
370
+ } else if (arg === '--temp-dir') {
371
+ options.tempDir = next;
372
+ i++;
373
+ } else if (arg === '--docs-dir') {
374
+ options.docsDir = next;
375
+ i++;
376
+ } else if (arg === '--playwright') {
377
+ if (next === 'auto' || next === 'local' || next === 'docker') {
378
+ options.playwrightMode = next;
379
+ }
380
+ i++;
381
+ } else if (arg === '-h' || arg === '--help') {
382
+ return { command: 'help', args: [], options: { output: 'text', verbose: false, pretty: false } };
383
+ } else if (!arg.startsWith('-')) {
384
+ positionalArgs.push(arg);
385
+ }
386
+ }
387
+
388
+ return { command, args: positionalArgs, options };
389
+ }
390
+
391
+ // ============================================================================
392
+ // MAIN - main() parses argv and dispatches to the matching command handler
393
+ // ============================================================================
394
+
395
+ async function main(): Promise<void> {
396
+ const { command, args, options } = parseArgs(); // runs outside the try block below
397
+
398
+ try {
399
+ switch (command) {
400
+ case 'fetch':
401
+ if (args.length === 0) {
402
+ console.error('Error: URL required. Usage: arcfetch fetch <url>');
403
+ process.exit(1);
404
+ }
405
+ await commandFetch({
406
+ url: args[0], // only the first positional argument is used
407
+ query: options.query,
408
+ output: options.output,
409
+ verbose: options.verbose,
410
+ pretty: options.pretty,
411
+ minQuality: options.minQuality,
412
+ tempDir: options.tempDir,
413
+ docsDir: options.docsDir,
414
+ playwrightMode: options.playwrightMode,
415
+ });
416
+ break;
417
+
418
+ case 'list':
419
+ await commandList(options.output === 'json' ? 'json' : 'text', options.pretty); // 'summary' and 'path' are treated as 'text' here
420
+ break;
421
+
422
+ case 'promote':
423
+ if (args.length === 0) {
424
+ console.error('Error: Reference ID required. Usage: arcfetch promote <ref-id>');
425
+ process.exit(1);
426
+ }
427
+ await commandPromote(args[0], options.output === 'json' ? 'json' : 'text', options.pretty);
428
+ break;
429
+
430
+ case 'delete':
431
+ if (args.length === 0) {
432
+ console.error('Error: Reference ID required. Usage: arcfetch delete <ref-id>');
433
+ process.exit(1);
434
+ }
435
+ await commandDelete(args[0], options.output === 'json' ? 'json' : 'text', options.pretty);
436
+ break;
437
+
438
+ case 'config':
439
+ await commandConfig();
440
+ break;
441
+
442
+ default: // 'help' and any unrecognised command both end up here
443
+ showHelp();
444
+ break;
445
+ }
446
+ } catch (error) {
447
+ console.error('Error:', error instanceof Error ? error.message : String(error)); // message only, no stack trace
448
+ process.exit(1);
449
+ }
450
+ }
451
+
452
+ main().catch((err) => { // last-resort handler for rejections that escape main()'s own try/catch, e.g. from parseArgs()
453
+ console.error('Unexpected error:', err);
454
+ process.exit(1);
455
+ });