arcfetch 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,212 @@
1
+ # arcfetch
2
+
3
+ Fetch URLs, extract clean article content, and cache as markdown. Supports automatic JavaScript rendering fallback via Playwright.
4
+
5
+ ## Features
6
+
7
+ - **Smart Fetching**: Simple HTTP first, automatic Playwright fallback for JS-heavy sites
8
+ - **Quality Gates**: Configurable quality thresholds with automatic retry
9
+ - **Clean Markdown**: Mozilla Readability + Turndown for 90-95% token reduction
10
+ - **Temp → Docs Workflow**: Cache to temp folder, promote to docs when ready
11
+ - **CLI & MCP**: Available as command-line tool and MCP server
12
+
13
+ ## Installation
14
+
15
+ ```bash
16
+ # For users
17
+ npm install -g arcfetch
18
+
19
+ # For development
20
+ bun install
21
+ ```
22
+
23
+ ## Quick Start
24
+
25
+ ### CLI
26
+
27
+ ```bash
28
+ # Fetch a URL
29
+ arcfetch fetch https://example.com/article
30
+
31
+ # List cached references
32
+ arcfetch list
33
+
34
+ # Promote to docs folder
35
+ arcfetch promote REF-001
36
+
37
+ # Delete a reference
38
+ arcfetch delete REF-001
39
+ ```
40
+
41
+ ### MCP Server
42
+
43
+ Add to your Claude Code MCP configuration:
44
+
45
+ ```json
46
+ {
47
+ "mcpServers": {
48
+ "arcfetch": {
49
+ "command": "bun",
50
+ "args": ["run", "/path/to/arcfetch/index.ts"]
51
+ }
52
+ }
53
+ }
54
+ ```
55
+
56
+ ## CLI Commands
57
+
58
+ ### fetch
59
+
60
+ Fetch URL and save to temp folder.
61
+
62
+ ```bash
63
+ arcfetch fetch <url> [options]
64
+
65
+ Options:
66
+ -q, --query <text> Search query (saved as metadata)
67
+ -o, --output <format> Output: text, json, path, summary (default: text)
68
+ -v, --verbose Show detailed output
69
+ --pretty Human-friendly output with emojis
70
+ --min-quality <n> Minimum quality score 0-100 (default: 60)
71
+ --temp-dir <path> Temp folder (default: .tmp)
72
+ --docs-dir <path> Docs folder (default: docs/ai/references)
73
+ --wait-strategy <mode> Playwright wait strategy: networkidle, domcontentloaded, load
74
+ --force-playwright Skip simple fetch and use Playwright directly
75
+ ```
76
+
77
+ ### list
78
+
79
+ List all cached references.
80
+
81
+ ```bash
82
+ arcfetch list [-o json]
83
+ ```
84
+
85
+ ### promote
86
+
87
+ Move reference from temp to docs folder.
88
+
89
+ ```bash
90
+ arcfetch promote <ref-id>
91
+ ```
92
+
93
+ ### delete
94
+
95
+ Delete a cached reference.
96
+
97
+ ```bash
98
+ arcfetch delete <ref-id>
99
+ ```
100
+
101
+ ### config
102
+
103
+ Show current configuration.
104
+
105
+ ```bash
106
+ arcfetch config
107
+ ```
108
+
109
+ ## MCP Tools
110
+
111
+ | Tool | Description |
112
+ |------|-------------|
113
+ | `fetch_url` | Fetch URL with auto JS fallback, save to temp |
114
+ | `list_cached` | List all cached references |
115
+ | `promote_reference` | Move from temp to docs folder |
116
+ | `delete_cached` | Delete a cached reference |
117
+
118
+ ## Configuration
119
+
120
+ ### Config File
121
+
122
+ Create `arcfetch.config.json` in your project root:
123
+
124
+ ```json
125
+ {
126
+ "quality": {
127
+ "minScore": 60,
128
+ "jsRetryThreshold": 85
129
+ },
130
+ "paths": {
131
+ "tempDir": ".tmp",
132
+ "docsDir": "docs/ai/references"
133
+ },
134
+ "playwright": {
135
+ "timeout": 30000,
136
+ "waitStrategy": "networkidle"
137
+ }
138
+ }
139
+ ```
140
+
141
+ ### Environment Variables
142
+
143
+ ```bash
144
+ ARCFETCH_MIN_SCORE=60
145
+ ARCFETCH_TEMP_DIR=.tmp
146
+ ARCFETCH_DOCS_DIR=docs/ai/references
147
+ ```
148
+
149
+ ## Quality Pipeline
150
+
151
+ ```
152
+ URL → Simple Fetch → Quality Check
153
+
154
+            ┌───────────────┼───────────────┐
155
+            ▼               ▼               ▼
156
+       Score ≥ 85         60-84           < 60
157
+            │               │               │
158
+            ▼               ▼               ▼
159
+          Save       Try Playwright  Try Playwright
160
+                       (if better)     (required)
161
+                            │               │
162
+                            ▼               ▼
163
+                        Compare &      Score ≥ 60?
164
+                        use best       Yes → Save
165
+                                       No → Error
166
+ ```
167
+
168
+ ## Playwright Wait Strategies
169
+
170
+ | Strategy | Description |
171
+ |----------|-------------|
172
+ | `networkidle` | Wait until network is idle (slowest, most reliable) |
173
+ | `domcontentloaded` | Wait until DOM is loaded (faster) |
174
+ | `load` | Wait until page load event completes (fastest) |
175
+
176
+ ## File Structure
177
+
178
+ ```
179
+ .tmp/ # Temporary cache (default)
180
+ REF-001-article-title.md
181
+ REF-002-another-article.md
182
+
183
+ docs/ai/references/ # Permanent docs (after promote)
184
+ REF-001-article-title.md
185
+ ```
186
+
187
+ ## Examples
188
+
189
+ ### Force Playwright for JS-heavy sites
190
+
191
+ ```bash
192
+ arcfetch fetch https://spa-heavy-site.com --force-playwright --wait-strategy domcontentloaded
193
+ ```
194
+
195
+ ### Fetch and get JSON output
196
+
197
+ ```bash
198
+ arcfetch fetch https://example.com -o json
199
+ ```
200
+
201
+ ### Use in scripts
202
+
203
+ ```bash
204
+ # Get just the ref ID and path
205
+ result=$(arcfetch fetch https://example.com -o summary)
206
+ ref_id=$(echo "$result" | cut -d'|' -f1)
207
+ filepath=$(echo "$result" | cut -d'|' -f2)
208
+ ```
209
+
210
+ ## License
211
+
212
+ MIT
package/cli.ts ADDED
@@ -0,0 +1,461 @@
1
+ #!/usr/bin/env bun
2
+
3
+ import { getVersion } from './src/utils/version.js';
4
+ import { loadConfig } from './src/config/index.js';
5
+ import { fetchUrl, closeBrowser } from './src/core/pipeline.js';
6
+ import { saveToTemp, listCached, promoteReference, deleteCached } from './src/core/cache.js';
7
+
8
+ // ============================================================================
9
+ // HELP
10
+ // ============================================================================
11
+
12
/**
 * Print the CLI usage text (banner with version, commands, options, examples,
 * environment variables and config-file hint) to stdout.
 *
 * The version shown in the banner is whatever `getVersion()` returns.
 */
function showHelp(): void {
  // NOTE(review): the environment variables are listed with the ARCFETCH_ prefix
  // so this help agrees with the package README; this text previously used a
  // SOFETCH_ prefix. Confirm against the names the config loader actually reads.
  console.log(`
Arcfetch v${getVersion()} - Fetch URLs and cache as clean markdown

USAGE:
arcfetch <command> [options]

COMMANDS:
fetch <url> Fetch URL and save to temp folder
list List all cached references
promote <ref-id> Move reference from temp to docs folder
delete <ref-id> Delete a cached reference
config Show current configuration
help Show this help message

OPTIONS:
-q, --query <text> Search query (saved as metadata)
-o, --output <format> Output format (default: text)
- text: Plain text (LLM-friendly)
- json: Structured JSON
- path: Just the filepath
- summary: REF-ID|filepath
--pretty Human-friendly output with emojis
-v, --verbose Show detailed output
--min-quality <n> Minimum quality score 0-100 (default: 60)
--temp-dir <path> Temp folder (default: .tmp)
--docs-dir <path> Docs folder (default: docs/ai/references)
--wait-strategy <mode> Playwright wait strategy: networkidle, domcontentloaded, load
--force-playwright Skip simple fetch and use Playwright directly

EXAMPLES:
# Fetch a URL (plain output for LLMs)
arcfetch fetch https://example.com/article

# Fetch and get just the filepath
arcfetch fetch https://example.com -o path

# Fetch with human-friendly output
arcfetch fetch https://example.com --pretty

# Fetch with JSON output
arcfetch fetch https://example.com -o json

# List cached references
arcfetch list

# Promote to docs folder
arcfetch promote REF-001

ENVIRONMENT VARIABLES:
ARCFETCH_MIN_SCORE Minimum quality score
ARCFETCH_TEMP_DIR Temp directory
ARCFETCH_DOCS_DIR Docs directory

CONFIG FILE:
Place arcfetch.config.json in project root for persistent settings.
`);
}
70
+
71
+ // ============================================================================
72
+ // FETCH COMMAND
73
+ // ============================================================================
74
+
75
/** Inputs for a single `fetch` command run. */
interface FetchOptions {
  // --- what to fetch -------------------------------------------------------
  /** URL handed to the fetch pipeline and recorded with the saved reference. */
  url: string;
  /** Optional search query, forwarded when saving and echoed in JSON output. */
  query?: string;

  // --- how to report -------------------------------------------------------
  /** Shape of the result printed on success or failure. */
  output: 'text' | 'json' | 'summary' | 'path';
  /** When true, the resolved config is dumped to stderr and the pipeline is told to be verbose. */
  verbose: boolean;
  /** When true (and `output` is `'text'`), print the emoji/markdown flavoured report. */
  pretty: boolean;

  // --- per-invocation overrides passed to loadConfig ------------------------
  minQuality?: number;
  tempDir?: string;
  docsDir?: string;
  waitStrategy?: 'networkidle' | 'domcontentloaded' | 'load';

  /** Forwarded to the fetch pipeline; the CLI help describes it as skipping the simple fetch. */
  forcePlaywright?: boolean;
}
87
+
88
/**
 * Run the `fetch` command: fetch a URL, save the extracted markdown to the temp
 * folder, and print the outcome in the requested output format.
 *
 * If the fetch or the save reports a failure, the error is printed (as JSON on
 * stdout when `options.output` is `'json'`, otherwise on stderr) and the process
 * exits with status 1.
 *
 * @param options - Target URL plus output settings and config overrides.
 */
async function commandFetch(options: FetchOptions): Promise<void> {
  const config = loadConfig({
    minQuality: options.minQuality,
    tempDir: options.tempDir,
    docsDir: options.docsDir,
    waitStrategy: options.waitStrategy,
  });

  if (options.verbose) {
    console.error('🔧 Config:', JSON.stringify(config, null, 2));
  }

  const wantsJson = options.output === 'json';

  // closeBrowser() sits in `finally` so the browser is also closed when
  // fetchUrl throws, not only when it returns a result.
  let result: Awaited<ReturnType<typeof fetchUrl>>;
  try {
    result = await fetchUrl(options.url, config, options.verbose, options.forcePlaywright);
  } finally {
    await closeBrowser();
  }

  if (!result.success) {
    if (wantsJson) {
      console.log(
        JSON.stringify(
          {
            success: false,
            error: result.error,
            suggestion: result.suggestion,
            quality: result.quality,
          },
          null,
          2
        )
      );
    } else {
      console.error(`Error: ${result.error}`);
      if (result.suggestion) {
        console.error(`Suggestion: ${result.suggestion}`);
      }
      if (result.quality) {
        console.error(`Quality: ${result.quality.score}/100`);
      }
    }
    process.exit(1);
  }

  // Save to temp
  const saveResult = await saveToTemp(config, result.title!, options.url, result.markdown!, options.query);

  // Small delay to ensure file is flushed to disk (Bun-specific issue)
  await new Promise((resolve) => setTimeout(resolve, 100));

  if (saveResult.error) {
    if (wantsJson) {
      console.log(JSON.stringify({ success: false, error: saveResult.error }, null, 2));
    } else {
      console.error(`Error: Save failed: ${saveResult.error}`);
    }
    process.exit(1);
  }

  // Size in characters and the rough token estimate (length / 4) used by every
  // report format below.
  const size = result.markdown!.length;
  const tokens = Math.round(size / 4);

  if (wantsJson) {
    console.log(
      JSON.stringify(
        {
          success: true,
          refId: saveResult.refId,
          title: result.title,
          byline: result.byline,
          siteName: result.siteName,
          excerpt: result.excerpt,
          url: options.url,
          filepath: saveResult.filepath,
          size,
          tokens,
          quality: result.quality?.score,
          usedPlaywright: result.usedPlaywright,
          playwrightReason: result.playwrightReason,
          query: options.query,
        },
        null,
        2
      )
    );
  } else if (options.output === 'summary') {
    console.log(`${saveResult.refId}|${saveResult.filepath}`);
  } else if (options.output === 'path') {
    console.log(saveResult.filepath);
  } else if (options.pretty) {
    // Pretty output with emojis (human-friendly)
    console.log(`✅ Cached: ${saveResult.refId}\n`);
    console.log(`**Title**: ${result.title}`);
    if (result.byline) console.log(`**Author**: ${result.byline}`);
    if (result.siteName) console.log(`**Source**: ${result.siteName}`);
    if (result.excerpt) {
      const excerpt = result.excerpt.slice(0, 150);
      console.log(`**Summary**: ${excerpt}${result.excerpt.length > 150 ? '...' : ''}`);
    }
    console.log(`\n**Saved to**: ${saveResult.filepath}`);
    console.log(`**Size**: ${size} chars (~${tokens} tokens)`);
    console.log(`**Quality**: ${result.quality?.score}/100`);
    if (result.usedPlaywright) {
      console.log(`**Playwright**: Yes (${result.playwrightReason})`);
    }
    console.log(`\n💡 To promote to docs: arcfetch promote ${saveResult.refId}`);
  } else {
    // Plain output (LLM-friendly, default)
    console.log(`Cached: ${saveResult.refId}`);
    console.log(`Title: ${result.title}`);
    if (result.byline) console.log(`Author: ${result.byline}`);
    if (result.siteName) console.log(`Source: ${result.siteName}`);
    if (result.excerpt) {
      const excerpt = result.excerpt.slice(0, 150);
      console.log(`Summary: ${excerpt}${result.excerpt.length > 150 ? '...' : ''}`);
    }
    console.log(`Filepath: ${saveResult.filepath}`);
    console.log(`Size: ${size} chars (~${tokens} tokens)`);
    console.log(`Quality: ${result.quality?.score}/100`);
    if (result.usedPlaywright) {
      console.log(`Playwright: Yes (${result.playwrightReason})`);
    }
  }
}
210
+
211
+ // ============================================================================
212
+ // LIST COMMAND
213
+ // ============================================================================
214
+
215
/**
 * Run the `list` command: print every cached reference returned by
 * `listCached`, either as JSON or as a short multi-line entry per reference.
 *
 * Exits with status 1 if `listCached` reports an error.
 *
 * @param output - `'json'` dumps the reference array; `'text'` prints entries.
 * @param pretty - In text mode, use the emoji-decorated layout and usage hints.
 */
async function commandList(output: 'text' | 'json', pretty: boolean): Promise<void> {
  const config = loadConfig();
  const listing = listCached(config);

  if (listing.error) {
    console.error(`Error: ${listing.error}`);
    process.exit(1);
  }

  if (output === 'json') {
    console.log(JSON.stringify(listing.references, null, 2));
    return;
  }

  const total = listing.references.length;
  if (total === 0) {
    console.log(`No cached references in ${config.paths.tempDir}/`);
    return;
  }

  // Cut `text` to `limit` characters, marking the cut with an ellipsis.
  const clip = (text: string, limit: number): string =>
    `${text.slice(0, limit)}${text.length > limit ? '...' : ''}`;

  if (!pretty) {
    console.log(`Cached references (${total}):\n`);
    listing.references.forEach((entry) => {
      console.log(`${entry.refId} | ${clip(entry.title, 50)}`);
      console.log(` Date: ${entry.fetchedDate} | Size: ${Math.round(entry.size / 1024)}KB`);
      console.log(` URL: ${clip(entry.url, 60)}`);
      console.log('');
    });
    return;
  }

  console.log(`📚 Cached references (${total}):\n`);
  listing.references.forEach((entry) => {
    console.log(`${entry.refId} | ${clip(entry.title, 50)}`);
    console.log(` 📅 ${entry.fetchedDate} | 📄 ${Math.round(entry.size / 1024)}KB`);
    console.log(` 🔗 ${clip(entry.url, 60)}`);
    console.log('');
  });
  console.log(`💡 To promote: arcfetch promote <ref-id>`);
  console.log(`💡 To delete: arcfetch delete <ref-id>`);
}
254
+
255
+ // ============================================================================
256
+ // PROMOTE COMMAND
257
+ // ============================================================================
258
+
259
/**
 * Run the `promote` command: ask `promoteReference` to promote `refId` and
 * report the source and destination paths.
 *
 * In JSON mode the raw result object is printed whether or not it succeeded.
 * An unsuccessful result exits the process with status 1 in both modes.
 *
 * @param refId - Reference identifier given on the command line.
 * @param output - `'json'` for the raw result, `'text'` for a readable report.
 * @param pretty - In text mode, use the emoji-decorated layout.
 */
async function commandPromote(refId: string, output: 'text' | 'json', pretty: boolean): Promise<void> {
  const outcome = promoteReference(loadConfig(), refId);

  if (output === 'json') {
    console.log(JSON.stringify(outcome, null, 2));
    if (!outcome.success) {
      process.exit(1);
    }
    return;
  }

  if (!outcome.success) {
    console.error(`Error: ${outcome.error}`);
    process.exit(1);
  }

  const report = pretty
    ? [`✅ Promoted ${refId}`, ` From: ${outcome.fromPath}`, ` To: ${outcome.toPath}`]
    : [`Promoted: ${refId}`, `From: ${outcome.fromPath}`, `To: ${outcome.toPath}`];

  for (const line of report) {
    console.log(line);
  }
}
284
+
285
+ // ============================================================================
286
+ // DELETE COMMAND
287
+ // ============================================================================
288
+
289
/**
 * Run the `delete` command: ask `deleteCached` to remove `refId` and report
 * which file was affected.
 *
 * In JSON mode the raw result object is printed whether or not it succeeded.
 * An unsuccessful result exits the process with status 1 in both modes.
 *
 * @param refId - Reference identifier given on the command line.
 * @param output - `'json'` for the raw result, `'text'` for a readable report.
 * @param pretty - In text mode, use the emoji-decorated layout.
 */
async function commandDelete(refId: string, output: 'text' | 'json', pretty: boolean): Promise<void> {
  const outcome = deleteCached(loadConfig(), refId);

  if (output === 'json') {
    console.log(JSON.stringify(outcome, null, 2));
    if (!outcome.success) {
      process.exit(1);
    }
    return;
  }

  if (!outcome.success) {
    console.error(`Error: ${outcome.error}`);
    process.exit(1);
  }

  const report = pretty
    ? [`✅ Deleted ${refId}`, ` File: ${outcome.filepath}`]
    : [`Deleted: ${refId}`, `File: ${outcome.filepath}`];

  for (const line of report) {
    console.log(line);
  }
}
312
+
313
+ // ============================================================================
314
+ // CONFIG COMMAND
315
+ // ============================================================================
316
+
317
/**
 * Run the `config` command: print the configuration returned by `loadConfig()`
 * (called with no overrides) as indented JSON under a heading.
 */
async function commandConfig(): Promise<void> {
  const current = loadConfig();
  const heading = 'Current configuration:\n';

  console.log(heading);
  console.log(JSON.stringify(current, null, 2));
}
322
+
323
+ // ============================================================================
324
+ // ARGUMENT PARSING
325
+ // ============================================================================
326
+
327
/** Flag values collected from the command line by `parseArgs`. */
interface ParsedOptions {
  // --- always present (parseArgs seeds these with defaults) -----------------
  /** Value of `-o` / `--output`. */
  output: 'text' | 'json' | 'summary' | 'path';
  /** Set by `-v` / `--verbose`. */
  verbose: boolean;
  /** Set by `--pretty`. */
  pretty: boolean;

  // --- present only when the matching flag was given -------------------------
  /** Value of `-q` / `--query`. */
  query?: string;
  /** Value of `--min-quality`, parsed as a base-10 integer. */
  minQuality?: number;
  /** Value of `--temp-dir`. */
  tempDir?: string;
  /** Value of `--docs-dir`. */
  docsDir?: string;
  /** Value of `--wait-strategy`. */
  waitStrategy?: 'networkidle' | 'domcontentloaded' | 'load';
  /** Set by `--force-playwright`. */
  forcePlaywright?: boolean;
}
338
+
339
/**
 * Parse `process.argv` into a command name, positional arguments and options.
 *
 * The first argument is the command. With no arguments at all, or when `-h` /
 * `--help` appears anywhere after the command, the result is the `help`
 * command with default options. Flags that take a value consume the following
 * argument. Values that are not valid for `--output`, `--wait-strategy` or
 * `--min-quality` are consumed but ignored, so the option keeps its default
 * (or stays unset). Unknown arguments starting with `-` are dropped; everything
 * else is collected as a positional argument.
 *
 * @returns The command, its positional arguments, and the parsed options.
 */
function parseArgs(): { command: string; args: string[]; options: ParsedOptions } {
  const argv = process.argv.slice(2);
  const defaultOptions = (): ParsedOptions => ({ output: 'text', verbose: false, pretty: false });

  if (argv.length === 0) {
    return { command: 'help', args: [], options: defaultOptions() };
  }

  const command = argv[0];
  const options = defaultOptions();
  const positionalArgs: string[] = [];

  for (let i = 1; i < argv.length; i++) {
    const arg = argv[i];
    const next = argv[i + 1];

    switch (arg) {
      case '-q':
      case '--query':
        options.query = next;
        i++;
        break;

      case '-o':
      case '--output':
        if (next === 'text' || next === 'json' || next === 'summary' || next === 'path') {
          options.output = next;
        }
        i++;
        break;

      case '-v':
      case '--verbose':
        options.verbose = true;
        break;

      case '--pretty':
        options.pretty = true;
        break;

      case '--min-quality': {
        // A missing or non-numeric value must not leak NaN into the config;
        // leave the option unset, like the other invalid option values.
        const parsed = Number.parseInt(next, 10);
        if (!Number.isNaN(parsed)) {
          options.minQuality = parsed;
        }
        i++;
        break;
      }

      case '--temp-dir':
        options.tempDir = next;
        i++;
        break;

      case '--docs-dir':
        options.docsDir = next;
        i++;
        break;

      case '--wait-strategy':
        if (next === 'networkidle' || next === 'domcontentloaded' || next === 'load') {
          options.waitStrategy = next;
        }
        i++;
        break;

      case '--force-playwright':
        options.forcePlaywright = true;
        break;

      case '-h':
      case '--help':
        return { command: 'help', args: [], options: defaultOptions() };

      default:
        if (!arg.startsWith('-')) {
          positionalArgs.push(arg);
        }
        break;
    }
  }

  return { command, args: positionalArgs, options };
}
395
+
396
+ // ============================================================================
397
+ // MAIN
398
+ // ============================================================================
399
+
400
/**
 * CLI entry point: parse the command line and dispatch to the matching
 * command handler. Unrecognised commands (including `help`) print the usage
 * text.
 *
 * `fetch`, `promote` and `delete` need one positional argument; without it a
 * usage error is printed to stderr and the process exits with status 1. Any
 * error thrown by a handler is printed as `Error: <message>` and also exits
 * with status 1.
 */
async function main(): Promise<void> {
  const { command, args, options } = parseArgs();

  // First positional argument, or print `complaint` and exit(1) if there is none.
  const requireFirstArg = (complaint: string): string => {
    if (args.length === 0) {
      console.error(complaint);
      process.exit(1);
    }
    return args[0];
  };

  // list/promote/delete only distinguish JSON from everything else.
  const jsonOrText = (): 'text' | 'json' => (options.output === 'json' ? 'json' : 'text');

  try {
    if (command === 'fetch') {
      const url = requireFirstArg('Error: URL required. Usage: arcfetch fetch <url>');
      await commandFetch({ ...options, url });
    } else if (command === 'list') {
      await commandList(jsonOrText(), options.pretty);
    } else if (command === 'promote') {
      const refId = requireFirstArg('Error: Reference ID required. Usage: arcfetch promote <ref-id>');
      await commandPromote(refId, jsonOrText(), options.pretty);
    } else if (command === 'delete') {
      const refId = requireFirstArg('Error: Reference ID required. Usage: arcfetch delete <ref-id>');
      await commandDelete(refId, jsonOrText(), options.pretty);
    } else if (command === 'config') {
      await commandConfig();
    } else {
      showHelp();
    }
  } catch (error) {
    console.error('Error:', error instanceof Error ? error.message : String(error));
    process.exit(1);
  }
}
457
+
458
// Start the CLI. A rejection that escapes main() is logged together with the
// rejection value itself, and the process exits with status 1.
main().catch((failure) => {
  console.error('Unexpected error:', failure);
  process.exit(1);
});