arcfetch 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/cli.ts +96 -122
- package/index.ts +71 -102
- package/package.json +2 -2
- package/src/config/defaults.ts +1 -5
- package/src/config/index.ts +1 -1
- package/src/config/loader.ts +5 -9
- package/src/config/schema.ts +1 -8
- package/src/core/cache.ts +12 -13
- package/src/core/extractor.ts +3 -7
- package/src/core/fetch-links.ts +85 -0
- package/src/core/index.ts +3 -2
- package/src/core/pipeline.ts +29 -13
- package/src/core/playwright/index.ts +1 -1
- package/src/core/playwright/local.ts +15 -6
- package/src/core/playwright/manager.ts +86 -14
- package/src/utils/markdown-cleaner.ts +41 -41
- package/src/utils/markdown-validator.ts +98 -26
package/README.md
CHANGED
@@ -194,7 +194,7 @@ Create `arcfetch.config.json` in your project root:
     "jsRetryThreshold": 85
   },
   "paths": {
-    "tempDir": ".tmp",
+    "tempDir": ".tmp/arcfetch",
     "docsDir": "docs/ai/references"
   },
   "playwright": {
@@ -208,7 +208,7 @@ Create `arcfetch.config.json` in your project root:
 
 ```bash
 ARCFETCH_MIN_SCORE=60
-ARCFETCH_TEMP_DIR=.tmp
+ARCFETCH_TEMP_DIR=.tmp/arcfetch
 ARCFETCH_DOCS_DIR=docs/ai/references
 ```
 
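The same default shows up in code in `package/src/config/defaults.ts` further down. A minimal sketch of how the documented settings resolve at runtime, assuming it runs from the package root with no `arcfetch.config.json` present; the env-variable mapping lives in `loadConfigFromEnv`, whose body this diff does not show, so the second half is an assumption based on the README:

```typescript
import { loadConfig } from './src/config/index';

// With no config file and no env overrides, the new default applies.
console.log(loadConfig().paths.tempDir); // ".tmp/arcfetch"

// Per the README, ARCFETCH_TEMP_DIR should override the built-in default.
process.env.ARCFETCH_TEMP_DIR = '.tmp/my-arcfetch';
console.log(loadConfig().paths.tempDir); // expected: ".tmp/my-arcfetch"
```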
package/cli.ts
CHANGED
@@ -1,10 +1,11 @@
 #!/usr/bin/env bun
 
-import { getVersion } from './src/utils/version';
-import { loadConfig } from './src/config/index';
-import { fetchUrl, closeBrowser } from './src/core/pipeline';
-import { saveToTemp, listCached, promoteReference, deleteCached, extractLinksFromCached } from './src/core/cache';
 import { serveMcp } from './index';
+import { loadConfig } from './src/config/index';
+import { deleteCached, extractLinksFromCached, listCached, promoteReference, saveToTemp } from './src/core/cache';
+import { type FetchLinkResult, fetchLinksFromRef } from './src/core/fetch-links';
+import { closeBrowser, fetchUrl } from './src/core/pipeline';
+import { getVersion } from './src/utils/version';
 
 // ============================================================================
 // HELP
@@ -39,7 +40,7 @@ OPTIONS:
   --refetch               Re-fetch and update even if URL already cached
   -v, --verbose           Show detailed output
   --min-quality <n>       Minimum quality score 0-100 (default: 60)
-  --temp-dir <path>       Temp folder (default: .tmp)
+  --temp-dir <path>       Temp folder (default: .tmp/arcfetch)
   --docs-dir <path>       Docs folder (default: docs/ai/references)
   --wait-strategy <mode>  Playwright wait strategy: networkidle, domcontentloaded, load
   --force-playwright      Skip simple fetch and use Playwright directly
@@ -142,7 +143,14 @@ async function commandFetch(options: FetchOptions): Promise<void> {
   }
 
   // Save to temp
-  const saveResult = await saveToTemp(
+  const saveResult = await saveToTemp(
+    config,
+    result.title!,
+    options.url,
+    result.markdown!,
+    options.query,
+    options.refetch
+  );
 
   // Small delay to ensure file is flushed to disk (Bun-specific issue)
   await new Promise((resolve) => setTimeout(resolve, 100));
@@ -159,13 +167,19 @@ async function commandFetch(options: FetchOptions): Promise<void> {
   // Handle already exists case
   if (saveResult.alreadyExists) {
     if (options.output === 'json') {
-      console.log(
-
-
-
-
-
-
+      console.log(
+        JSON.stringify(
+          {
+            success: true,
+            alreadyExists: true,
+            refId: saveResult.refId,
+            filepath: saveResult.filepath,
+            message: 'URL already fetched. Use --refetch to update.',
+          },
+          null,
+          2
+        )
+      );
     } else if (options.output === 'path') {
       console.log(saveResult.filepath);
     } else if (options.pretty) {
@@ -374,12 +388,18 @@ async function commandLinks(refId: string, output: 'text' | 'json', pretty: bool
   }
 
   if (output === 'json') {
-    console.log(
-
-
-
-
-
+    console.log(
+      JSON.stringify(
+        {
+          success: true,
+          sourceRef: result.sourceRef,
+          count: result.count,
+          links: result.links,
+        },
+        null,
+        2
+      )
+    );
     return;
   }
 
@@ -411,13 +431,6 @@ async function commandLinks(refId: string, output: 'text' | 'json', pretty: bool
 // FETCH-LINKS COMMAND
 // ============================================================================
 
-interface FetchLinksResult {
-  url: string;
-  status: 'new' | 'cached' | 'failed';
-  refId?: string;
-  error?: string;
-}
-
 async function commandFetchLinks(
   refId: string,
   output: 'text' | 'json',
@@ -426,120 +439,75 @@ async function commandFetchLinks(
   refetch: boolean
 ): Promise<void> {
   const config = loadConfig();
-  const linksResult = extractLinksFromCached(config, refId);
 
-
+  const printProgress =
+    output !== 'json'
+      ? (r: FetchLinkResult) => {
+          if (pretty) {
+            if (r.status === 'new') {
+              console.log(`\u2713 ${r.refId} (new)`);
+            } else if (r.status === 'cached') {
+              console.log(`\u25CB ${r.refId} (already cached)`);
+            } else {
+              console.log(`\u2717 ${r.url.slice(0, 50)}... (${r.error})`);
+            }
+          } else {
+            if (r.status === 'new') {
+              console.log(`new: ${r.refId}`);
+            } else if (r.status === 'cached') {
+              console.log(`cached: ${r.refId}`);
+            } else {
+              console.log(`failed: ${r.url} - ${r.error}`);
+            }
+          }
+        }
+      : undefined;
+
+  const { results, summary, error } = await fetchLinksFromRef(config, refId, {
+    refetch,
+    verbose,
+    onProgress: printProgress,
+  });
+
+  if (error) {
     if (output === 'json') {
-      console.log(JSON.stringify({ success: false, error
+      console.log(JSON.stringify({ success: false, error }, null, 2));
     } else {
-      console.error(`Error: ${
+      console.error(`Error: ${error}`);
     }
     process.exit(1);
   }
 
-  if (
+  if (results.length === 0) {
     if (output === 'json') {
       console.log(JSON.stringify({ success: true, message: 'No links to fetch', results: [] }, null, 2));
     } else if (pretty) {
-      console.log(
+      console.log(`No links found in ${refId}`);
     } else {
       console.log(`No links found in ${refId}`);
     }
     return;
   }
 
-  if (pretty) {
-    console.log(`🔗 Fetching ${linksResult.count} links from ${refId}...\n`);
-  } else if (output !== 'json') {
-    console.log(`Fetching ${linksResult.count} links from ${refId}...\n`);
-  }
-
-  const results: FetchLinksResult[] = [];
-  const concurrency = 5;
-  const urls = linksResult.links.map(l => l.href);
-
-  // Process in batches of 5
-  for (let i = 0; i < urls.length; i += concurrency) {
-    const batch = urls.slice(i, i + concurrency);
-    const batchPromises = batch.map(async (url): Promise<FetchLinksResult> => {
-      try {
-        const fetchResult = await fetchUrl(url, config, verbose);
-
-        if (!fetchResult.success) {
-          return { url, status: 'failed', error: fetchResult.error };
-        }
-
-        const saveResult = await saveToTemp(
-          config,
-          fetchResult.title!,
-          url,
-          fetchResult.markdown!,
-          undefined,
-          refetch
-        );
-
-        if (saveResult.error) {
-          return { url, status: 'failed', error: saveResult.error };
-        }
-
-        if (saveResult.alreadyExists) {
-          return { url, status: 'cached', refId: saveResult.refId };
-        }
-
-        return { url, status: 'new', refId: saveResult.refId };
-      } catch (error) {
-        const message = error instanceof Error ? error.message : String(error);
-        return { url, status: 'failed', error: message };
-      }
-    });
-
-    const batchResults = await Promise.all(batchPromises);
-    results.push(...batchResults);
-
-    // Print progress for non-json output
-    if (output !== 'json') {
-      for (const r of batchResults) {
-        if (pretty) {
-          if (r.status === 'new') {
-            console.log(`✓ ${r.refId} (new)`);
-          } else if (r.status === 'cached') {
-            console.log(`○ ${r.refId} (already cached)`);
-          } else {
-            console.log(`✗ ${r.url.slice(0, 50)}... (${r.error})`);
-          }
-        } else {
-          if (r.status === 'new') {
-            console.log(`new: ${r.refId}`);
-          } else if (r.status === 'cached') {
-            console.log(`cached: ${r.refId}`);
-          } else {
-            console.log(`failed: ${r.url} - ${r.error}`);
-          }
-        }
-      }
-    }
-  }
-
-  // Close browser after all fetches
-  await closeBrowser();
-
-  const newCount = results.filter(r => r.status === 'new').length;
-  const cachedCount = results.filter(r => r.status === 'cached').length;
-  const failedCount = results.filter(r => r.status === 'failed').length;
-
   if (output === 'json') {
-    console.log(
-
-
-
-
-
+    console.log(
+      JSON.stringify(
+        {
+          success: true,
+          sourceRef: refId,
+          summary,
+          results,
+        },
+        null,
+        2
+      )
+    );
   } else {
     console.log('');
     if (pretty) {
-      console.log(
+      console.log(`Summary: ${summary.new} new, ${summary.cached} cached, ${summary.failed} failed`);
     } else {
-      console.log(`Summary: ${
+      console.log(`Summary: ${summary.new} new, ${summary.cached} cached, ${summary.failed} failed`);
     }
   }
 }
@@ -548,7 +516,7 @@ async function commandFetchLinks(
 // ARGUMENT PARSING
 // ============================================================================
 
-interface ParsedOptions {
+export interface ParsedOptions {
   output: 'text' | 'json' | 'summary' | 'path';
   verbose: boolean;
   pretty: boolean;
@@ -561,7 +529,7 @@ interface ParsedOptions {
   forcePlaywright?: boolean;
 }
 
-function parseArgs(): { command: string; args: string[]; options: ParsedOptions } {
+export function parseArgs(): { command: string; args: string[]; options: ParsedOptions } {
   const args = process.argv.slice(2);
 
   if (args.length === 0) {
@@ -691,7 +659,13 @@ async function main(): Promise<void> {
         console.error('Error: Reference ID required. Usage: arcfetch fetch-links <ref-id>');
         process.exit(1);
       }
-      await commandFetchLinks(
+      await commandFetchLinks(
+        args[0],
+        options.output === 'json' ? 'json' : 'text',
+        options.pretty,
+        options.verbose,
+        options.refetch
+      );
       break;
 
     default:
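The `fetch-links` logic that used to live inline in `commandFetchLinks` now sits in the new `src/core/fetch-links.ts` module (+85 lines), whose contents are not included in this diff. A rough sketch of its public surface, inferred only from the call sites in `cli.ts` and `index.ts`; names, optionality, and the exact `summary` shape are assumptions:

```typescript
// Presumed public surface of src/core/fetch-links.ts, reconstructed from its callers.
import type { FetchiConfig } from '../config/schema';

// Mirrors the FetchLinksResult interface that cli.ts and index.ts previously duplicated.
export interface FetchLinkResult {
  url: string;
  status: 'new' | 'cached' | 'failed';
  refId?: string;
  error?: string;
}

export interface FetchLinksOptions {
  refetch?: boolean;
  verbose?: boolean;
  // Called once per fetched link; the CLI uses this to stream progress lines.
  onProgress?: (result: FetchLinkResult) => void;
}

export declare function fetchLinksFromRef(
  config: FetchiConfig,
  refId: string,
  options: FetchLinksOptions
): Promise<{
  results: FetchLinkResult[];
  summary: { new: number; cached: number; failed: number };
  error?: string;
}>;
```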
package/index.ts
CHANGED
@@ -1,4 +1,5 @@
 #!/usr/bin/env bun
+
 /**
  * Arcfetch MCP Server
  *
@@ -9,13 +10,14 @@
  * - delete_cached: Delete a cached reference
  */
 
-import { getVersion } from './src/utils/version';
 import { Server } from '@modelcontextprotocol/sdk/server/index.js';
 import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
 import { CallToolRequestSchema, ListToolsRequestSchema } from '@modelcontextprotocol/sdk/types.js';
 import { loadConfig } from './src/config/index';
-import {
-import {
+import { deleteCached, extractLinksFromCached, listCached, promoteReference, saveToTemp } from './src/core/cache';
+import { fetchLinksFromRef } from './src/core/fetch-links';
+import { closeBrowser, fetchUrl } from './src/core/pipeline';
+import { getVersion } from './src/utils/version';
 
 const server = new Server(
   {
@@ -59,7 +61,7 @@ Returns summary with title, author, excerpt. Use Read tool to access full conten
         },
         tempDir: {
           type: 'string',
-          description: 'Optional: Temp folder path (default: .tmp)',
+          description: 'Optional: Temp folder path (default: .tmp/arcfetch)',
         },
         outputFormat: {
           type: 'string',
@@ -83,7 +85,7 @@ Returns summary with title, author, excerpt. Use Read tool to access full conten
       properties: {
         tempDir: {
           type: 'string',
-          description: 'Optional: Temp folder path (default: .tmp)',
+          description: 'Optional: Temp folder path (default: .tmp/arcfetch)',
         },
       },
     },
@@ -123,7 +125,8 @@ Returns summary with title, author, excerpt. Use Read tool to access full conten
     },
     {
       name: 'extract_links',
-      description:
+      description:
+        'Extract all http/https links from a cached reference markdown. Returns list of links with their text and URLs.',
       inputSchema: {
         type: 'object',
         properties: {
@@ -142,7 +145,8 @@ Returns summary with title, author, excerpt. Use Read tool to access full conten
     },
     {
       name: 'fetch_links',
-      description:
+      description:
+        'Fetch all links from a cached reference. Extracts links and fetches each one, caching as new references. Uses parallel fetching (max 5 concurrent).',
       inputSchema: {
         type: 'object',
         properties: {
@@ -270,23 +274,31 @@ async function handleFetchUrl(args: {
   }
   if (outputFormat === 'json') {
     return {
-      content: [
-
-
-
-
-
-
-
-
-
+      content: [
+        {
+          type: 'text',
+          text: JSON.stringify(
+            {
+              success: true,
+              alreadyExists: true,
+              refId: saveResult.refId,
+              filepath: saveResult.filepath,
+              message: 'URL already fetched. Use refetch: true to update.',
+            },
+            null,
+            2
+          ),
+        },
+      ],
     };
   }
   return {
-    content: [
-
-
-
+    content: [
+      {
+        type: 'text',
+        text: `Already cached: ${saveResult.refId}\nFilepath: ${saveResult.filepath}\n\nUse refetch: true to update.`,
+      },
+    ],
   };
 }
 
@@ -425,15 +437,21 @@ async function handleExtractLinks(args: { refId: string; outputFormat?: 'summary
 
   if (args.outputFormat === 'json') {
     return {
-      content: [
-
-
-
-
-
-
-
-
+      content: [
+        {
+          type: 'text',
+          text: JSON.stringify(
+            {
+              success: true,
+              sourceRef: result.sourceRef,
+              count: result.count,
+              links: result.links,
+            },
+            null,
+            2
+          ),
+        },
+      ],
     };
   }
 
@@ -453,27 +471,22 @@ async function handleExtractLinks(args: { refId: string; outputFormat?: 'summary
   };
 }
 
-interface FetchLinksResult {
-  url: string;
-  status: 'new' | 'cached' | 'failed';
-  refId?: string;
-  error?: string;
-}
-
 async function handleFetchLinks(args: { refId: string; refetch?: boolean; outputFormat?: 'summary' | 'json' }) {
   const config = loadConfig();
-  const
+  const { results, summary, error } = await fetchLinksFromRef(config, args.refId, { refetch: args.refetch });
 
-  if (
+  if (error) {
     return {
-      content: [{ type: 'text', text: `Error: ${
+      content: [{ type: 'text', text: `Error: ${error}` }],
    };
  }
 
-  if (
+  if (results.length === 0) {
    if (args.outputFormat === 'json') {
      return {
-        content: [
+        content: [
+          { type: 'text', text: JSON.stringify({ success: true, message: 'No links to fetch', results: [] }, null, 2) },
+        ],
      };
    }
    return {
@@ -481,67 +494,23 @@ async function handleFetchLinks(args: { refId: string; refetch?: boolean; output
     };
   }
 
-  const results: FetchLinksResult[] = [];
-  const concurrency = 5;
-  const urls = linksResult.links.map(l => l.href);
-
-  // Process in batches of 5
-  for (let i = 0; i < urls.length; i += concurrency) {
-    const batch = urls.slice(i, i + concurrency);
-    const batchPromises = batch.map(async (url): Promise<FetchLinksResult> => {
-      try {
-        const fetchResult = await fetchUrl(url, config, false);
-
-        if (!fetchResult.success) {
-          return { url, status: 'failed', error: fetchResult.error };
-        }
-
-        const saveResult = await saveToTemp(
-          config,
-          fetchResult.title!,
-          url,
-          fetchResult.markdown!,
-          undefined,
-          args.refetch
-        );
-
-        if (saveResult.error) {
-          return { url, status: 'failed', error: saveResult.error };
-        }
-
-        if (saveResult.alreadyExists) {
-          return { url, status: 'cached', refId: saveResult.refId };
-        }
-
-        return { url, status: 'new', refId: saveResult.refId };
-      } catch (error) {
-        const message = error instanceof Error ? error.message : String(error);
-        return { url, status: 'failed', error: message };
-      }
-    });
-
-    const batchResults = await Promise.all(batchPromises);
-    results.push(...batchResults);
-  }
-
-  // Close browser after all fetches
-  await closeBrowser();
-
-  const newCount = results.filter(r => r.status === 'new').length;
-  const cachedCount = results.filter(r => r.status === 'cached').length;
-  const failedCount = results.filter(r => r.status === 'failed').length;
-
   if (args.outputFormat === 'json') {
     return {
-      content: [
-
-
-
-
-
-
-
-
+      content: [
+        {
+          type: 'text',
+          text: JSON.stringify(
+            {
+              success: true,
+              sourceRef: args.refId,
+              summary,
+              results,
+            },
+            null,
+            2
+          ),
+        },
+      ],
     };
   }
 
@@ -555,7 +524,7 @@ async function handleFetchLinks(args: { refId: string; refetch?: boolean; output
       text += `failed: ${r.url} - ${r.error}\n`;
     }
   }
-  text += `\nSummary: ${
+  text += `\nSummary: ${summary.new} new, ${summary.cached} cached, ${summary.failed} failed`;
 
   return {
     content: [{ type: 'text', text }],
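For orientation, the JSON text payload the `fetch_links` tool returns after this change (per the `JSON.stringify` call above) looks roughly like the object below; the field names follow the handler, but the concrete values are invented for illustration:

```typescript
// Illustrative payload only: shape taken from handleFetchLinks, values hypothetical.
const exampleFetchLinksPayload = {
  success: true,
  sourceRef: 'some-cached-ref',              // args.refId
  summary: { new: 1, cached: 1, failed: 1 }, // counts across all fetched links
  results: [
    { url: 'https://example.com/a', status: 'new', refId: 'example-com-a' },
    { url: 'https://example.com/b', status: 'cached', refId: 'example-com-b' },
    { url: 'https://example.com/broken', status: 'failed', error: 'fetch failed' },
  ],
};
```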
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "arcfetch",
-  "version": "1.1.0",
+  "version": "1.2.0",
   "description": "Fetch URLs, extract clean article content, and cache as markdown. Supports automatic JavaScript rendering via Playwright.",
   "type": "module",
   "main": "index.ts",
@@ -49,7 +49,7 @@
     "@semantic-release/changelog": "^6.0.3",
     "@semantic-release/git": "^10.0.1",
     "@semantic-release/github": "^12.0.2",
-    "@types/bun": "
+    "@types/bun": "^1.3.0",
     "@types/turndown": "^5.0.5",
     "semantic-release": "^25.0.2"
   },
package/src/config/defaults.ts
CHANGED
@@ -6,15 +6,11 @@ export const DEFAULT_CONFIG: FetchiConfig = {
     jsRetryThreshold: 85,
   },
   paths: {
-    tempDir: '.tmp',
+    tempDir: '.tmp/arcfetch',
     docsDir: 'docs/ai/references',
   },
   playwright: {
     timeout: 30000,
     waitStrategy: 'networkidle',
   },
-  retry: {
-    maxAttempts: 2,
-    backoffMs: 1000,
-  },
 };
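Putting the hunk back together, the default config after this release presumably reads as below. `quality.minScore` is outside the hunk (it starts at line 6), so the value 60 is taken from the documented CLI/README default, and the matching `retry` removal would also explain the `-8` in `src/config/schema.ts` from the file list above:

```typescript
import type { FetchiConfig } from './schema';

// Sketch of src/config/defaults.ts after 1.2.0; minScore is assumed from the documented default.
export const DEFAULT_CONFIG: FetchiConfig = {
  quality: {
    minScore: 60, // assumed: default documented in the CLI help and README
    jsRetryThreshold: 85,
  },
  paths: {
    tempDir: '.tmp/arcfetch',
    docsDir: 'docs/ai/references',
  },
  playwright: {
    timeout: 30000,
    waitStrategy: 'networkidle',
  },
  // retry: { maxAttempts, backoffMs } was removed in this release
};
```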
package/src/config/index.ts
CHANGED
package/src/config/loader.ts
CHANGED
@@ -1,17 +1,13 @@
 import { existsSync, readFileSync } from 'node:fs';
 import { join } from 'node:path';
-import { FetchiConfigSchema, type FetchiConfig } from './schema';
 import { DEFAULT_CONFIG } from './defaults';
+import { type FetchiConfig, FetchiConfigSchema } from './schema';
 
 type DeepPartial<T> = {
   [P in keyof T]?: T[P] extends object ? DeepPartial<T[P]> : T[P];
 };
 
-const CONFIG_FILES = [
-  'arcfetch.config.json',
-  '.arcfetchrc',
-  '.arcfetchrc.json',
-];
+const CONFIG_FILES = ['arcfetch.config.json', '.arcfetchrc', '.arcfetchrc.json'];
 
 export function findConfigFile(cwd: string = process.cwd()): string | null {
   for (const file of CONFIG_FILES) {
@@ -69,16 +65,16 @@ export interface CliConfigOverrides {
 export function loadConfig(cliOverrides: CliConfigOverrides = {}): FetchiConfig {
   // Deep copy to avoid mutating DEFAULT_CONFIG
   let config: FetchiConfig = JSON.parse(JSON.stringify(DEFAULT_CONFIG));
-
+
   const configFile = findConfigFile();
   if (configFile) {
     const fileConfig = loadConfigFromFile(configFile);
     config = deepMerge(config, fileConfig);
   }
-
+
   const envConfig = loadConfigFromEnv();
   config = deepMerge(config, envConfig);
-
+
   if (cliOverrides.minQuality !== undefined) {
     config.quality.minScore = cliOverrides.minQuality;
   }