arcfetch 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +215 -0
- package/cli.ts +455 -0
- package/index.ts +331 -0
- package/package.json +70 -0
- package/src/config/defaults.ts +22 -0
- package/src/config/index.ts +3 -0
- package/src/config/loader.ts +130 -0
- package/src/config/schema.ts +36 -0
- package/src/core/cache.ts +260 -0
- package/src/core/extractor.ts +87 -0
- package/src/core/index.ts +4 -0
- package/src/core/pipeline.ts +181 -0
- package/src/core/playwright/docker.ts +138 -0
- package/src/core/playwright/index.ts +3 -0
- package/src/core/playwright/local.ts +38 -0
- package/src/core/playwright/manager.ts +89 -0
- package/src/core/playwright/types.ts +12 -0
- package/src/types/turndown-plugin-gfm.d.ts +8 -0
- package/src/utils/markdown-cleaner.ts +79 -0
- package/src/utils/markdown-validator.ts +136 -0
package/index.ts
ADDED
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
 * arcfetch MCP Server v3.0
|
|
4
|
+
*
|
|
5
|
+
* Tools:
|
|
6
|
+
* - fetch_url: Fetch URL with automatic JS fallback, save to temp
|
|
7
|
+
* - list_cached: List all cached references
|
|
8
|
+
* - promote_reference: Move from temp to docs folder
|
|
9
|
+
* - delete_cached: Delete a cached reference
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
|
|
13
|
+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
14
|
+
import { CallToolRequestSchema, ListToolsRequestSchema } from '@modelcontextprotocol/sdk/types.js';
|
|
15
|
+
import { loadConfig } from './src/config/index.js';
|
|
16
|
+
import { fetchUrl, closeBrowser } from './src/core/pipeline.js';
|
|
17
|
+
import { saveToTemp, listCached, promoteReference, deleteCached } from './src/core/cache.js';
|
|
18
|
+
|
|
19
|
+
/** Identity this process reports to MCP clients. */
const serverInfo = { name: 'arcfetch', version: '3.0.0' };

/** Only the `tools` capability is advertised. */
const serverOptions = { capabilities: { tools: {} } };

/** MCP server instance; request handlers are registered on it below. */
const server = new Server(serverInfo, serverOptions);
|
|
30
|
+
|
|
31
|
+
/**
 * Handles `tools/list`: returns the four tools exposed by this server, each
 * with a description and the JSON Schema of its arguments.
 */
server.setRequestHandler(ListToolsRequestSchema, async () => {
  const fetchUrlTool = {
    name: 'fetch_url',
    description: `Fetch URL, extract article content, convert to clean markdown, and save to temp folder.

Features:
- Automatic JavaScript rendering fallback (via Playwright/Docker)
- Quality validation with configurable thresholds
- 90-95% token reduction vs raw HTML

Returns summary with title, author, excerpt. Use Read tool to access full content.`,
    inputSchema: {
      type: 'object',
      properties: {
        url: { type: 'string', description: 'URL to fetch' },
        query: {
          type: 'string',
          description: "Optional: What you're looking for (saved as metadata)",
        },
        minQuality: {
          type: 'number',
          description: 'Optional: Minimum quality score 0-100 (default: 60)',
        },
        tempDir: {
          type: 'string',
          description: 'Optional: Temp folder path (default: .tmp)',
        },
        outputFormat: {
          type: 'string',
          description: 'Output format: summary (default), path (filepath only), json (structured data)',
          enum: ['summary', 'path', 'json'],
        },
      },
      required: ['url'],
    },
  };

  const listCachedTool = {
    name: 'list_cached',
    description:
      'List all cached references in the temp folder. Shows ref ID, title, date, size, and URL for each.',
    inputSchema: {
      type: 'object',
      properties: {
        tempDir: {
          type: 'string',
          description: 'Optional: Temp folder path (default: .tmp)',
        },
      },
    },
  };

  const promoteReferenceTool = {
    name: 'promote_reference',
    description:
      "Move a cached reference from temp folder to permanent docs folder. Updates status from 'temporary' to 'permanent'.",
    inputSchema: {
      type: 'object',
      properties: {
        refId: { type: 'string', description: 'Reference ID (e.g., REF-001)' },
        docsDir: {
          type: 'string',
          description: 'Optional: Docs folder path (default: docs/ai/references)',
        },
      },
      required: ['refId'],
    },
  };

  const deleteCachedTool = {
    name: 'delete_cached',
    description: 'Delete a cached reference from the temp folder.',
    inputSchema: {
      type: 'object',
      properties: {
        refId: { type: 'string', description: 'Reference ID to delete (e.g., REF-001)' },
      },
      required: ['refId'],
    },
  };

  // Order matches the tool list in the file header.
  return {
    tools: [fetchUrlTool, listCachedTool, promoteReferenceTool, deleteCachedTool],
  };
});
|
|
122
|
+
|
|
123
|
+
/**
 * Handles `tools/call` by dispatching to the handler for the named tool.
 *
 * A call may omit `arguments` entirely (for example `list_cached`, which has
 * no required arguments), so a missing value is treated as an empty object
 * instead of being handed to a handler as `undefined`.
 *
 * @throws Error if the tool name is unknown or a required string argument is
 *   missing or empty.
 */
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const { name } = request.params;
  const args: Record<string, unknown> = request.params.arguments ?? {};

  // Reject calls that lack a required argument with a clear message rather
  // than letting the handler operate on `undefined`.
  const requireString = (key: string): void => {
    const value = args[key];
    if (typeof value !== 'string' || value === '') {
      throw new Error(`Missing required argument '${key}' for tool: ${name}`);
    }
  };

  switch (name) {
    case 'fetch_url':
      requireString('url');
      return handleFetchUrl(
        args as {
          url: string;
          query?: string;
          minQuality?: number;
          tempDir?: string;
          outputFormat?: 'summary' | 'path' | 'json';
        }
      );

    case 'list_cached':
      return handleListCached(args as { tempDir?: string });

    case 'promote_reference':
      requireString('refId');
      return handlePromoteReference(args as { refId: string; docsDir?: string });

    case 'delete_cached':
      requireString('refId');
      return handleDeleteCached(args as { refId: string });

    default:
      throw new Error(`Unknown tool: ${name}`);
  }
});
|
|
164
|
+
|
|
165
|
+
/**
 * Runs `fetchUrl` and then `closeBrowser`, including when `fetchUrl` throws,
 * so a failed fetch does not skip the browser cleanup.
 */
async function fetchThenCloseBrowser(url: string, config: ReturnType<typeof loadConfig>) {
  try {
    return await fetchUrl(url, config, false);
  } finally {
    await closeBrowser();
  }
}

/**
 * Implements the `fetch_url` tool: fetches the URL, saves the resulting
 * markdown through `saveToTemp`, and reports the outcome in the requested
 * output format.
 *
 * @param args.url URL to fetch.
 * @param args.query Optional text forwarded to `saveToTemp` and echoed in the JSON output.
 * @param args.minQuality Optional override passed to `loadConfig`.
 * @param args.tempDir Optional override passed to `loadConfig`.
 * @param args.outputFormat `summary` (default), `path`, or `json`.
 * @returns A tool result with one text item; fetch and save failures are
 *   flagged with `isError: true`.
 */
async function handleFetchUrl(args: {
  url: string;
  query?: string;
  minQuality?: number;
  tempDir?: string;
  outputFormat?: 'summary' | 'path' | 'json';
}) {
  const config = loadConfig({
    minQuality: args.minQuality,
    tempDir: args.tempDir,
  });

  const result = await fetchThenCloseBrowser(args.url, config);

  if (!result.success) {
    const errorText =
      `Error: ${result.error}` +
      (result.suggestion ? `\nSuggestion: ${result.suggestion}` : '') +
      (result.quality ? `\nQuality: ${result.quality.score}/100` : '');
    return {
      content: [{ type: 'text', text: errorText }],
      isError: true,
    };
  }

  // A successful result is expected to carry a title and markdown; the
  // assertions are made once here instead of at every use below.
  const title = result.title!;
  const markdown = result.markdown!;

  const saveResult = await saveToTemp(config, title, args.url, markdown, args.query);

  if (saveResult.error) {
    return {
      content: [{ type: 'text', text: `Error: Save failed: ${saveResult.error}` }],
      isError: true,
    };
  }

  const outputFormat = args.outputFormat ?? 'summary';
  // Rough token estimate: one token per four characters.
  const approxTokens = Math.round(markdown.length / 4);

  // Path-only output
  if (outputFormat === 'path') {
    return {
      content: [{ type: 'text', text: saveResult.filepath }],
    };
  }

  // JSON output
  if (outputFormat === 'json') {
    const jsonData = {
      success: true,
      refId: saveResult.refId,
      title: result.title,
      byline: result.byline,
      siteName: result.siteName,
      excerpt: result.excerpt,
      url: args.url,
      filepath: saveResult.filepath,
      size: markdown.length,
      tokens: approxTokens,
      quality: result.quality?.score,
      usedPlaywright: result.usedPlaywright,
      playwrightReason: result.playwrightReason,
      query: args.query,
    };
    return {
      content: [{ type: 'text', text: JSON.stringify(jsonData, null, 2) }],
    };
  }

  // Summary output (default) - clean, LLM-friendly
  const lines: string[] = [`Cached: ${saveResult.refId}`, '', `Title: ${title}`];
  if (result.byline) lines.push(`Author: ${result.byline}`);
  if (result.siteName) lines.push(`Source: ${result.siteName}`);
  if (result.excerpt) {
    const excerpt = result.excerpt.slice(0, 150);
    lines.push(`Summary: ${excerpt}${result.excerpt.length > 150 ? '...' : ''}`);
  }
  lines.push('', `Filepath: ${saveResult.filepath}`);
  lines.push(`Size: ${markdown.length} chars (~${approxTokens} tokens)`);
  // Omit the quality line rather than printing "undefined/100".
  if (result.quality?.score !== undefined) {
    lines.push(`Quality: ${result.quality.score}/100`);
  }
  if (result.usedPlaywright) {
    lines.push(`Playwright: Yes${result.playwrightReason ? ` (${result.playwrightReason})` : ''}`);
  }

  return {
    content: [{ type: 'text', text: lines.join('\n') }],
  };
}
|
|
252
|
+
|
|
253
|
+
/**
 * Implements the `list_cached` tool: lists the references returned by
 * `listCached` for the configured temp folder.
 *
 * @param args.tempDir Optional override passed to `loadConfig`. `args` itself
 *   defaults to `{}` because this tool has no required arguments and may be
 *   called without any.
 * @returns A tool result with one text item; a listing failure is flagged
 *   with `isError: true`.
 */
async function handleListCached(args: { tempDir?: string } = {}) {
  const config = loadConfig({ tempDir: args.tempDir });
  const result = listCached(config);

  if (result.error) {
    return {
      content: [{ type: 'text', text: `Error: ${result.error}` }],
      isError: true,
    };
  }

  if (result.references.length === 0) {
    return {
      content: [{ type: 'text', text: `No cached references in ${config.paths.tempDir}/` }],
    };
  }

  // Cut long values and mark the cut with an ellipsis.
  const truncate = (value: string, max: number): string =>
    `${value.slice(0, max)}${value.length > max ? '...' : ''}`;

  const entries = result.references.map(
    (ref) =>
      `${ref.refId} | ${truncate(ref.title, 50)}\n` +
      ` Date: ${ref.fetchedDate} | Size: ${Math.round(ref.size / 1024)}KB\n` +
      ` URL: ${truncate(ref.url, 60)}\n\n`
  );

  const text = `Cached references (${result.references.length}):\n\n${entries.join('')}`;

  return {
    content: [{ type: 'text', text: text.trim() }],
  };
}
|
|
281
|
+
|
|
282
|
+
/**
 * Implements the `promote_reference` tool by delegating to `promoteReference`.
 *
 * @param args.refId Reference ID to promote.
 * @param args.docsDir Optional override passed to `loadConfig`.
 * @returns A tool result with one text item naming the source and destination
 *   paths; a failure is flagged with `isError: true`.
 */
async function handlePromoteReference(args: { refId: string; docsDir?: string }) {
  const config = loadConfig({ docsDir: args.docsDir });
  const result = promoteReference(config, args.refId);

  if (!result.success) {
    return {
      content: [{ type: 'text', text: `Error: ${result.error}` }],
      isError: true,
    };
  }

  const text = [`Promoted: ${args.refId}`, `From: ${result.fromPath}`, `To: ${result.toPath}`].join('\n');

  return {
    content: [{ type: 'text', text }],
  };
}
|
|
301
|
+
|
|
302
|
+
/**
 * Implements the `delete_cached` tool by delegating to `deleteCached` with
 * the default configuration (no per-call overrides).
 *
 * @param args.refId Reference ID to delete.
 * @returns A tool result with one text item naming the deleted file; a
 *   failure is flagged with `isError: true`.
 */
async function handleDeleteCached(args: { refId: string }) {
  const config = loadConfig();
  const result = deleteCached(config, args.refId);

  if (!result.success) {
    return {
      content: [{ type: 'text', text: `Error: ${result.error}` }],
      isError: true,
    };
  }

  const text = `Deleted: ${args.refId}\nFile: ${result.filepath}`;

  return {
    content: [{ type: 'text', text }],
  };
}
|
|
321
|
+
|
|
322
|
+
/**
 * Connects the server to a stdio transport and logs a startup notice.
 *
 * The notice is written with `console.error`, i.e. to stderr, and names the
 * server `arcfetch` to match the name it registers under.
 */
async function main(): Promise<void> {
  const transport = new StdioServerTransport();
  await server.connect(transport);
  console.error('arcfetch MCP server v3.0 running on stdio');
}
|
|
327
|
+
|
|
328
|
+
/** Logs a startup failure to stderr and exits with status 1. */
function exitOnServerError(error: unknown): never {
  console.error('Server error:', error);
  process.exit(1);
}

main().catch(exitOnServerError);
|
package/package.json
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "arcfetch",
|
|
3
|
+
"version": "3.0.0",
|
|
4
|
+
"description": "Fetch URLs, extract clean article content, and cache as markdown. Supports automatic JavaScript rendering via Playwright/Docker.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "index.ts",
|
|
7
|
+
"bin": {
|
|
8
|
+
"arcfetch": "cli.ts"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"cli.ts",
|
|
12
|
+
"index.ts",
|
|
13
|
+
"src",
|
|
14
|
+
"README.md",
|
|
15
|
+
"LICENSE"
|
|
16
|
+
],
|
|
17
|
+
"scripts": {
|
|
18
|
+
"dev": "bun run index.ts",
|
|
19
|
+
"start": "bun run index.ts",
|
|
20
|
+
"cli": "bun run cli.ts",
|
|
21
|
+
"fetch": "bun run cli.ts fetch",
|
|
22
|
+
"list": "bun run cli.ts list",
|
|
23
|
+
"test": "bun test",
|
|
24
|
+
"test:unit": "bun test tests/unit/",
|
|
25
|
+
"test:integration": "bun test tests/integration/",
|
|
26
|
+
"test:e2e": "bun test tests/e2e/",
|
|
27
|
+
"test:coverage": "bun test --coverage",
|
|
28
|
+
"typecheck": "tsc --noEmit",
|
|
29
|
+
"lint": "biome lint .",
|
|
30
|
+
"lint:fix": "biome lint --write .",
|
|
31
|
+
"format": "biome format --write .",
|
|
32
|
+
"check": "biome check .",
|
|
33
|
+
"check:fix": "biome check --write .",
|
|
34
|
+
"postinstall": "playwright install chromium || true"
|
|
35
|
+
},
|
|
36
|
+
"dependencies": {
|
|
37
|
+
"@modelcontextprotocol/sdk": "^0.6.0",
|
|
38
|
+
"@mozilla/readability": "^0.6.0",
|
|
39
|
+
"linkedom": "^0.18.12",
|
|
40
|
+
"playwright": "^1.56.0",
|
|
41
|
+
"playwright-extra": "^4.3.6",
|
|
42
|
+
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
|
43
|
+
"turndown": "^7.2.0",
|
|
44
|
+
"turndown-plugin-gfm": "^1.0.2",
|
|
45
|
+
"zod": "^3.22.4"
|
|
46
|
+
},
|
|
47
|
+
"devDependencies": {
|
|
48
|
+
"@biomejs/biome": "^2.3.10",
|
|
49
|
+
"@types/bun": "latest",
|
|
50
|
+
"@types/turndown": "^5.0.5"
|
|
51
|
+
},
|
|
52
|
+
"keywords": [
|
|
53
|
+
"fetch",
|
|
54
|
+
"markdown",
|
|
55
|
+
"readability",
|
|
56
|
+
"mcp",
|
|
57
|
+
"playwright",
|
|
58
|
+
"web-scraping",
|
|
59
|
+
"article-extraction"
|
|
60
|
+
],
|
|
61
|
+
"author": "",
|
|
62
|
+
"license": "MIT",
|
|
63
|
+
"repository": {
|
|
64
|
+
"type": "git",
|
|
65
|
+
"url": "git+https://github.com/briansunter/arcfetch.git"
|
|
66
|
+
},
|
|
67
|
+
"publishConfig": {
|
|
68
|
+
"access": "public"
|
|
69
|
+
}
|
|
70
|
+
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import type { FetchiConfig } from './schema.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Built-in configuration used as the base layer before config-file,
 * environment and per-call overrides are applied.
 */
export const DEFAULT_CONFIG: FetchiConfig = {
  // Score values are on a 0-100 scale (see the schema bounds).
  quality: { minScore: 60, jsRetryThreshold: 85 },
  paths: { tempDir: '.tmp', docsDir: 'docs/ai/references' },
  playwright: {
    mode: 'auto',
    // NOTE(review): package.json depends on playwright ^1.56.0 while this
    // image is tagged v1.40.0 - confirm the mismatch is intended.
    dockerImage: 'mcr.microsoft.com/playwright:v1.40.0-jammy',
    timeout: 30_000, // presumably milliseconds - TODO confirm against the Playwright callers
    waitStrategy: 'networkidle',
  },
  retry: { maxAttempts: 2, backoffMs: 1_000 },
};
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
import { existsSync, readFileSync } from 'node:fs';
|
|
2
|
+
import { join } from 'node:path';
|
|
3
|
+
import { FetchiConfigSchema, type FetchiConfig } from './schema.js';
|
|
4
|
+
import { DEFAULT_CONFIG } from './defaults.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Recursive `Partial`: every property is optional, and properties whose type
 * is an object are themselves made deeply partial.
 */
type DeepPartial<TObject> = {
  [Key in keyof TObject]?: TObject[Key] extends object ? DeepPartial<TObject[Key]> : TObject[Key];
};
|
|
9
|
+
|
|
10
|
+
/** Config file names that are probed, in priority order (first match wins). */
const CONFIG_FILES: readonly string[] = ['arcfetch.config.json', '.arcfetchrc', '.arcfetchrc.json'];
|
|
15
|
+
|
|
16
|
+
/**
 * Looks for a config file directly inside `cwd`.
 *
 * @param cwd Directory to search; defaults to the process working directory.
 * @returns Path of the first name in `CONFIG_FILES` that exists, or `null`
 *   if none does.
 */
export function findConfigFile(cwd: string = process.cwd()): string | null {
  const match = CONFIG_FILES.map((file) => join(cwd, file)).find((candidate) => existsSync(candidate));
  return match ?? null;
}
|
|
25
|
+
|
|
26
|
+
/**
 * Reads a JSON config file.
 *
 * This is best-effort: an unreadable file, invalid JSON, or a JSON document
 * whose root is not an object (`null`, an array, a string, ...) produces a
 * warning and an empty config instead of an exception, so that a broken
 * config file never reaches the merge step as a non-object.
 *
 * @param path Path of the file to read.
 * @returns The parsed object, or `{}` if the file could not be used. The
 *   contents are not validated here.
 */
export function loadConfigFromFile(path: string): Partial<FetchiConfig> {
  try {
    const parsed: unknown = JSON.parse(readFileSync(path, 'utf-8'));
    if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
      console.warn(`Warning: Could not load config from ${path}: expected a JSON object`);
      return {};
    }
    return parsed as Partial<FetchiConfig>;
  } catch (error: unknown) {
    // Include the reason so a typo in the file can actually be diagnosed.
    const reason = error instanceof Error ? error.message : String(error);
    console.warn(`Warning: Could not load config from ${path}: ${reason}`);
    return {};
  }
}
|
|
35
|
+
|
|
36
|
+
/**
 * Builds a partial config from `SOFETCH_*` environment variables.
 *
 * Recognised variables: `SOFETCH_MIN_SCORE`, `SOFETCH_JS_RETRY_THRESHOLD`,
 * `SOFETCH_TEMP_DIR`, `SOFETCH_DOCS_DIR`, `SOFETCH_PLAYWRIGHT_MODE` and
 * `SOFETCH_DOCKER_IMAGE`. Unset or empty variables are skipped. A numeric
 * variable that does not parse, or a mode other than `local`/`docker`/`auto`,
 * is ignored with a warning instead of putting `NaN` or an invalid value into
 * the config.
 *
 * @returns Only the sections and keys for which a usable value was found.
 */
export function loadConfigFromEnv(): DeepPartial<FetchiConfig> {
  const env = process.env;
  const config: DeepPartial<FetchiConfig> = {};

  // Parse a base-10 integer variable; undefined when unset, empty or unparsable.
  const readInt = (name: string): number | undefined => {
    const raw = env[name];
    if (!raw) return undefined;
    const value = parseInt(raw, 10);
    if (Number.isNaN(value)) {
      console.warn(`Warning: Ignoring ${name}="${raw}" (not a number)`);
      return undefined;
    }
    return value;
  };

  const minScore = readInt('SOFETCH_MIN_SCORE');
  if (minScore !== undefined) {
    config.quality = config.quality || {};
    config.quality.minScore = minScore;
  }
  const jsRetryThreshold = readInt('SOFETCH_JS_RETRY_THRESHOLD');
  if (jsRetryThreshold !== undefined) {
    config.quality = config.quality || {};
    config.quality.jsRetryThreshold = jsRetryThreshold;
  }

  if (env.SOFETCH_TEMP_DIR) {
    config.paths = config.paths || {};
    config.paths.tempDir = env.SOFETCH_TEMP_DIR;
  }
  if (env.SOFETCH_DOCS_DIR) {
    config.paths = config.paths || {};
    config.paths.docsDir = env.SOFETCH_DOCS_DIR;
  }

  const mode = env.SOFETCH_PLAYWRIGHT_MODE;
  if (mode) {
    if (mode === 'local' || mode === 'docker' || mode === 'auto') {
      config.playwright = config.playwright || {};
      config.playwright.mode = mode;
    } else {
      console.warn(`Warning: Ignoring SOFETCH_PLAYWRIGHT_MODE="${mode}" (expected local, docker or auto)`);
    }
  }
  if (env.SOFETCH_DOCKER_IMAGE) {
    config.playwright = config.playwright || {};
    config.playwright.dockerImage = env.SOFETCH_DOCKER_IMAGE;
  }

  return config;
}
|
|
71
|
+
|
|
72
|
+
/**
 * Per-call overrides accepted by `loadConfig`. Every field is optional; a
 * field left `undefined` does not override anything.
 */
export interface CliConfigOverrides {
  /** Overrides `quality.minScore`. */
  minQuality?: number;
  /** Overrides `quality.jsRetryThreshold`. */
  jsRetryThreshold?: number;
  /** Overrides `paths.tempDir`. */
  tempDir?: string;
  /** Overrides `paths.docsDir`. */
  docsDir?: string;
  /** Overrides `playwright.mode`. */
  playwrightMode?: 'local' | 'docker' | 'auto';
  /** Overrides `playwright.timeout`. */
  timeout?: number;
}
|
|
80
|
+
|
|
81
|
+
/**
 * Resolves the effective configuration by layering, lowest priority first:
 * built-in defaults, the config file found by `findConfigFile` (if any),
 * environment variables, and finally the explicit overrides.
 *
 * @param cliOverrides Highest-priority overrides; `undefined` fields are skipped.
 * @returns The merged configuration after validation by `FetchiConfigSchema`.
 * @throws Whatever `FetchiConfigSchema.parse` throws when the merged result
 *   is invalid.
 */
export function loadConfig(cliOverrides: CliConfigOverrides = {}): FetchiConfig {
  // JSON round-trip gives a deep copy so DEFAULT_CONFIG is never mutated.
  const defaults: FetchiConfig = JSON.parse(JSON.stringify(DEFAULT_CONFIG));

  const configFile = findConfigFile();
  const withFile = configFile ? deepMerge(defaults, loadConfigFromFile(configFile)) : defaults;
  const config = deepMerge(withFile, loadConfigFromEnv());

  const { minQuality, jsRetryThreshold, tempDir, docsDir, playwrightMode, timeout } = cliOverrides;

  if (minQuality !== undefined) config.quality.minScore = minQuality;
  if (jsRetryThreshold !== undefined) config.quality.jsRetryThreshold = jsRetryThreshold;
  if (tempDir !== undefined) config.paths.tempDir = tempDir;
  if (docsDir !== undefined) config.paths.docsDir = docsDir;
  if (playwrightMode !== undefined) config.playwright.mode = playwrightMode;
  if (timeout !== undefined) config.playwright.timeout = timeout;

  return FetchiConfigSchema.parse(config);
}
|
|
115
|
+
|
|
116
|
+
/** True for non-null, non-array objects - the only values merged key by key. */
function isMergeableObject(value: unknown): value is Record<string, unknown> {
  return value !== null && typeof value === 'object' && !Array.isArray(value);
}

/**
 * Recursively merges `source` into a copy of `target`.
 *
 * - Object values in `source` are merged key by key into the matching object
 *   in `target`. If the target value is not an object (missing, primitive or
 *   array) the merge starts from an empty object instead of spreading it.
 * - Any other defined value in `source` replaces the target value; arrays are
 *   replaced, not concatenated.
 * - `undefined` values in `source` are ignored.
 * - The keys `__proto__`, `constructor` and `prototype` are skipped so that a
 *   parsed config file cannot swap the prototype of the result.
 *
 * Neither argument is mutated. Untouched nested values are shared by
 * reference with `target`.
 */
function deepMerge<T extends Record<string, unknown>>(target: T, source: DeepPartial<T>): T {
  const result: Record<string, unknown> = { ...target };

  for (const key of Object.keys(source)) {
    if (key === '__proto__' || key === 'constructor' || key === 'prototype') {
      continue;
    }

    const sourceValue = (source as Record<string, unknown>)[key];
    if (isMergeableObject(sourceValue)) {
      const targetValue = result[key];
      result[key] = deepMerge(
        isMergeableObject(targetValue) ? targetValue : {},
        sourceValue as DeepPartial<Record<string, unknown>>
      );
    } else if (sourceValue !== undefined) {
      result[key] = sourceValue;
    }
  }

  return result as T;
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
|
|
3
|
+
/** A number restricted to 0-100 with the given default. */
const scoreField = (fallback: number) => z.number().min(0).max(100).default(fallback);

/** Quality thresholds; both values are bounded to 0-100. */
export const QualityConfigSchema = z.object({
  minScore: scoreField(60),
  jsRetryThreshold: scoreField(85),
});
|
|
7
|
+
|
|
8
|
+
const pathsShape = {
  tempDir: z.string().default('.tmp'),
  docsDir: z.string().default('docs/ai/references'),
};

/** Folder locations: the temp folder and the permanent docs folder. */
export const PathsConfigSchema = z.object(pathsShape);
|
|
12
|
+
|
|
13
|
+
const PLAYWRIGHT_MODES = ['local', 'docker', 'auto'] as const;
const WAIT_STRATEGIES = ['networkidle', 'domcontentloaded', 'load'] as const;

/** Playwright settings: execution mode, Docker image, timeout and wait strategy. */
export const PlaywrightConfigSchema = z.object({
  mode: z.enum(PLAYWRIGHT_MODES).default('auto'),
  dockerImage: z.string().default('mcr.microsoft.com/playwright:v1.40.0-jammy'),
  timeout: z.number().default(30000),
  waitStrategy: z.enum(WAIT_STRATEGIES).default('networkidle'),
});
|
|
19
|
+
|
|
20
|
+
const retryShape = {
  maxAttempts: z.number().default(2),
  backoffMs: z.number().default(1000),
};

/** Retry settings: attempt count and backoff in milliseconds. */
export const RetryConfigSchema = z.object(retryShape);
|
|
24
|
+
|
|
25
|
+
/**
 * Top-level configuration schema. Each section defaults to `{}`, so a
 * missing section is filled entirely from that section's field defaults.
 */
export const FetchiConfigSchema = z.object({
  quality: QualityConfigSchema.default({}),
  paths: PathsConfigSchema.default({}),
  playwright: PlaywrightConfigSchema.default({}),
  retry: RetryConfigSchema.default({}),
});
|
|
31
|
+
|
|
32
|
+
// Static types derived from the schemas above, one per section plus the whole.
export type QualityConfig = z.infer<typeof QualityConfigSchema>;
export type PathsConfig = z.infer<typeof PathsConfigSchema>;
export type PlaywrightConfig = z.infer<typeof PlaywrightConfigSchema>;
export type RetryConfig = z.infer<typeof RetryConfigSchema>;
export type FetchiConfig = z.infer<typeof FetchiConfigSchema>;
|