mallmaverick-store-scraper 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/mcp-server.js +94 -1
- package/src/storeExtractor.js +6 -0
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mallmaverick-store-scraper",
|
|
3
|
-
"version": "0.1.3",
|
|
3
|
+
"version": "0.1.4",
|
|
4
4
|
"description": "MCP server + CLI for scraping shopping mall store directories. Hours-first layered pipeline + image classification.",
|
|
5
5
|
"main": "src/main.js",
|
|
6
6
|
"type": "commonjs",
|
package/src/mcp-server.js
CHANGED
|
@@ -113,6 +113,15 @@ const TOOLS = [
|
|
|
113
113
|
required: ['store_url'],
|
|
114
114
|
},
|
|
115
115
|
},
|
|
116
|
+
{
|
|
117
|
+
name: 'check_status',
|
|
118
|
+
description:
|
|
119
|
+
'Returns the running mall-scraper-mcp version, auth mode, and Worker ' +
|
|
120
|
+
'connectivity. Use this BEFORE running scrape_directory to verify the ' +
|
|
121
|
+
'tool is wired up correctly — confirms version, that the OpenAI proxy ' +
|
|
122
|
+
'is reachable, and that the shared secret is valid.',
|
|
123
|
+
inputSchema: { type: 'object', properties: {} },
|
|
124
|
+
},
|
|
116
125
|
{
|
|
117
126
|
name: 'validate_image_url',
|
|
118
127
|
description:
|
|
@@ -129,7 +138,7 @@ const TOOLS = [
|
|
|
129
138
|
},
|
|
130
139
|
];
|
|
131
140
|
|
|
132
|
-
const PACKAGE_VERSION = '0.1.3';
|
|
141
|
+
const PACKAGE_VERSION = '0.1.4';
|
|
133
142
|
|
|
134
143
|
const server = new Server(
|
|
135
144
|
{ name: 'mall-scraper-mcp', version: PACKAGE_VERSION },
|
|
@@ -145,6 +154,7 @@ server.setRequestHandler(CallToolRequestSchema, async (req) => {
|
|
|
145
154
|
case 'scrape_directory': return await handleScrapeDirectory(args || {});
|
|
146
155
|
case 'get_store_hours': return await handleGetStoreHours(args || {});
|
|
147
156
|
case 'validate_image_url': return await handleValidateImageUrl(args || {});
|
|
157
|
+
case 'check_status': return await handleCheckStatus(args || {});
|
|
148
158
|
default:
|
|
149
159
|
return errorResult(`Unknown tool: ${name}`);
|
|
150
160
|
}
|
|
@@ -225,6 +235,9 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren
|
|
|
225
235
|
stores_extracted: stores.length,
|
|
226
236
|
hours_layer_breakdown: bySource,
|
|
227
237
|
llm_usage: usage,
|
|
238
|
+
llm_failed: usage.errors > 0
|
|
239
|
+
? `⚠ ${usage.errors} LLM calls failed (description/categories/etc. fields will be empty). Last error: ${usage.lastError}. Run check_status to diagnose.`
|
|
240
|
+
: null,
|
|
228
241
|
written_files: writtenPaths,
|
|
229
242
|
write_error: writeError,
|
|
230
243
|
auth_mode: creds.mode,
|
|
@@ -321,6 +334,86 @@ async function handleGetStoreHours({ store_url, mall_root_url }) {
|
|
|
321
334
|
}
|
|
322
335
|
}
|
|
323
336
|
|
|
337
|
+
async function handleCheckStatus() {
|
|
338
|
+
const creds = describeCredentials();
|
|
339
|
+
const status = {
|
|
340
|
+
mcp_version: PACKAGE_VERSION,
|
|
341
|
+
node_version: process.version,
|
|
342
|
+
auth_mode: creds.mode,
|
|
343
|
+
auth_endpoint: creds.endpoint,
|
|
344
|
+
worker_reachable: null,
|
|
345
|
+
worker_health: null,
|
|
346
|
+
worker_auth_ok: null,
|
|
347
|
+
};
|
|
348
|
+
|
|
349
|
+
// If we're in proxy mode, ping the Worker /health endpoint and probe auth.
|
|
350
|
+
if (creds.mode === 'proxy' && creds.endpoint) {
|
|
351
|
+
try {
|
|
352
|
+
const healthUrl = creds.endpoint.replace(/\/+$/, '') + '/health';
|
|
353
|
+
const health = await new Promise((resolve) => {
|
|
354
|
+
const req = https.get(healthUrl, { timeout: 6000 }, (res) => {
|
|
355
|
+
let body = '';
|
|
356
|
+
res.on('data', (c) => { body += c; });
|
|
357
|
+
res.on('end', () => resolve({ status: res.statusCode, body }));
|
|
358
|
+
});
|
|
359
|
+
req.on('error', () => resolve(null));
|
|
360
|
+
req.on('timeout', () => { req.destroy(); resolve(null); });
|
|
361
|
+
});
|
|
362
|
+
if (health) {
|
|
363
|
+
status.worker_reachable = true;
|
|
364
|
+
status.worker_health = health.body.slice(0, 200);
|
|
365
|
+
} else {
|
|
366
|
+
status.worker_reachable = false;
|
|
367
|
+
}
|
|
368
|
+
|
|
369
|
+
// Probe auth: a tiny GET to /v1/models with the shared secret.
|
|
370
|
+
// OpenAI's /v1/models is a cheap, no-tokens endpoint that proves the
|
|
371
|
+
// Worker is forwarding and the key works.
|
|
372
|
+
const token = process.env.MALL_SCRAPER_TOKEN || '';
|
|
373
|
+
const modelsUrl = creds.endpoint.replace(/\/+$/, '') + '/v1/models';
|
|
374
|
+
const auth = await new Promise((resolve) => {
|
|
375
|
+
const req = https.get(modelsUrl, {
|
|
376
|
+
timeout: 8000,
|
|
377
|
+
headers: { 'X-Mall-Scraper-Token': token },
|
|
378
|
+
}, (res) => {
|
|
379
|
+
let body = '';
|
|
380
|
+
res.on('data', (c) => { body += c; });
|
|
381
|
+
res.on('end', () => resolve({ status: res.statusCode, body }));
|
|
382
|
+
});
|
|
383
|
+
req.on('error', () => resolve(null));
|
|
384
|
+
req.on('timeout', () => { req.destroy(); resolve(null); });
|
|
385
|
+
});
|
|
386
|
+
if (auth) {
|
|
387
|
+
status.worker_auth_ok = auth.status === 200;
|
|
388
|
+
if (auth.status !== 200) {
|
|
389
|
+
status.worker_auth_error = auth.body.slice(0, 200);
|
|
390
|
+
}
|
|
391
|
+
}
|
|
392
|
+
} catch (err) {
|
|
393
|
+
status.worker_probe_error = err.message;
|
|
394
|
+
}
|
|
395
|
+
}
|
|
396
|
+
|
|
397
|
+
// Verdict line for the user
|
|
398
|
+
let verdict;
|
|
399
|
+
if (creds.mode === 'none') {
|
|
400
|
+
verdict = '⚠ No credentials configured. Set MALL_SCRAPER_PROXY_URL+MALL_SCRAPER_TOKEN or OPENAI_API_KEY.';
|
|
401
|
+
} else if (creds.mode === 'proxy') {
|
|
402
|
+
if (status.worker_reachable && status.worker_auth_ok) verdict = '✅ All good — version, Worker, and auth all working.';
|
|
403
|
+
else if (!status.worker_reachable) verdict = '⚠ Worker is unreachable. Check MALL_SCRAPER_PROXY_URL.';
|
|
404
|
+
else if (!status.worker_auth_ok) verdict = '⚠ Worker is reachable but rejected the token. MALL_SCRAPER_TOKEN does not match the SHARED_SECRET on the Worker.';
|
|
405
|
+
else verdict = '⚠ Partial — see fields below.';
|
|
406
|
+
} else {
|
|
407
|
+
verdict = `✅ Direct mode (using OPENAI_API_KEY env var). Version ${PACKAGE_VERSION}.`;
|
|
408
|
+
}
|
|
409
|
+
|
|
410
|
+
return {
|
|
411
|
+
content: [
|
|
412
|
+
{ type: 'text', text: verdict + '\n\n' + JSON.stringify(status, null, 2) },
|
|
413
|
+
],
|
|
414
|
+
};
|
|
415
|
+
}
|
|
416
|
+
|
|
324
417
|
function handleValidateImageUrl({ url }) {
|
|
325
418
|
if (!url) return Promise.resolve(errorResult('url is required'));
|
|
326
419
|
return new Promise((resolve) => {
|
package/src/storeExtractor.js
CHANGED
|
@@ -128,6 +128,8 @@ class StoreExtractor {
|
|
|
128
128
|
this.totalTokensOutput = 0;
|
|
129
129
|
this.totalCost = 0;
|
|
130
130
|
this.extractionCount = 0;
|
|
131
|
+
this.errorCount = 0;
|
|
132
|
+
this.lastError = null;
|
|
131
133
|
}
|
|
132
134
|
|
|
133
135
|
async extract(pageData, hoursCanonical) {
|
|
@@ -152,6 +154,8 @@ class StoreExtractor {
|
|
|
152
154
|
this._trackUsage(resp);
|
|
153
155
|
raw = JSON.parse(resp.choices[0].message.content);
|
|
154
156
|
} catch (err) {
|
|
157
|
+
this.errorCount++;
|
|
158
|
+
this.lastError = err.message;
|
|
155
159
|
if (this.logger) this.logger.warn(` ⚠ Store LLM extract failed: ${err.message}`);
|
|
156
160
|
return { fields: {}, confidence: 0 };
|
|
157
161
|
}
|
|
@@ -218,6 +222,8 @@ class StoreExtractor {
|
|
|
218
222
|
return {
|
|
219
223
|
model: this.model,
|
|
220
224
|
extractions: this.extractionCount,
|
|
225
|
+
errors: this.errorCount,
|
|
226
|
+
lastError: this.lastError,
|
|
221
227
|
totalInputTokens: this.totalTokensInput,
|
|
222
228
|
totalOutputTokens: this.totalTokensOutput,
|
|
223
229
|
estimatedCost: `$${this.totalCost.toFixed(4)}`,
|