mallmaverick-store-scraper 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mallmaverick-store-scraper",
3
- "version": "0.1.2",
3
+ "version": "0.1.4",
4
4
  "description": "MCP server + CLI for scraping shopping mall store directories. Hours-first layered pipeline + image classification.",
5
5
  "main": "src/main.js",
6
6
  "type": "commonjs",
package/src/mcp-server.js CHANGED
@@ -60,11 +60,11 @@ const TOOLS = [
60
60
  '(name, hours, phone, logo, brand image, categories, etc.). Use this ' +
61
61
  'when the user wants to capture a directory like ' +
62
62
  'https://grasslands.ca/store-directory/.\n\n' +
63
- 'AFTER RUNNING THIS TOOL: paste the full CSV content (from the ' +
64
- '"--- CSV ---" block of the response) into your reply inside a fenced ' +
65
- 'code block so the user can copy it directly into their CMS. ' +
66
- 'Also state the saved file path and a one-line summary of how many ' +
67
- 'stores were extracted. Do NOT summarize away the CSV — show it in full.',
63
+ 'AFTER RUNNING THIS TOOL: give a short summary how many stores were ' +
64
+ 'extracted, hours-layer breakdown, and the saved file path. The CSV is ' +
65
+ 'attached as a resource in the tool response (the user can download/' +
66
+ 'preview it from there) do NOT paste the CSV text into your reply. ' +
67
+ 'Keep your text reply short.',
68
68
  inputSchema: {
69
69
  type: 'object',
70
70
  properties: {
@@ -113,6 +113,15 @@ const TOOLS = [
113
113
  required: ['store_url'],
114
114
  },
115
115
  },
116
+ {
117
+ name: 'check_status',
118
+ description:
119
+ 'Returns the running mall-scraper-mcp version, auth mode, and Worker ' +
120
+ 'connectivity. Use this BEFORE running scrape_directory to verify the ' +
121
+ 'tool is wired up correctly — confirms version, that the OpenAI proxy ' +
122
+ 'is reachable, and that the shared secret is valid.',
123
+ inputSchema: { type: 'object', properties: {} },
124
+ },
116
125
  {
117
126
  name: 'validate_image_url',
118
127
  description:
@@ -129,7 +138,7 @@ const TOOLS = [
129
138
  },
130
139
  ];
131
140
 
132
- const PACKAGE_VERSION = '0.1.2';
141
+ const PACKAGE_VERSION = '0.1.4';
133
142
 
134
143
  const server = new Server(
135
144
  { name: 'mall-scraper-mcp', version: PACKAGE_VERSION },
@@ -145,6 +154,7 @@ server.setRequestHandler(CallToolRequestSchema, async (req) => {
145
154
  case 'scrape_directory': return await handleScrapeDirectory(args || {});
146
155
  case 'get_store_hours': return await handleGetStoreHours(args || {});
147
156
  case 'validate_image_url': return await handleValidateImageUrl(args || {});
157
+ case 'check_status': return await handleCheckStatus(args || {});
148
158
  default:
149
159
  return errorResult(`Unknown tool: ${name}`);
150
160
  }
@@ -225,26 +235,53 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren
225
235
  stores_extracted: stores.length,
226
236
  hours_layer_breakdown: bySource,
227
237
  llm_usage: usage,
238
+ llm_failed: usage.errors > 0
239
+ ? `⚠ ${usage.errors} LLM calls failed (description/categories/etc. fields will be empty). Last error: ${usage.lastError}. Run check_status to diagnose.`
240
+ : null,
228
241
  written_files: writtenPaths,
229
242
  write_error: writeError,
230
243
  auth_mode: creds.mode,
231
244
  mcp_version: PACKAGE_VERSION,
232
245
  };
233
246
 
234
- // Order matters — Claude is more likely to surface the first content
235
- // blocks. Lead with the CSV so it can't be summarized away.
247
+ // Build a short brief + return the CSV as an embedded resource so
248
+ // Claude Desktop can render it as an attachment card instead of inline
249
+ // text. Falls back to inline-rendering if the client doesn't support
250
+ // resources, but most clients (including Claude Desktop) do.
251
+ const host = new URL(directory_url).hostname.replace(/^www\./, '');
252
+ const csvFilename = writtenPaths
253
+ ? path.basename(writtenPaths.csv)
254
+ : `stores_v5_${host}.csv`;
255
+ const csvUri = writtenPaths
256
+ ? `file://${writtenPaths.csv}`
257
+ : `file:///tmp/${csvFilename}`;
258
+
259
+ const brief =
260
+ `mall-scraper-mcp v${PACKAGE_VERSION}\n` +
261
+ `Scraped ${stores.length} store${stores.length === 1 ? '' : 's'} from ${host}.\n` +
262
+ `Hours-layer breakdown: ${Object.entries(bySource).map(([k, v]) => `${k}=${v}`).join(', ')}.\n` +
263
+ (writtenPaths
264
+ ? `Saved to: ${writtenPaths.csv}`
265
+ : `⚠ disk write failed (${writeError}); CSV is in the attached resource only.`);
266
+
236
267
  return {
237
268
  content: [
269
+ { type: 'text', text: brief },
270
+ {
271
+ type: 'resource',
272
+ resource: {
273
+ uri: csvUri,
274
+ name: csvFilename,
275
+ mimeType: 'text/csv',
276
+ text: csvText,
277
+ },
278
+ },
279
+ // Keep the JSON summary at the end for any debugging the user asks for,
280
+ // but it's far enough down that it doesn't dominate the chat.
238
281
  {
239
282
  type: 'text',
240
- text:
241
- `mall-scraper-mcp v${PACKAGE_VERSION}\n` +
242
- 'CSV ready — paste the block below into your CMS. ' +
243
- `Also saved to: ${writtenPaths ? writtenPaths.csv : '(disk write failed; CSV is inline only)'}.\n\n` +
244
- '```csv\n' + csvText + '\n```',
283
+ text: '\n--- Run summary ---\n' + JSON.stringify(summary, null, 2),
245
284
  },
246
- { type: 'text', text: '\n--- Run summary ---\n' + JSON.stringify(summary, null, 2) },
247
- { type: 'text', text: '\n--- Stores (JSON for debugging) ---\n' + JSON.stringify(stores, null, 2) },
248
285
  ],
249
286
  };
250
287
  } finally {
@@ -297,6 +334,86 @@ async function handleGetStoreHours({ store_url, mall_root_url }) {
297
334
  }
298
335
  }
299
336
 
337
+ async function handleCheckStatus() {
338
+ const creds = describeCredentials();
339
+ const status = {
340
+ mcp_version: PACKAGE_VERSION,
341
+ node_version: process.version,
342
+ auth_mode: creds.mode,
343
+ auth_endpoint: creds.endpoint,
344
+ worker_reachable: null,
345
+ worker_health: null,
346
+ worker_auth_ok: null,
347
+ };
348
+
349
+ // If we're in proxy mode, ping the Worker /health endpoint and probe auth.
350
+ if (creds.mode === 'proxy' && creds.endpoint) {
351
+ try {
352
+ const healthUrl = creds.endpoint.replace(/\/+$/, '') + '/health';
353
+ const health = await new Promise((resolve) => {
354
+ const req = https.get(healthUrl, { timeout: 6000 }, (res) => {
355
+ let body = '';
356
+ res.on('data', (c) => { body += c; });
357
+ res.on('end', () => resolve({ status: res.statusCode, body }));
358
+ });
359
+ req.on('error', () => resolve(null));
360
+ req.on('timeout', () => { req.destroy(); resolve(null); });
361
+ });
362
+ if (health) {
363
+ status.worker_reachable = true;
364
+ status.worker_health = health.body.slice(0, 200);
365
+ } else {
366
+ status.worker_reachable = false;
367
+ }
368
+
369
+ // Probe auth: a tiny GET to /v1/models with the shared secret.
370
+ // OpenAI's /v1/models is a cheap, no-tokens endpoint that proves the
371
+ // Worker is forwarding and the key works.
372
+ const token = process.env.MALL_SCRAPER_TOKEN || '';
373
+ const modelsUrl = creds.endpoint.replace(/\/+$/, '') + '/v1/models';
374
+ const auth = await new Promise((resolve) => {
375
+ const req = https.get(modelsUrl, {
376
+ timeout: 8000,
377
+ headers: { 'X-Mall-Scraper-Token': token },
378
+ }, (res) => {
379
+ let body = '';
380
+ res.on('data', (c) => { body += c; });
381
+ res.on('end', () => resolve({ status: res.statusCode, body }));
382
+ });
383
+ req.on('error', () => resolve(null));
384
+ req.on('timeout', () => { req.destroy(); resolve(null); });
385
+ });
386
+ if (auth) {
387
+ status.worker_auth_ok = auth.status === 200;
388
+ if (auth.status !== 200) {
389
+ status.worker_auth_error = auth.body.slice(0, 200);
390
+ }
391
+ }
392
+ } catch (err) {
393
+ status.worker_probe_error = err.message;
394
+ }
395
+ }
396
+
397
+ // Verdict line for the user
398
+ let verdict;
399
+ if (creds.mode === 'none') {
400
+ verdict = '⚠ No credentials configured. Set MALL_SCRAPER_PROXY_URL+MALL_SCRAPER_TOKEN or OPENAI_API_KEY.';
401
+ } else if (creds.mode === 'proxy') {
402
+ if (status.worker_reachable && status.worker_auth_ok) verdict = '✅ All good — version, Worker, and auth all working.';
403
+ else if (!status.worker_reachable) verdict = '⚠ Worker is unreachable. Check MALL_SCRAPER_PROXY_URL.';
404
+ else if (!status.worker_auth_ok) verdict = '⚠ Worker is reachable but rejected the token. MALL_SCRAPER_TOKEN does not match the SHARED_SECRET on the Worker.';
405
+ else verdict = '⚠ Partial — see fields below.';
406
+ } else {
407
+ verdict = `✅ Direct mode (using OPENAI_API_KEY env var). Version ${PACKAGE_VERSION}.`;
408
+ }
409
+
410
+ return {
411
+ content: [
412
+ { type: 'text', text: verdict + '\n\n' + JSON.stringify(status, null, 2) },
413
+ ],
414
+ };
415
+ }
416
+
300
417
  function handleValidateImageUrl({ url }) {
301
418
  if (!url) return Promise.resolve(errorResult('url is required'));
302
419
  return new Promise((resolve) => {
@@ -128,6 +128,8 @@ class StoreExtractor {
128
128
  this.totalTokensOutput = 0;
129
129
  this.totalCost = 0;
130
130
  this.extractionCount = 0;
131
+ this.errorCount = 0;
132
+ this.lastError = null;
131
133
  }
132
134
 
133
135
  async extract(pageData, hoursCanonical) {
@@ -152,6 +154,8 @@ class StoreExtractor {
152
154
  this._trackUsage(resp);
153
155
  raw = JSON.parse(resp.choices[0].message.content);
154
156
  } catch (err) {
157
+ this.errorCount++;
158
+ this.lastError = err.message;
155
159
  if (this.logger) this.logger.warn(` ⚠ Store LLM extract failed: ${err.message}`);
156
160
  return { fields: {}, confidence: 0 };
157
161
  }
@@ -218,6 +222,8 @@ class StoreExtractor {
218
222
  return {
219
223
  model: this.model,
220
224
  extractions: this.extractionCount,
225
+ errors: this.errorCount,
226
+ lastError: this.lastError,
221
227
  totalInputTokens: this.totalTokensInput,
222
228
  totalOutputTokens: this.totalTokensOutput,
223
229
  estimatedCost: `$${this.totalCost.toFixed(4)}`,