npm - mallmaverick-store-scraper - Versions diffs - 0.1.2 → 0.1.4 - Mend

mallmaverick-store-scraper 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "mallmaverick-store-scraper",
-  "version": "0.1.2",
+  "version": "0.1.4",
   "description": "MCP server + CLI for scraping shopping mall store directories. Hours-first layered pipeline + image classification.",
   "main": "src/main.js",
   "type": "commonjs",

package/src/mcp-server.js CHANGED Viewed

@@ -60,11 +60,11 @@ const TOOLS = [
       '(name, hours, phone, logo, brand image, categories, etc.). Use this ' +
       'when the user wants to capture a directory like ' +
       'https://grasslands.ca/store-directory/.\n\n' +
-      'AFTER RUNNING THIS TOOL: paste the full CSV content (from the ' +
-      '"--- CSV ---" block of the response) into your reply inside a fenced ' +
-      'code block so the user can copy it directly into their CMS. ' +
-      'Also state the saved file path and a one-line summary of how many ' +
-      'stores were extracted. Do NOT summarize away the CSV — show it in full.',
+      'AFTER RUNNING THIS TOOL: give a short summary — how many stores were ' +
+      'extracted, hours-layer breakdown, and the saved file path. The CSV is ' +
+      'attached as a resource in the tool response (the user can download/' +
+      'preview it from there) — do NOT paste the CSV text into your reply. ' +
+      'Keep your text reply short.',
     inputSchema: {
       type: 'object',
       properties: {
@@ -113,6 +113,15 @@ const TOOLS = [
       required: ['store_url'],
     },
   },
+  {
+    name: 'check_status',
+    description:
+      'Returns the running mall-scraper-mcp version, auth mode, and Worker ' +
+      'connectivity. Use this BEFORE running scrape_directory to verify the ' +
+      'tool is wired up correctly — confirms version, that the OpenAI proxy ' +
+      'is reachable, and that the shared secret is valid.',
+    inputSchema: { type: 'object', properties: {} },
+  },
   {
     name: 'validate_image_url',
     description:
@@ -129,7 +138,7 @@ const TOOLS = [
   },
 ];
-const PACKAGE_VERSION = '0.1.2';
+const PACKAGE_VERSION = '0.1.4';
 const server = new Server(
   { name: 'mall-scraper-mcp', version: PACKAGE_VERSION },
@@ -145,6 +154,7 @@ server.setRequestHandler(CallToolRequestSchema, async (req) => {
       case 'scrape_directory': return await handleScrapeDirectory(args || {});
       case 'get_store_hours':  return await handleGetStoreHours(args || {});
       case 'validate_image_url': return await handleValidateImageUrl(args || {});
+      case 'check_status':     return await handleCheckStatus(args || {});
       default:
         return errorResult(`Unknown tool: ${name}`);
     }
@@ -225,26 +235,53 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren
       stores_extracted: stores.length,
       hours_layer_breakdown: bySource,
       llm_usage: usage,
+      llm_failed: usage.errors > 0
+        ? `⚠ ${usage.errors} LLM calls failed (description/categories/etc. fields will be empty). Last error: ${usage.lastError}. Run check_status to diagnose.`
+        : null,
       written_files: writtenPaths,
       write_error: writeError,
       auth_mode: creds.mode,
       mcp_version: PACKAGE_VERSION,
     };
-    // Order matters — Claude is more likely to surface the first content
-    // blocks. Lead with the CSV so it can't be summarized away.
+    // Build a short brief + return the CSV as an embedded resource so
+    // Claude Desktop can render it as an attachment card instead of inline
+    // text. Falls back to inline-rendering if the client doesn't support
+    // resources, but most clients (including Claude Desktop) do.
+    const host = new URL(directory_url).hostname.replace(/^www\./, '');
+    const csvFilename = writtenPaths
+      ? path.basename(writtenPaths.csv)
+      : `stores_v5_${host}.csv`;
+    const csvUri = writtenPaths
+      ? `file://${writtenPaths.csv}`
+      : `file:///tmp/${csvFilename}`;
+    const brief =
+      `mall-scraper-mcp v${PACKAGE_VERSION}\n` +
+      `Scraped ${stores.length} store${stores.length === 1 ? '' : 's'} from ${host}.\n` +
+      `Hours-layer breakdown: ${Object.entries(bySource).map(([k, v]) => `${k}=${v}`).join(', ')}.\n` +
+      (writtenPaths
+        ? `Saved to: ${writtenPaths.csv}`
+        : `⚠ disk write failed (${writeError}); CSV is in the attached resource only.`);
     return {
       content: [
+        { type: 'text', text: brief },
+        {
+          type: 'resource',
+          resource: {
+            uri: csvUri,
+            name: csvFilename,
+            mimeType: 'text/csv',
+            text: csvText,
+          },
+        },
+        // Keep the JSON summary at the end for any debugging the user asks for,
+        // but it's far enough down that it doesn't dominate the chat.
         {
           type: 'text',
-          text:
-            `mall-scraper-mcp v${PACKAGE_VERSION}\n` +
-            'CSV ready — paste the block below into your CMS. ' +
-            `Also saved to: ${writtenPaths ? writtenPaths.csv : '(disk write failed; CSV is inline only)'}.\n\n` +
-            '```csv\n' + csvText + '\n```',
+          text: '\n--- Run summary ---\n' + JSON.stringify(summary, null, 2),
         },
-        { type: 'text', text: '\n--- Run summary ---\n' + JSON.stringify(summary, null, 2) },
-        { type: 'text', text: '\n--- Stores (JSON for debugging) ---\n' + JSON.stringify(stores, null, 2) },
       ],
     };
   } finally {
@@ -297,6 +334,86 @@ async function handleGetStoreHours({ store_url, mall_root_url }) {
   }
 }
+async function handleCheckStatus() {
+  const creds = describeCredentials();
+  const status = {
+    mcp_version: PACKAGE_VERSION,
+    node_version: process.version,
+    auth_mode: creds.mode,
+    auth_endpoint: creds.endpoint,
+    worker_reachable: null,
+    worker_health: null,
+    worker_auth_ok: null,
+  };
+  // If we're in proxy mode, ping the Worker /health endpoint and probe auth.
+  if (creds.mode === 'proxy' && creds.endpoint) {
+    try {
+      const healthUrl = creds.endpoint.replace(/\/+$/, '') + '/health';
+      const health = await new Promise((resolve) => {
+        const req = https.get(healthUrl, { timeout: 6000 }, (res) => {
+          let body = '';
+          res.on('data', (c) => { body += c; });
+          res.on('end', () => resolve({ status: res.statusCode, body }));
+        });
+        req.on('error', () => resolve(null));
+        req.on('timeout', () => { req.destroy(); resolve(null); });
+      });
+      if (health) {
+        status.worker_reachable = true;
+        status.worker_health = health.body.slice(0, 200);
+      } else {
+        status.worker_reachable = false;
+      }
+      // Probe auth: a tiny POST to /v1/models with the shared secret.
+      // OpenAI's /v1/models is a cheap, no-tokens endpoint that proves the
+      // Worker is forwarding and the key works.
+      const token = process.env.MALL_SCRAPER_TOKEN || '';
+      const modelsUrl = creds.endpoint.replace(/\/+$/, '') + '/v1/models';
+      const auth = await new Promise((resolve) => {
+        const req = https.get(modelsUrl, {
+          timeout: 8000,
+          headers: { 'X-Mall-Scraper-Token': token },
+        }, (res) => {
+          let body = '';
+          res.on('data', (c) => { body += c; });
+          res.on('end', () => resolve({ status: res.statusCode, body }));
+        });
+        req.on('error', () => resolve(null));
+        req.on('timeout', () => { req.destroy(); resolve(null); });
+      });
+      if (auth) {
+        status.worker_auth_ok = auth.status === 200;
+        if (auth.status !== 200) {
+          status.worker_auth_error = auth.body.slice(0, 200);
+        }
+      }
+    } catch (err) {
+      status.worker_probe_error = err.message;
+    }
+  }
+  // Verdict line for the user
+  let verdict;
+  if (creds.mode === 'none') {
+    verdict = '⚠ No credentials configured. Set MALL_SCRAPER_PROXY_URL+MALL_SCRAPER_TOKEN or OPENAI_API_KEY.';
+  } else if (creds.mode === 'proxy') {
+    if (status.worker_reachable && status.worker_auth_ok) verdict = '✅ All good — version, Worker, and auth all working.';
+    else if (!status.worker_reachable) verdict = '⚠ Worker is unreachable. Check MALL_SCRAPER_PROXY_URL.';
+    else if (!status.worker_auth_ok) verdict = '⚠ Worker is reachable but rejected the token. MALL_SCRAPER_TOKEN does not match the SHARED_SECRET on the Worker.';
+    else verdict = '⚠ Partial — see fields below.';
+  } else {
+    verdict = `✅ Direct mode (using OPENAI_API_KEY env var). Version ${PACKAGE_VERSION}.`;
+  }
+  return {
+    content: [
+      { type: 'text', text: verdict + '\n\n' + JSON.stringify(status, null, 2) },
+    ],
+  };
+}
 function handleValidateImageUrl({ url }) {
   if (!url) return Promise.resolve(errorResult('url is required'));
   return new Promise((resolve) => {

package/src/storeExtractor.js CHANGED Viewed

@@ -128,6 +128,8 @@ class StoreExtractor {
     this.totalTokensOutput = 0;
     this.totalCost = 0;
     this.extractionCount = 0;
+    this.errorCount = 0;
+    this.lastError = null;
   }
   async extract(pageData, hoursCanonical) {
@@ -152,6 +154,8 @@ class StoreExtractor {
       this._trackUsage(resp);
       raw = JSON.parse(resp.choices[0].message.content);
     } catch (err) {
+      this.errorCount++;
+      this.lastError = err.message;
       if (this.logger) this.logger.warn(`   ⚠ Store LLM extract failed: ${err.message}`);
       return { fields: {}, confidence: 0 };
     }
@@ -218,6 +222,8 @@ class StoreExtractor {
     return {
       model: this.model,
       extractions: this.extractionCount,
+      errors: this.errorCount,
+      lastError: this.lastError,
       totalInputTokens: this.totalTokensInput,
       totalOutputTokens: this.totalTokensOutput,
       estimatedCost: `$${this.totalCost.toFixed(4)}`,