mallmaverick-store-scraper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +225 -0
- package/package.json +41 -0
- package/src/brandSiteFallback.js +272 -0
- package/src/browser.js +234 -0
- package/src/deterministic.js +235 -0
- package/src/discovery.js +298 -0
- package/src/externalFollow.js +89 -0
- package/src/hoursParser.js +313 -0
- package/src/hoursPipeline.js +151 -0
- package/src/imageExtraction.js +331 -0
- package/src/llmExtract.js +99 -0
- package/src/logoExtraction.js +130 -0
- package/src/main.js +330 -0
- package/src/mallContext.js +201 -0
- package/src/mcp-server.js +425 -0
- package/src/openai-proxy.js +52 -0
- package/src/output.js +21 -0
- package/src/retryStrategy.js +60 -0
- package/src/storeExtractor.js +239 -0
- package/src/storeModel.js +147 -0
package/src/mcp-server.js
ADDED

@@ -0,0 +1,425 @@
#!/usr/bin/env node
'use strict';

/**
 * MCP server for mall-scraper-mcp.
 *
 * Exposes scraping tools via the Model Context Protocol so Claude Desktop /
 * Claude Code can drive directory scrapes conversationally. Communicates over
 * stdio.
 *
 * Tools:
 *   - scrape_directory   : full per-store extraction across a directory listing.
 *   - get_store_hours    : run just the hours pipeline on a single store URL.
 *   - validate_image_url : HEAD-check that a URL returns a real image.
 *
 * Env vars (auth):
 *   - MALL_SCRAPER_PROXY_URL + MALL_SCRAPER_TOKEN (recommended for shared use)
 *   - OPENAI_API_KEY (direct OpenAI, dev fallback)
 */

require('dotenv').config();
const fs = require('fs');
const path = require('path');
const { URL } = require('url');
const http = require('http');
const https = require('https');

const { Server } = require('@modelcontextprotocol/sdk/server/index.js');
const { StdioServerTransport } = require('@modelcontextprotocol/sdk/server/stdio.js');
const {
  CallToolRequestSchema,
  ListToolsRequestSchema,
} = require('@modelcontextprotocol/sdk/types.js');

const { launchBrowser, newPage, loadPageWithStrategy, attachXhrInterceptor } = require('./browser');
const { discoverStores } = require('./discovery');
const { getMallContext } = require('./mallContext');
const { extractHours } = require('./hoursPipeline');
const { classifyImages, pickImages } = require('./imageExtraction');
const { fetchBrandLogo } = require('./brandSiteFallback');
const { extractPhone, extractSocials, extractWebsite, detectStatusFlags } = require('./deterministic');
const { StoreExtractor } = require('./storeExtractor');
const { mergeExtracted, storesToCSV } = require('./storeModel');
const { createOpenAIClient, describeCredentials } = require('./openai-proxy');

// --- silent logger (anything to stdout would corrupt MCP framing) ---
const logs = [];
const logger = {
  info: (...a) => logs.push(`[info] ${a.join(' ')}`),
  warn: (...a) => logs.push(`[warn] ${a.join(' ')}`),
  error: (...a) => logs.push(`[error] ${a.join(' ')}`),
};

const TOOLS = [
  {
    name: 'scrape_directory',
    description:
      'Scrape a shopping-mall store directory and return per-store records ' +
      '(name, hours, phone, logo, brand image, categories, etc.). Use this ' +
      'when the user wants to capture a directory like ' +
      'https://grasslands.ca/store-directory/.',
    inputSchema: {
      type: 'object',
      properties: {
        directory_url: {
          type: 'string',
          description: 'Full URL to the directory listing page.',
        },
        max_stores: {
          type: 'number',
          description: 'Max number of stores to scrape (0 = all). Default 10.',
          default: 10,
        },
        concurrency: {
          type: 'number',
          description: 'Parallel pages. Default 2; max 5.',
          default: 2,
        },
        model: {
          type: 'string',
          description: 'OpenAI model. Default gpt-5.4-mini.',
          default: 'gpt-5.4-mini',
        },
        write_csv: {
          type: 'boolean',
          description: 'Also write a CSV + JSON to extracted_stores/. Default true.',
          default: true,
        },
      },
      required: ['directory_url'],
    },
  },
  {
    name: 'get_store_hours',
    description:
      'Run only the layered hours pipeline on a single store page. Cheap, ' +
      'fast — useful for debugging when a store\'s hours look wrong.',
    inputSchema: {
      type: 'object',
      properties: {
        store_url: { type: 'string', description: 'Store detail page URL.' },
        mall_root_url: {
          type: 'string',
          description: 'Optional mall homepage URL — enables sync-with-mall-hours layer.',
        },
      },
      required: ['store_url'],
    },
  },
  {
    name: 'validate_image_url',
    description:
      'HEAD-check a URL: returns whether it serves a real image, with ' +
      'content-type, size, and final URL after redirects. Use when a logo ' +
      'isn\'t loading in your CMS — confirms whether the URL itself is bad.',
    inputSchema: {
      type: 'object',
      properties: {
        url: { type: 'string', description: 'URL to check.' },
      },
      required: ['url'],
    },
  },
];

const server = new Server(
  { name: 'mall-scraper-mcp', version: '0.1.0' },
  { capabilities: { tools: {} } }
);

server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: TOOLS }));

server.setRequestHandler(CallToolRequestSchema, async (req) => {
  const { name, arguments: args } = req.params;
  try {
    switch (name) {
      case 'scrape_directory': return await handleScrapeDirectory(args || {});
      case 'get_store_hours': return await handleGetStoreHours(args || {});
      case 'validate_image_url': return await handleValidateImageUrl(args || {});
      default:
        return errorResult(`Unknown tool: ${name}`);
    }
  } catch (err) {
    return errorResult(`Tool ${name} failed: ${err.message}`);
  }
});

// ---------------------------------------------------------------------------
// Tool implementations
// ---------------------------------------------------------------------------

async function handleScrapeDirectory({
  directory_url,
  max_stores = 10,
  concurrency = 2,
  model = 'gpt-5.4-mini',
  write_csv = true,
}) {
  if (!directory_url) return errorResult('directory_url is required');
  const creds = describeCredentials();
  if (creds.mode === 'none') {
    return errorResult(
      'No OpenAI credentials available. Set MALL_SCRAPER_PROXY_URL + ' +
      'MALL_SCRAPER_TOKEN, or OPENAI_API_KEY.'
    );
  }

  const client = createOpenAIClient();
  const browser = await launchBrowser({ headless: true });
  const extractor = new StoreExtractor({ client, model, useVision: false, logger });
  const conc = Math.min(5, Math.max(1, parseInt(concurrency, 10) || 2));
  const max = Math.max(0, parseInt(max_stores, 10) || 0);

  try {
    const mallRoot = new URL(directory_url).origin;
    const mallContext = await getMallContext(browser, mallRoot);
    const { storeUrls: allUrls, logoMap } = await discoverStores(browser, directory_url, logger);
    const storeCardLogos = Array.from(logoMap.values());
    const urls = max > 0 ? allUrls.slice(0, max) : allUrls;

    const stores = [];
    let mmId = 1;
    // Bounded concurrency via p-limit. Tasks are queued in URL order (so mm_id
    // assignment stays stable) and results are re-sorted by mm_id below.
    const pLimit = require('p-limit')(conc);
    const tasks = urls.map((url) => pLimit(async () => {
      const myId = mmId++;
      const directoryLogoUrl = logoMap.get(url.replace(/\/+$/, '').toLowerCase()) || null;
      const store = await scrapeOneStore({
        url, mmId: myId, browser, client, model, extractor,
        directoryLogoUrl, mallContext, mallOrigin: mallRoot, storeCardLogos,
      });
      if (store) stores.push(store);
      return store;
    }));
    await Promise.all(tasks);
    stores.sort((a, b) => a.mm_id - b.mm_id);

    let writtenPaths = null;
    if (write_csv) {
      writtenPaths = writeResults(directory_url, stores);
    }

    const bySource = {};
    for (const s of stores) {
      const k = s.hours_source || '(none)';
      bySource[k] = (bySource[k] || 0) + 1;
    }
    const usage = extractor.getUsageSummary();

    const summary = {
      directory_url,
      stores_extracted: stores.length,
      hours_layer_breakdown: bySource,
      llm_usage: usage,
      written_files: writtenPaths,
      auth_mode: creds.mode,
    };

    return {
      content: [
        { type: 'text', text: JSON.stringify(summary, null, 2) },
        { type: 'text', text: '\nStores:\n' + JSON.stringify(stores, null, 2) },
      ],
    };
  } finally {
    try { await browser.close(); } catch (_) {}
  }
}

async function handleGetStoreHours({ store_url, mall_root_url }) {
  if (!store_url) return errorResult('store_url is required');
  const browser = await launchBrowser({ headless: true });
  try {
    const mallContext = mall_root_url
      ? await getMallContext(browser, mall_root_url)
      : { canonical: '', mallSocials: {}, mallEcosystemDomains: [], mallChromeImages: [] };

    const page = await newPage(browser);
    try {
      const data = await loadPageWithStrategy(page, store_url, { attempt: 1 });
      const links = await page.evaluate(() => Array.from(document.querySelectorAll('a[href]'))
        .map(a => ({ href: a.href, text: (a.innerText || '').trim() })).filter(o => o.href));
      const result = await extractHours({
        url: store_url, text: data.text, html: data.html, jsonLd: data.jsonLd,
        metaTags: data.metaTags, links,
      }, {
        mallContext,
        client: null, // skip LLM layer for the lightweight tool
        model: null,
        browser,
        mallOrigin: mall_root_url ? new URL(mall_root_url).origin : null,
        logger,
      });

      return {
        content: [{
          type: 'text',
          text: JSON.stringify({
            store_url,
            store_hours: result.canonical,
            source: result.source,
            confidence: result.confidence,
            sync_with_centre_hours: result.sync_with_centre_hours,
          }, null, 2),
        }],
      };
    } finally {
      try { await page.close(); } catch (_) {}
    }
  } finally {
    try { await browser.close(); } catch (_) {}
  }
}

function handleValidateImageUrl({ url }) {
  if (!url) return Promise.resolve(errorResult('url is required'));
  return new Promise((resolve) => {
    let finalUrl = url;
    let redirects = 0;
    const attempt = (u) => {
      let parsed;
      try { parsed = new URL(u); } catch { return resolve(errorResult(`Invalid URL: ${u}`)); }
      const mod = parsed.protocol === 'https:' ? https : http;
      const req = mod.request(u, {
        method: 'HEAD',
        timeout: 10000,
        headers: {
          'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
          'Accept': 'image/*,*/*;q=0.5',
        },
      }, (res) => {
        if (res.statusCode >= 300 && res.statusCode < 400 && res.headers.location && redirects < 4) {
          redirects++;
          try { finalUrl = new URL(res.headers.location, u).toString(); return attempt(finalUrl); }
          catch { /* fall through to report */ }
        }
        const ct = (res.headers['content-type'] || '').toLowerCase();
        const cl = parseInt(res.headers['content-length'] || '0', 10) || null;
        const isImage = ct.startsWith('image/');
        resolve({
          content: [{
            type: 'text',
            text: JSON.stringify({
              url, final_url: finalUrl, status: res.statusCode,
              content_type: ct, content_length: cl,
              is_image: isImage && res.statusCode === 200,
              verdict: isImage && res.statusCode === 200
                ? `OK — serves a real image (${ct}, ${cl ?? '?'} B)`
                : `BAD — status ${res.statusCode}, content-type "${ct}". CMS upload will likely fail.`,
            }, null, 2),
          }],
        });
      });
      req.on('error', (e) => resolve(errorResult(`HEAD failed: ${e.message}`)));
      req.on('timeout', () => { req.destroy(); resolve(errorResult('HEAD request timed out')); });
      req.end();
    };
    attempt(url);
  });
}

// ---------------------------------------------------------------------------
// Per-store scrape (port of main.js processStoreWithRetry, no-retry version)
// ---------------------------------------------------------------------------

async function scrapeOneStore({
  url, mmId, browser, client, model, extractor,
  directoryLogoUrl, mallContext, mallOrigin, storeCardLogos,
}) {
  const page = await newPage(browser);
  const { interceptedJson } = await attachXhrInterceptor(page, { directoryMode: false });
  try {
    const data = await loadPageWithStrategy(page, url, { attempt: 1 });
    const links = await page.evaluate(() => Array.from(document.querySelectorAll('a[href]'))
      .map(a => ({ href: a.href, text: (a.innerText || '').trim() })).filter(o => o.href));

    const urlSlug = (() => {
      try {
        const parts = new URL(url).pathname.replace(/\/$/, '').split('/').filter(Boolean);
        return parts[parts.length - 1] || '';
      } catch { return ''; }
    })();

    const name = data.h1 || slugToName(urlSlug) || data.title || '';
    const hours = await extractHours({
      url, text: data.text, html: data.html, jsonLd: data.jsonLd,
      metaTags: data.metaTags, links,
    }, { mallContext, client, model, browser, mallOrigin, logger });

    const phone = extractPhone(data.text, data.jsonLd);
    const socials = extractSocials(links, mallContext.mallSocials);
    const website = extractWebsite(links, mallOrigin, name, mallContext.mallEcosystemDomains || []);
    const flagsFromText = detectStatusFlags(data.text);

    const rawCands = await classifyImages(page, url, {
      storeName: name,
      mallName: mallContext.mallName || '',
      mallEcosystem: mallContext.mallEcosystemDomains || [],
      mallChromeImages: mallContext.mallChromeImages || [],
      storeCardLogos: storeCardLogos || [],
    });
    const picks = pickImages(rawCands, { directoryLogoUrl, storeName: name });
    let logoUrl = picks.logo_image_url || '';
    const isGifUrl = /\.gif(\?|$)/i.test(logoUrl);
    if ((!logoUrl || isGifUrl) && website) {
      try {
        const fallback = await fetchBrandLogo(browser, website, name, { logger });
        if (fallback && fallback.url) {
          if (!logoUrl || (isGifUrl && !/\.gif(\?|$)/i.test(fallback.url))) logoUrl = fallback.url;
        }
      } catch (_) {}
    }

    const llmInput = {
      url, urlSlug, h1: data.h1, title: data.title,
      textContent: data.text, jsonLd: data.jsonLd, metaTags: data.metaTags,
      interceptedJson: interceptedJson.slice(0, 3),
    };
    const { fields: llmFields } = await extractor.extract(llmInput, hours.canonical);

    return mergeExtracted(mmId, {
      name, website, phone, ...socials, ...flagsFromText, ...llmFields,
      logo_image_url: logoUrl,
      brand_image_url: picks.brand_image_url || '',
      store_front_image_url: picks.store_front_image_url || '',
      store_hours: hours.canonical,
      hours_source: hours.source || '',
      hours_confidence: hours.confidence,
      sync_with_centre_hours: hours.sync_with_centre_hours || false,
    });
  } finally {
    try { await page.close(); } catch (_) {}
  }
}

function slugToName(slug) {
  if (!slug) return '';
  return slug.split('-').map(w => w.charAt(0).toUpperCase() + w.slice(1)).join(' ');
}

function writeResults(directoryUrl, stores) {
  const outDir = path.join(process.cwd(), 'extracted_stores');
  fs.mkdirSync(outDir, { recursive: true });
  const host = new URL(directoryUrl).hostname.replace(/^www\./, '');
  const ts = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
  const base = path.join(outDir, `stores_v5_${host}_${ts}`);
  fs.writeFileSync(`${base}.json`, JSON.stringify(stores, null, 2));
  fs.writeFileSync(`${base}.csv`, storesToCSV(stores));
  return { json: `${base}.json`, csv: `${base}.csv` };
}

function errorResult(message) {
  return { isError: true, content: [{ type: 'text', text: message }] };
}

// ---------------------------------------------------------------------------
// Start
// ---------------------------------------------------------------------------

async function main() {
  const transport = new StdioServerTransport();
  await server.connect(transport);
  // Server is now running; stdio loop is alive for the life of the process.
}

main().catch((err) => {
  // Errors here happen before stdio is up — safe to write to stderr.
  console.error('Fatal:', err.stack || err.message);
  process.exit(1);
});
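
For orientation, here is a minimal client-side sketch that spawns this server over stdio and calls one of its tools, using the same @modelcontextprotocol/sdk the server imports. The script path and test URL are assumptions for illustration, not part of the package.

// smoke-test.js — hypothetical; exercises mcp-server.js from the client side.
const { Client } = require('@modelcontextprotocol/sdk/client/index.js');
const { StdioClientTransport } = require('@modelcontextprotocol/sdk/client/stdio.js');

async function main() {
  // Spawn the server as a child process; stdio carries the MCP framing.
  const transport = new StdioClientTransport({
    command: 'node',
    args: ['src/mcp-server.js'], // assumed path to the file above
  });
  const client = new Client({ name: 'smoke-test', version: '0.0.0' });
  await client.connect(transport);

  // Should list scrape_directory, get_store_hours, validate_image_url.
  const { tools } = await client.listTools();
  console.log(tools.map((t) => t.name));

  // Cheapest tool to exercise end-to-end: a single HEAD request.
  const res = await client.callTool({
    name: 'validate_image_url',
    arguments: { url: 'https://example.com/logo.png' }, // placeholder URL
  });
  console.log(res.content[0].text);

  await client.close();
}

main().catch((err) => { console.error(err); process.exit(1); });
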
package/src/openai-proxy.js
ADDED

@@ -0,0 +1,52 @@
'use strict';

const { OpenAI } = require('openai');

/**
 * Build an OpenAI client that either:
 *   (a) Hits api.openai.com directly with OPENAI_API_KEY — used for local dev
 *       and as a fallback.
 *   (b) Hits the Cloudflare Worker proxy at MALL_SCRAPER_PROXY_URL with the
 *       shared secret MALL_SCRAPER_TOKEN. The Worker injects the real OpenAI
 *       key server-side so coworkers never see it.
 *
 * Priority: if BOTH proxy env vars are set, use the proxy; otherwise fall back
 * to direct OpenAI.
 */
function createOpenAIClient() {
  const proxyUrl = process.env.MALL_SCRAPER_PROXY_URL;
  const proxyToken = process.env.MALL_SCRAPER_TOKEN;
  const directKey = process.env.OPENAI_API_KEY;

  if (proxyUrl && proxyToken) {
    return new OpenAI({
      apiKey: 'proxy-managed',
      baseURL: proxyUrl.replace(/\/+$/, '') + '/v1',
      defaultHeaders: { 'X-Mall-Scraper-Token': proxyToken },
    });
  }

  if (directKey) {
    return new OpenAI({ apiKey: directKey });
  }

  throw new Error(
    'No OpenAI credentials available. Set either MALL_SCRAPER_PROXY_URL + ' +
    'MALL_SCRAPER_TOKEN (recommended), or OPENAI_API_KEY for direct access.'
  );
}

/**
 * Quick env probe used by the MCP server and CLI to give a clear error message
 * before any scraping starts.
 */
function describeCredentials() {
  const proxyUrl = process.env.MALL_SCRAPER_PROXY_URL;
  const proxyToken = process.env.MALL_SCRAPER_TOKEN;
  const directKey = process.env.OPENAI_API_KEY;
  if (proxyUrl && proxyToken) return { mode: 'proxy', endpoint: proxyUrl };
  if (directKey) return { mode: 'direct', endpoint: 'https://api.openai.com' };
  return { mode: 'none', endpoint: null };
}

module.exports = { createOpenAIClient, describeCredentials };
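
From the caller's side the two auth modes are interchangeable: both branches return a standard OpenAI SDK client, and only the base URL and headers differ. A usage sketch, assuming the openai v4 Node SDK; the model name just echoes the default used elsewhere in this package:

// Hypothetical caller — mirrors how mcp-server.js gates on credentials.
const { createOpenAIClient, describeCredentials } = require('./openai-proxy');

const creds = describeCredentials();
if (creds.mode === 'none') {
  // Fail fast with the same guidance the MCP tool returns.
  console.error('Set MALL_SCRAPER_PROXY_URL + MALL_SCRAPER_TOKEN, or OPENAI_API_KEY.');
  process.exit(1);
}
console.error(`auth: ${creds.mode} via ${creds.endpoint}`); // stderr, never stdout

const client = createOpenAIClient();
client.chat.completions
  .create({ model: 'gpt-5.4-mini', messages: [{ role: 'user', content: 'ping' }] })
  .then((r) => console.error(r.choices[0].message.content));
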
package/src/output.js
ADDED
@@ -0,0 +1,21 @@
'use strict';

const fs = require('fs');
const path = require('path');
const { URL } = require('url');
const { storesToCSV } = require('./storeModel');

function writeResults(directoryUrl, stores) {
  const outDir = path.join(__dirname, '..', 'extracted_stores');
  if (!fs.existsSync(outDir)) fs.mkdirSync(outDir, { recursive: true });

  const host = new URL(directoryUrl).hostname.replace(/^www\./, '');
  const ts = new Date().toISOString().replace(/[:.]/g, '-');
  const base = path.join(outDir, `stores_v5_${host}_${ts}`);

  fs.writeFileSync(`${base}.json`, JSON.stringify(stores, null, 2));
  fs.writeFileSync(`${base}.csv`, storesToCSV(stores));
  return { json: `${base}.json`, csv: `${base}.csv` };
}

module.exports = { writeResults };
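
Note this is a second copy of writeResults: unlike the one inlined in mcp-server.js (which writes under process.cwd() and truncates the timestamp), this module writes next to the package and keeps the full ISO timestamp. A quick sketch of what callers get back; the values shown are illustrative:

// Hypothetical usage of ./output — the stores array shape is abbreviated.
const { writeResults } = require('./output');

const stores = [{ mm_id: 1, name: 'Example Store' }];
const paths = writeResults('https://grasslands.ca/store-directory/', stores);
// paths.json -> .../extracted_stores/stores_v5_grasslands.ca_<ISO timestamp>.json
// paths.csv  -> .../extracted_stores/stores_v5_grasslands.ca_<ISO timestamp>.csv
console.log(paths);
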
package/src/retryStrategy.js
ADDED

@@ -0,0 +1,60 @@
'use strict';

/**
 * Per-store retry with escalating wait strategies.
 *
 *   attempt 1: domcontentloaded + single scroll
 *   attempt 2: domcontentloaded + scroll + clickExpandables (2s wait)
 *   attempt 3: networkidle2 + scroll + clickExpandables (3s wait)
 *
 * Each attempt runs the full pipeline (hours + deterministic + LLM).
 * Accept on first attempt whose combined confidence >= threshold.
 * Return the best result if all attempts fall short.
 */

const MAX_ATTEMPTS = 3;
const DEFAULT_THRESHOLD = 0.80;

async function scrapeWithRetry({ runOnce, threshold = DEFAULT_THRESHOLD, logger }) {
  let best = null;
  for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
    let result;
    try {
      result = await runOnce(attempt);
    } catch (err) {
      if (logger) logger.warn(`  ⚠ Attempt ${attempt} failed: ${err.message}`);
      if (attempt === MAX_ATTEMPTS) break;
      await sleep(1500 * attempt);
      continue;
    }
    if (!best || result.combinedConfidence > best.combinedConfidence) best = result;
    if (result.combinedConfidence >= threshold) {
      if (attempt > 1 && logger) {
        logger.info(`  ✅ Accepted on attempt ${attempt} (conf ${pct(result.combinedConfidence)})`);
      }
      return { ...result, attempt, strategy: label(attempt), needs_review: false };
    }
    if (attempt < MAX_ATTEMPTS) {
      if (logger) logger.info(`  ⚡ Conf ${pct(result.combinedConfidence)} < ${pct(threshold)} — retrying`);
      await sleep(800 * attempt);
    }
  }
  if (!best) {
    return {
      store: null, combinedConfidence: 0,
      attempt: MAX_ATTEMPTS, strategy: 'failed', needs_review: true,
    };
  }
  best.attempt = best.attempt || MAX_ATTEMPTS;
  best.strategy = label(best.attempt);
  best.needs_review = best.combinedConfidence < threshold;
  return best;
}

function label(a) {
  return a === 1 ? 'baseline' : a === 2 ? 'deeper' : a === 3 ? 'network_focus' : `attempt_${a}`;
}
function pct(n) { return `${(n * 100).toFixed(0)}%`; }
function sleep(ms) { return new Promise(r => setTimeout(r, ms)); }

module.exports = { scrapeWithRetry, DEFAULT_THRESHOLD, MAX_ATTEMPTS };
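
To connect this to the per-store pipeline, the caller supplies runOnce and maps its own score onto combinedConfidence. A sketch under assumptions — runPipeline and the confidence mapping here are hypothetical; per the comment in mcp-server.js, the real wiring lives in main.js (processStoreWithRetry):

const { scrapeWithRetry } = require('./retryStrategy');

// Hypothetical adapter: run one full pass at the given attempt level and
// report a single combined confidence for the accept/retry decision.
async function scrapeStoreWithRetry(url, runPipeline) {
  return scrapeWithRetry({
    runOnce: async (attempt) => {
      const store = await runPipeline(url, { attempt }); // attempt picks the wait strategy
      return { store, combinedConfidence: store.hours_confidence ?? 0 };
    },
    threshold: 0.8,
    logger: console, // console.info/warn satisfy the logger shape used above
  });
}
// result.needs_review === true means every attempt fell short of the
// threshold and the best-scoring pass was kept.
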