mallmaverick-store-scraper 0.1.5 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "mallmaverick-store-scraper",
-  "version": "0.1.5",
+  "version": "0.2.0",
   "description": "MCP server + CLI for scraping shopping mall store directories. Hours-first layered pipeline + image classification.",
   "main": "src/main.js",
   "type": "commonjs",
package/src/mcp-server.js CHANGED
@@ -56,15 +56,22 @@ const TOOLS = [
   {
     name: 'scrape_directory',
     description:
-      'Scrape a shopping-mall store directory and return per-store records ' +
-      '(name, hours, phone, logo, brand image, categories, etc.). Use this ' +
-      'when the user wants to capture a directory like ' +
+      'Scrape a shopping-mall store directory in batches. Each call processes ' +
+      'up to `max_stores` stores starting from `start_offset` (default 30 stores ' +
+      'per call). Use when the user wants to capture a directory like ' +
       'https://grasslands.ca/store-directory/.\n\n' +
-      'AFTER RUNNING THIS TOOL: reply with ONE short sentence the count ' +
-      'and the file path. The CSV file is attached as a resource_link in the ' +
-      'tool response; do NOT paste CSV text into your reply, do NOT print the ' +
-      'JSON, do NOT summarize each store. If the tool response includes an ' +
-      'error block, surface that error verbatim to the user.',
+      'BATCHING IMPORTANT: Claude Desktop has a ~4-min tool-call timeout. ' +
+      'For large directories (>30 stores) you MUST chain multiple calls:\n' +
+      ' 1. First call: scrape_directory(directory_url) uses defaults ' +
+      ' (max_stores=30, start_offset=0). Save the returned `csv_file_path`.\n' +
+      ' 2. If the response has `is_complete=false`, IMMEDIATELY call again with ' +
+      ' start_offset=<next_offset from response> AND append_to=<csv_file_path> ' +
+      ' so all batches merge into the same CSV file.\n' +
+      ' 3. Repeat until is_complete=true. Then announce the single final file.\n\n' +
+      'AFTER EACH BATCH: reply with ONE short sentence — the batch count and ' +
+      'overall progress (e.g. "Batch 2 done — 60/120 stores"). Do NOT paste ' +
+      'CSV text or JSON. The CSV is attached as a resource_link. If the tool ' +
+      'response includes an error block, surface that error verbatim.',
     inputSchema: {
       type: 'object',
       properties: {
@@ -74,8 +81,17 @@ const TOOLS = [
         },
         max_stores: {
           type: 'number',
-          description: 'Max number of stores to scrape (0 = all). Default 10.',
-          default: 10,
+          description: 'Max stores to scrape in THIS batch (0 = scrape all in this batch — only safe for ≤30 known small dirs). Default 30.',
+          default: 30,
+        },
+        start_offset: {
+          type: 'number',
+          description: 'Skip the first N discovered stores. Use this with append_to to chain batches. Default 0.',
+          default: 0,
+        },
+        append_to: {
+          type: 'string',
+          description: 'Absolute path to a CSV file produced by a previous batch. When set, this batch\'s rows are appended (no duplicate header) so all batches merge into one file. Get this value from the previous batch\'s `csv_file_path` response field.',
         },
         concurrency: {
           type: 'number',
@@ -87,11 +103,6 @@ const TOOLS = [
           description: 'OpenAI model. Default gpt-5.4-mini.',
           default: 'gpt-5.4-mini',
         },
-        write_csv: {
-          type: 'boolean',
-          description: 'Also write a CSV + JSON to extracted_stores/. Default true.',
-          default: true,
-        },
       },
       required: ['directory_url'],
     },
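
Taken together, the new description and schema fields define a simple cursor protocol over the directory. A minimal sketch of the loop a client is expected to drive, assuming a generic callTool(name, args) helper and the readBatchHint parser sketched after the machine-readable-hint hunk below (both are illustrations, not exports of this package):

    // Illustrative batching loop; callTool and readBatchHint are assumed helpers.
    async function scrapeWholeDirectory(callTool, directoryUrl) {
      // Batch 1: defaults (max_stores=30, start_offset=0) create a fresh CSV.
      let hint = readBatchHint(await callTool('scrape_directory', {
        directory_url: directoryUrl,
      }));
      // Continuation batches: advance the cursor, append to the same file.
      while (hint && !hint.is_complete) {
        hint = readBatchHint(await callTool('scrape_directory', {
          directory_url: directoryUrl,
          start_offset: hint.next_offset,
          append_to: hint.csv_file_path,
        }));
      }
      return hint ? hint.csv_file_path : null; // one merged CSV
    }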
@@ -138,7 +149,7 @@ const TOOLS = [
   },
 ];

-const PACKAGE_VERSION = '0.1.5';
+const PACKAGE_VERSION = '0.2.0';

 const server = new Server(
   { name: 'mall-scraper-mcp', version: PACKAGE_VERSION },
@@ -167,7 +178,10 @@ server.setRequestHandler(CallToolRequestSchema, async (req) => {
 // Tool implementations
 // ---------------------------------------------------------------------------

-async function handleScrapeDirectory({ directory_url, max_stores = 10, concurrency = 2, model = 'gpt-5.4-mini', write_csv = true }) {
+async function handleScrapeDirectory({
+  directory_url, max_stores = 30, start_offset = 0, append_to,
+  concurrency = 2, model = 'gpt-5.4-mini',
+}) {
   if (!directory_url) return errorResult('directory_url is required');
   const creds = describeCredentials();
   if (creds.mode === 'none') {
@@ -182,21 +196,24 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren
   const extractor = new StoreExtractor({ client, model, useVision: false, logger });
   const conc = Math.min(5, Math.max(1, parseInt(concurrency, 10) || 2));
   const max = Math.max(0, parseInt(max_stores, 10) || 0);
+  const offset = Math.max(0, parseInt(start_offset, 10) || 0);
+  const isAppending = !!append_to;

   try {
     const mallRoot = new URL(directory_url).origin;
     const mallContext = await getMallContext(browser, mallRoot);
     const { storeUrls: allUrls, logoMap } = await discoverStores(browser, directory_url, logger);
     const storeCardLogos = Array.from(logoMap.values());
-    const urls = max > 0 ? allUrls.slice(0, max) : allUrls;
+
+    const totalAvailable = allUrls.length;
+    const sliced = allUrls.slice(offset, max > 0 ? offset + max : undefined);

     const stores = [];
-    let mmId = 1;
-    // Sequential within the MCP context (concurrency adds nondeterminism that's
-    // less useful here than a clear per-store progress trail in the result).
+    // mm_id reflects position in the OVERALL directory (offset + index), so
+    // ids are unique across all merged batches.
     const pLimit = require('p-limit')(conc);
-    const tasks = urls.map((url) => pLimit(async () => {
-      const myId = mmId++;
+    const tasks = sliced.map((url, idx) => pLimit(async () => {
+      const myId = offset + idx + 1;
       const directoryLogoUrl = logoMap.get(url.replace(/\/+$/, '').toLowerCase()) || null;
       const store = await scrapeOneStore({
         url, mmId: myId, browser, client, model, extractor,
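
The slice expression is the heart of the cursor: max_stores now bounds a single batch, 0 means "everything from start_offset onward", and ids encode the absolute directory position rather than a per-call counter. A worked example with illustrative values:

    // Worked example of the batch slice and id arithmetic.
    const allUrls = ['a', 'b', 'c', 'd', 'e']; // 5 discovered store URLs
    const offset = 2;                          // start_offset of this batch
    const max = 2;                             // max_stores of this batch
    const sliced = allUrls.slice(offset, max > 0 ? offset + max : undefined);
    // sliced -> ['c', 'd']
    const ids = sliced.map((url, idx) => offset + idx + 1);
    // ids -> [3, 4]; the next batch (offset 4) continues with id 5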
@@ -208,19 +225,28 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren
     await Promise.all(tasks);
     stores.sort((a, b) => a.mm_id - b.mm_id);

-    // Generate the CSV regardless of whether we manage to write it to disk —
-    // it's always returned inline so the user gets it back automatically.
-    const csvText = storesToCSV(stores);
+    const extractedInThisCall = stores.length;
+    const nextOffset = offset + sliced.length;
+    const isComplete = nextOffset >= totalAvailable;

+    // Two write modes:
+    //  - Appending to a prior batch's file (no BOM, no header, rows only)
+    //  - Fresh file (full CSV with BOM + header)
     let writtenPaths = null;
     let writeError = null;
-    if (write_csv) {
-      try {
-        writtenPaths = writeResults(directory_url, stores, csvText);
-      } catch (err) {
-        writeError = err.message;
-        // Don't fail the tool — the CSV is still returned inline below.
+    try {
+      if (isAppending) {
+        const rowsOnly = storesToCSV(stores, { rowsOnly: true });
+        appendRowsToCSV(append_to, rowsOnly);
+        const jsonPath = append_to.replace(/\.csv$/, '.json');
+        appendStoresToJSON(jsonPath, stores);
+        writtenPaths = { json: jsonPath, csv: append_to, dir: path.dirname(append_to) };
+      } else {
+        const fullCsv = storesToCSV(stores);
+        writtenPaths = writeResults(directory_url, stores, fullCsv);
       }
+    } catch (err) {
+      writeError = err.message;
     }

     const bySource = {};
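
The invariant behind the two write modes: exactly one batch (the first) writes the BOM and header, and every appended batch contributes data rows only. A sketch of the resulting sequence, with storesA and storesB standing in for per-batch results:

    const fs = require('fs');
    // Batch 1 writes a complete CSV (BOM + header + rows)...
    fs.writeFileSync('stores.csv', storesToCSV(storesA));
    // ...batch 2 appends rows only, so the header never repeats.
    fs.appendFileSync('stores.csv', storesToCSV(storesB, { rowsOnly: true }));
    // stores.csv now holds one BOM, one header line, then rows from both batches.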
@@ -232,25 +258,22 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren

     const summary = {
       directory_url,
-      stores_extracted: stores.length,
+      total_available: totalAvailable,
+      extracted_in_this_call: extractedInThisCall,
+      start_offset: offset,
+      next_offset: isComplete ? null : nextOffset,
+      is_complete: isComplete,
+      csv_file_path: writtenPaths ? writtenPaths.csv : null,
       hours_layer_breakdown: bySource,
       llm_usage: usage,
       llm_failed: usage.errors > 0
-        ? `⚠ ${usage.errors} LLM calls failed (description/categories/etc. fields will be empty). Last error: ${usage.lastError}. Run check_status to diagnose.`
+        ? `⚠ ${usage.errors} LLM calls failed. Last error: ${usage.lastError}. Run check_status to diagnose.`
         : null,
-      written_files: writtenPaths,
       write_error: writeError,
       auth_mode: creds.mode,
       mcp_version: PACKAGE_VERSION,
     };

-    // Response design:
-    //  1. Brief status line (always) — what the user sees in the chat reply
-    //  2. resource_link to the CSV — file attachment with user-priority annotations
-    //  3. ONLY on error: a loud error block so the user knows something failed
-    //
-    // No JSON dump / no inline CSV preview when things succeed — keeps the chat
-    // reply minimal.
     const host = new URL(directory_url).hostname.replace(/^www\./, '');
     const csvFilename = writtenPaths
       ? path.basename(writtenPaths.csv)
@@ -263,10 +286,22 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren
     const hasWriteFailure = !!writeError;
     const anyFailure = hasLlmFailure || hasWriteFailure;

-    const brief =
-      `✅ mall-scraper-mcp v${PACKAGE_VERSION}\n` +
-      `${stores.length} store${stores.length === 1 ? '' : 's'} from ${host}\n` +
-      (writtenPaths ? `📄 ${writtenPaths.csv}` : '⚠ Disk write failed');
+    // Brief differs depending on whether this is the final batch or a continuation
+    const progress = `${offset + extractedInThisCall}/${totalAvailable}`;
+    const versionTag = `mall-scraper-mcp v${PACKAGE_VERSION}`;
+    let brief;
+    if (isComplete) {
+      brief =
+        `✅ ${versionTag}\n` +
+        `Done — ${progress} stores from ${host}\n` +
+        (writtenPaths ? `📄 ${writtenPaths.csv}` : '⚠ Disk write failed');
+    } else {
+      brief =
+        `⏳ ${versionTag}\n` +
+        `Batch done — ${progress} stores from ${host}\n` +
+        (writtenPaths ? `📄 (in progress) ${writtenPaths.csv}` : '⚠ Disk write failed') + '\n' +
+        `→ More to scrape. Call again with start_offset=${nextOffset} and append_to=${writtenPaths ? writtenPaths.csv : '<csv path>'}`;
+    }

     const content = [
       {
@@ -276,21 +311,19 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren
       },
     ];

-    // resource_link only if we have a real file path (file:// URI must point at
-    // an existing file for the client to do anything useful with it).
     if (csvUri) {
       content.push({
         type: 'resource_link',
         uri: csvUri,
         name: csvFilename,
-        description: `Store directory scrape — ${stores.length} stores from ${host}`,
+        description: isComplete
+          ? `Final CSV — ${totalAvailable} stores from ${host}`
+          : `Partial CSV (${progress}) — more batches coming`,
         mimeType: 'text/csv',
-        annotations: { audience: ['user'], priority: 0.9 },
+        annotations: { audience: ['user'], priority: isComplete ? 1.0 : 0.5 },
       });
     }

-    // Loud error block — only when something failed. The user explicitly asked
-    // for nothing other than a status rundown UNLESS something broke.
     if (anyFailure) {
       const errLines = [];
       if (hasLlmFailure) {
@@ -310,6 +343,20 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren
       });
     }

+    // Include a small machine-readable hint so Claude can grab next_offset
+    // reliably without parsing the brief.
+    content.push({
+      type: 'text',
+      text: '\n' + JSON.stringify({
+        is_complete: isComplete,
+        next_offset: isComplete ? null : nextOffset,
+        csv_file_path: writtenPaths ? writtenPaths.csv : null,
+        total_available: totalAvailable,
+        extracted_in_this_call: extractedInThisCall,
+      }, null, 2),
+      annotations: { audience: ['assistant'], priority: 0.4 },
+    });
+
     return { content };
   } finally {
     try { await browser.close(); } catch (_) {}
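
Because the hint is pushed as the final text block and is plain JSON, a client can recover the batching fields without scraping the human-readable brief. A sketch against the content shape produced above (this is the hypothetical readBatchHint used in the loop sketch earlier):

    // Sketch: extract the assistant-audience JSON hint from a tool result.
    function readBatchHint(result) {
      const textBlocks = result.content.filter((c) => c.type === 'text');
      const last = textBlocks[textBlocks.length - 1]; // the hint is pushed last
      try {
        return JSON.parse(last.text.trim());
      } catch (_) {
        return null; // no hint block (e.g. an older server version)
      }
    }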
@@ -610,6 +657,39 @@ function writeResults(directoryUrl, stores, csvText) {
   return { json: `${base}.json`, csv: `${base}.csv`, dir: outDir };
 }

+/**
+ * Append CSV rows (header-stripped) to an existing CSV file.
+ * Validates that the target exists — otherwise the caller chained without
+ * a prior fresh batch, which would be a usage error.
+ */
+function appendRowsToCSV(csvPath, rowsOnlyText) {
+  if (!fs.existsSync(csvPath)) {
+    throw new Error(`append_to path does not exist: ${csvPath}. The first batch must run without append_to to create the file.`);
+  }
+  fs.appendFileSync(csvPath, rowsOnlyText);
+}
+
+/**
+ * Append stores to an existing JSON array file (which holds prior batches).
+ * Reads the file, parses, concats, rewrites. OK for the sizes we deal with.
+ */
+function appendStoresToJSON(jsonPath, stores) {
+  let existing = [];
+  if (fs.existsSync(jsonPath)) {
+    try {
+      const raw = fs.readFileSync(jsonPath, 'utf8');
+      const parsed = JSON.parse(raw);
+      if (Array.isArray(parsed)) existing = parsed;
+    } catch (_) {
+      // If parse fails, start a sibling .partial file rather than overwriting.
+      const partial = jsonPath.replace(/\.json$/, '.partial.json');
+      fs.writeFileSync(partial, JSON.stringify(stores, null, 2));
+      return;
+    }
+  }
+  fs.writeFileSync(jsonPath, JSON.stringify([...existing, ...stores], null, 2));
+}
+
 function errorResult(message) {
   return { isError: true, content: [{ type: 'text', text: message }] };
 }
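
A continuation batch reduces to these two helpers: appendRowsToCSV enforces the "first batch creates the file" rule, and appendStoresToJSON keeps the sibling JSON in sync via a full read-concat-rewrite (linear work per batch, negligible at directory scale). A usage sketch with a hypothetical path and stub batch:

    // What the append branch in handleScrapeDirectory effectively does.
    const csvPath = '/tmp/extracted_stores/grasslands.csv'; // hypothetical
    const rows = storesToCSV(batchStores, { rowsOnly: true });
    appendRowsToCSV(csvPath, rows); // throws if batch 1 never created the file
    appendStoresToJSON(csvPath.replace(/\.csv$/, '.json'), batchStores);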
package/src/storeModel.js CHANGED
@@ -124,7 +124,16 @@ function csvCell(val, { alwaysQuote = false } = {}) {
  * Booleans and numeric fields are left unquoted so destination systems can
  * type-detect them.
  */
-function storesToCSV(stores, { lineEnding = '\r\n', bom = true, alwaysQuoteStrings = true } = {}) {
+/**
+ * Serialize stores to CSV.
+ *
+ * Options:
+ *  - rowsOnly: skip the BOM + header line (for appending to an existing CSV)
+ *  - lineEnding, bom, alwaysQuoteStrings: as before
+ */
+function storesToCSV(stores, {
+  lineEnding = '\r\n', bom = true, alwaysQuoteStrings = true, rowsOnly = false,
+} = {}) {
   const csvFields = STORE_FIELDS.filter(f => !CSV_EXCLUDE_FIELDS.has(f));
   const formatCell = (field, val) => {
     if (BOOLEAN_FIELDS.has(field) || NUMERIC_FIELDS.has(field)) {
@@ -136,11 +145,13 @@ function storesToCSV(stores, { lineEnding = '\r\n', bom = true, alwaysQuoteStrin
     .map(f => csvCell(f, { alwaysQuote: alwaysQuoteStrings }))
     .join(',');
   if (!stores || stores.length === 0) {
+    if (rowsOnly) return '';
     return (bom ? '\uFEFF' : '') + headerLine + lineEnding;
   }
   const rows = stores.map(store =>
     csvFields.map(f => formatCell(f, store[f] == null ? '' : store[f])).join(',')
   );
+  if (rowsOnly) return rows.join(lineEnding) + lineEnding;
   return (bom ? '\uFEFF' : '') + [headerLine, ...rows].join(lineEnding) + lineEnding;
 }
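
The two rowsOnly guards give the append path clean edge cases: an empty continuation batch appends nothing, while a fresh empty scrape still produces a header-only file. Illustrative expectations (stubStores is a placeholder; columns come from STORE_FIELDS):

    // Illustrative expectations for the rowsOnly option.
    storesToCSV([]);                             // BOM + header line + CRLF
    storesToCSV([], { rowsOnly: true });         // '' (nothing to append)
    storesToCSV(stubStores);                     // BOM + header + data rows
    storesToCSV(stubStores, { rowsOnly: true }); // data rows only, CRLF-terminated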