mallmaverick-store-scraper 0.1.5 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "mallmaverick-store-scraper",
-  "version": "0.1.5",
+  "version": "0.2.0",
   "description": "MCP server + CLI for scraping shopping mall store directories. Hours-first layered pipeline + image classification.",
   "main": "src/main.js",
   "type": "commonjs",
package/src/mcp-server.js CHANGED
@@ -56,15 +56,22 @@ const TOOLS = [
   {
     name: 'scrape_directory',
     description:
-      'Scrape a shopping-mall store directory and return per-store records ' +
-      '(name, hours, phone, logo, brand image, categories, etc.). Use this ' +
-      'when the user wants to capture a directory like ' +
+      'Scrape a shopping-mall store directory in batches. Each call processes ' +
+      'up to `max_stores` stores starting from `start_offset` (default 30 stores ' +
+      'per call). Use when the user wants to capture a directory like ' +
       'https://grasslands.ca/store-directory/.\n\n' +
-      'AFTER RUNNING THIS TOOL: reply with ONE short sentence the count ' +
-      'and the file path. The CSV file is attached as a resource_link in the ' +
-      'tool response; do NOT paste CSV text into your reply, do NOT print the ' +
-      'JSON, do NOT summarize each store. If the tool response includes an ' +
-      'error block, surface that error verbatim to the user.',
+      'BATCHING IMPORTANT: Claude Desktop has a ~4-min tool-call timeout. ' +
+      'For large directories (>30 stores) you MUST chain multiple calls:\n' +
+      ' 1. First call: scrape_directory(directory_url) uses defaults ' +
+      ' (max_stores=30, start_offset=0). Save the returned `csv_file_path`.\n' +
+      ' 2. If the response has `is_complete=false`, IMMEDIATELY call again with ' +
+      ' start_offset=<next_offset from response> AND append_to=<csv_file_path> ' +
+      ' so all batches merge into the same CSV file.\n' +
+      ' 3. Repeat until is_complete=true. Then announce the single final file.\n\n' +
+      'AFTER EACH BATCH: reply with ONE short sentence — the batch count and ' +
+      'overall progress (e.g. "Batch 2 done — 60/120 stores"). Do NOT paste ' +
+      'CSV text or JSON. The CSV is attached as a resource_link. If the tool ' +
+      'response includes an error block, surface that error verbatim.',
     inputSchema: {
       type: 'object',
       properties: {
@@ -74,8 +81,17 @@ const TOOLS = [
         },
         max_stores: {
           type: 'number',
-          description: 'Max number of stores to scrape (0 = all). Default 10.',
-          default: 10,
+          description: 'Max stores to scrape in THIS batch (0 = scrape all in this batch — only safe for ≤30 known small dirs). Default 30.',
+          default: 30,
+        },
+        start_offset: {
+          type: 'number',
+          description: 'Skip the first N discovered stores. Use this with append_to to chain batches. Default 0.',
+          default: 0,
+        },
+        append_to: {
+          type: 'string',
+          description: 'Absolute path to a CSV file produced by a previous batch. When set, this batch\'s rows are appended (no duplicate header) so all batches merge into one file. Get this value from the previous batch\'s `csv_file_path` response field.',
         },
         concurrency: {
           type: 'number',
@@ -87,11 +103,6 @@ const TOOLS = [
           description: 'OpenAI model. Default gpt-5.4-mini.',
           default: 'gpt-5.4-mini',
         },
-        write_csv: {
-          type: 'boolean',
-          description: 'Also write a CSV + JSON to extracted_stores/. Default true.',
-          default: true,
-        },
       },
       required: ['directory_url'],
     },
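
Taken together, the new description and schema fields define a simple cursor protocol over the directory. A minimal sketch of the loop a client is expected to drive, assuming a generic callTool(name, args) helper and the readBatchHint parser sketched after the machine-readable-hint hunk below (both are illustrations, not exports of this package):

    // Illustrative batching loop; callTool and readBatchHint are assumed helpers.
    async function scrapeWholeDirectory(callTool, directoryUrl) {
      // Batch 1: defaults (max_stores=30, start_offset=0) create a fresh CSV.
      let hint = readBatchHint(await callTool('scrape_directory', {
        directory_url: directoryUrl,
      }));
      // Continuation batches: advance the cursor, append to the same file.
      while (hint && !hint.is_complete) {
        hint = readBatchHint(await callTool('scrape_directory', {
          directory_url: directoryUrl,
          start_offset: hint.next_offset,
          append_to: hint.csv_file_path,
        }));
      }
      return hint ? hint.csv_file_path : null; // one merged CSV
    }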
@@ -138,7 +149,7 @@ const TOOLS = [
   },
 ];

-const PACKAGE_VERSION = '0.1.5';
+const PACKAGE_VERSION = '0.2.0';

 const server = new Server(
   { name: 'mall-scraper-mcp', version: PACKAGE_VERSION },
@@ -167,7 +178,10 @@ server.setRequestHandler(CallToolRequestSchema, async (req) => {
 // Tool implementations
 // ---------------------------------------------------------------------------

-async function handleScrapeDirectory({ directory_url, max_stores = 10, concurrency = 2, model = 'gpt-5.4-mini', write_csv = true }) {
+async function handleScrapeDirectory({
+  directory_url, max_stores = 30, start_offset = 0, append_to,
+  concurrency = 2, model = 'gpt-5.4-mini',
+}) {
   if (!directory_url) return errorResult('directory_url is required');
   const creds = describeCredentials();
   if (creds.mode === 'none') {
@@ -182,21 +196,24 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren
   const extractor = new StoreExtractor({ client, model, useVision: false, logger });
   const conc = Math.min(5, Math.max(1, parseInt(concurrency, 10) || 2));
   const max = Math.max(0, parseInt(max_stores, 10) || 0);
+  const offset = Math.max(0, parseInt(start_offset, 10) || 0);
+  const isAppending = !!append_to;

   try {
     const mallRoot = new URL(directory_url).origin;
     const mallContext = await getMallContext(browser, mallRoot);
     const { storeUrls: allUrls, logoMap } = await discoverStores(browser, directory_url, logger);
     const storeCardLogos = Array.from(logoMap.values());
-    const urls = max > 0 ? allUrls.slice(0, max) : allUrls;
+
+    const totalAvailable = allUrls.length;
+    const sliced = allUrls.slice(offset, max > 0 ? offset + max : undefined);

     const stores = [];
-    let mmId = 1;
-    // Sequential within the MCP context (concurrency adds nondeterminism that's
-    // less useful here than a clear per-store progress trail in the result).
+    // mm_id reflects position in the OVERALL directory (offset + index), so
+    // ids are unique across all merged batches.
     const pLimit = require('p-limit')(conc);
-    const tasks = urls.map((url) => pLimit(async () => {
-      const myId = mmId++;
+    const tasks = sliced.map((url, idx) => pLimit(async () => {
+      const myId = offset + idx + 1;
       const directoryLogoUrl = logoMap.get(url.replace(/\/+$/, '').toLowerCase()) || null;
       const store = await scrapeOneStore({
         url, mmId: myId, browser, client, model, extractor,
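
The slice expression is the heart of the cursor: max_stores now bounds a single batch, 0 means "everything from start_offset onward", and ids encode the absolute directory position rather than a per-call counter. A worked example with illustrative values:

    // Worked example of the batch slice and id arithmetic.
    const allUrls = ['a', 'b', 'c', 'd', 'e']; // 5 discovered store URLs
    const offset = 2;                          // start_offset of this batch
    const max = 2;                             // max_stores of this batch
    const sliced = allUrls.slice(offset, max > 0 ? offset + max : undefined);
    // sliced -> ['c', 'd']
    const ids = sliced.map((url, idx) => offset + idx + 1);
    // ids -> [3, 4]; the next batch (offset 4) continues with id 5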
@@ -208,19 +225,28 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren
     await Promise.all(tasks);
     stores.sort((a, b) => a.mm_id - b.mm_id);

-    // Generate the CSV regardless of whether we manage to write it to disk —
-    // it's always returned inline so the user gets it back automatically.
-    const csvText = storesToCSV(stores);
+    const extractedInThisCall = stores.length;
+    const nextOffset = offset + sliced.length;
+    const isComplete = nextOffset >= totalAvailable;

+    // Two write modes:
+    //  - Appending to a prior batch's file (no BOM, no header, rows only)
+    //  - Fresh file (full CSV with BOM + header)
     let writtenPaths = null;
     let writeError = null;
-    if (write_csv) {
-      try {
-        writtenPaths = writeResults(directory_url, stores, csvText);
-      } catch (err) {
-        writeError = err.message;
-        // Don't fail the tool — the CSV is still returned inline below.
+    try {
+      if (isAppending) {
+        const rowsOnly = storesToCSV(stores, { rowsOnly: true });
+        appendRowsToCSV(append_to, rowsOnly);
+        const jsonPath = append_to.replace(/\.csv$/, '.json');
+        appendStoresToJSON(jsonPath, stores);
+        writtenPaths = { json: jsonPath, csv: append_to, dir: path.dirname(append_to) };
+      } else {
+        const fullCsv = storesToCSV(stores);
+        writtenPaths = writeResults(directory_url, stores, fullCsv);
       }
+    } catch (err) {
+      writeError = err.message;
     }

     const bySource = {};
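
The invariant behind the two write modes: exactly one batch (the first) writes the BOM and header, and every appended batch contributes data rows only. A sketch of the resulting sequence, with storesA and storesB standing in for per-batch results:

    const fs = require('fs');
    // Batch 1 writes a complete CSV (BOM + header + rows)...
    fs.writeFileSync('stores.csv', storesToCSV(storesA));
    // ...batch 2 appends rows only, so the header never repeats.
    fs.appendFileSync('stores.csv', storesToCSV(storesB, { rowsOnly: true }));
    // stores.csv now holds one BOM, one header line, then rows from both batches.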
@@ -232,25 +258,22 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren

     const summary = {
       directory_url,
-      stores_extracted: stores.length,
+      total_available: totalAvailable,
+      extracted_in_this_call: extractedInThisCall,
+      start_offset: offset,
+      next_offset: isComplete ? null : nextOffset,
+      is_complete: isComplete,
+      csv_file_path: writtenPaths ? writtenPaths.csv : null,
       hours_layer_breakdown: bySource,
       llm_usage: usage,
       llm_failed: usage.errors > 0
-        ? `⚠ ${usage.errors} LLM calls failed (description/categories/etc. fields will be empty). Last error: ${usage.lastError}. Run check_status to diagnose.`
+        ? `⚠ ${usage.errors} LLM calls failed. Last error: ${usage.lastError}. Run check_status to diagnose.`
         : null,
-      written_files: writtenPaths,
       write_error: writeError,
       auth_mode: creds.mode,
       mcp_version: PACKAGE_VERSION,
     };

-    // Response design:
-    //  1. Brief status line (always) — what the user sees in the chat reply
-    //  2. resource_link to the CSV — file attachment with user-priority annotations
-    //  3. ONLY on error: a loud error block so the user knows something failed
-    //
-    // No JSON dump / no inline CSV preview when things succeed — keeps the chat
-    // reply minimal.
     const host = new URL(directory_url).hostname.replace(/^www\./, '');
     const csvFilename = writtenPaths
       ? path.basename(writtenPaths.csv)
@@ -263,10 +286,22 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren
     const hasWriteFailure = !!writeError;
     const anyFailure = hasLlmFailure || hasWriteFailure;

-    const brief =
-      `✅ mall-scraper-mcp v${PACKAGE_VERSION}\n` +
-      `${stores.length} store${stores.length === 1 ? '' : 's'} from ${host}\n` +
-      (writtenPaths ? `📄 ${writtenPaths.csv}` : '⚠ Disk write failed');
+    // Brief differs depending on whether this is the final batch or a continuation
+    const progress = `${offset + extractedInThisCall}/${totalAvailable}`;
+    const versionTag = `mall-scraper-mcp v${PACKAGE_VERSION}`;
+    let brief;
+    if (isComplete) {
+      brief =
+        `✅ ${versionTag}\n` +
+        `Done — ${progress} stores from ${host}\n` +
+        (writtenPaths ? `📄 ${writtenPaths.csv}` : '⚠ Disk write failed');
+    } else {
+      brief =
+        `⏳ ${versionTag}\n` +
+        `Batch done — ${progress} stores from ${host}\n` +
+        (writtenPaths ? `📄 (in progress) ${writtenPaths.csv}` : '⚠ Disk write failed') + '\n' +
+        `→ More to scrape. Call again with start_offset=${nextOffset} and append_to=${writtenPaths ? writtenPaths.csv : '<csv path>'}`;
+    }

     const content = [
       {
@@ -276,21 +311,19 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren
       },
     ];

-    // resource_link only if we have a real file path (file:// URI must point at
-    // an existing file for the client to do anything useful with it).
     if (csvUri) {
       content.push({
         type: 'resource_link',
         uri: csvUri,
         name: csvFilename,
-        description: `Store directory scrape — ${stores.length} stores from ${host}`,
+        description: isComplete
+          ? `Final CSV — ${totalAvailable} stores from ${host}`
+          : `Partial CSV (${progress}) — more batches coming`,
         mimeType: 'text/csv',
-        annotations: { audience: ['user'], priority: 0.9 },
+        annotations: { audience: ['user'], priority: isComplete ? 1.0 : 0.5 },
       });
     }

-    // Loud error block — only when something failed. The user explicitly asked
-    // for nothing other than a status rundown UNLESS something broke.
     if (anyFailure) {
       const errLines = [];
       if (hasLlmFailure) {
@@ -310,6 +343,20 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren
       });
     }

+    // Include a small machine-readable hint so Claude can grab next_offset
+    // reliably without parsing the brief.
+    content.push({
+      type: 'text',
+      text: '\n' + JSON.stringify({
+        is_complete: isComplete,
+        next_offset: isComplete ? null : nextOffset,
+        csv_file_path: writtenPaths ? writtenPaths.csv : null,
+        total_available: totalAvailable,
+        extracted_in_this_call: extractedInThisCall,
+      }, null, 2),
+      annotations: { audience: ['assistant'], priority: 0.4 },
+    });
+
     return { content };
   } finally {
     try { await browser.close(); } catch (_) {}
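
Because the hint is pushed as the final text block and is plain JSON, a client can recover the batching fields without scraping the human-readable brief. A sketch against the content shape produced above (this is the hypothetical readBatchHint used in the loop sketch earlier):

    // Sketch: extract the assistant-audience JSON hint from a tool result.
    function readBatchHint(result) {
      const textBlocks = result.content.filter((c) => c.type === 'text');
      const last = textBlocks[textBlocks.length - 1]; // the hint is pushed last
      try {
        return JSON.parse(last.text.trim());
      } catch (_) {
        return null; // no hint block (e.g. an older server version)
      }
    }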
@@ -610,6 +657,39 @@ function writeResults(directoryUrl, stores, csvText) {
   return { json: `${base}.json`, csv: `${base}.csv`, dir: outDir };
 }

+/**
+ * Append CSV rows (header-stripped) to an existing CSV file.
+ * Validates that the target exists — otherwise the caller chained without
+ * a prior fresh batch, which would be a usage error.
+ */
+function appendRowsToCSV(csvPath, rowsOnlyText) {
+  if (!fs.existsSync(csvPath)) {
+    throw new Error(`append_to path does not exist: ${csvPath}. The first batch must run without append_to to create the file.`);
+  }
+  fs.appendFileSync(csvPath, rowsOnlyText);
+}
+
+/**
+ * Append stores to an existing JSON array file (which holds prior batches).
+ * Reads the file, parses, concats, rewrites. OK for the sizes we deal with.
+ */
+function appendStoresToJSON(jsonPath, stores) {
+  let existing = [];
+  if (fs.existsSync(jsonPath)) {
+    try {
+      const raw = fs.readFileSync(jsonPath, 'utf8');
+      const parsed = JSON.parse(raw);
+      if (Array.isArray(parsed)) existing = parsed;
+    } catch (_) {
+      // If parse fails, start a sibling .partial file rather than overwriting.
+      const partial = jsonPath.replace(/\.json$/, '.partial.json');
+      fs.writeFileSync(partial, JSON.stringify(stores, null, 2));
+      return;
+    }
+  }
+  fs.writeFileSync(jsonPath, JSON.stringify([...existing, ...stores], null, 2));
+}
+
 function errorResult(message) {
   return { isError: true, content: [{ type: 'text', text: message }] };
 }
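
A continuation batch reduces to these two helpers: appendRowsToCSV enforces the "first batch creates the file" rule, and appendStoresToJSON keeps the sibling JSON in sync via a full read-concat-rewrite (linear work per batch, negligible at directory scale). A usage sketch with a hypothetical path and stub batch:

    // What the append branch in handleScrapeDirectory effectively does.
    const csvPath = '/tmp/extracted_stores/grasslands.csv'; // hypothetical
    const rows = storesToCSV(batchStores, { rowsOnly: true });
    appendRowsToCSV(csvPath, rows); // throws if batch 1 never created the file
    appendStoresToJSON(csvPath.replace(/\.csv$/, '.json'), batchStores);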
package/src/storeModel.js CHANGED
@@ -124,7 +124,16 @@ function csvCell(val, { alwaysQuote = false } = {}) {
  * Booleans and numeric fields are left unquoted so destination systems can
  * type-detect them.
  */
-function storesToCSV(stores, { lineEnding = '\r\n', bom = true, alwaysQuoteStrings = true } = {}) {
+/**
+ * Serialize stores to CSV.
+ *
+ * Options:
+ *  - rowsOnly: skip the BOM + header line (for appending to an existing CSV)
+ *  - lineEnding, bom, alwaysQuoteStrings: as before
+ */
+function storesToCSV(stores, {
+  lineEnding = '\r\n', bom = true, alwaysQuoteStrings = true, rowsOnly = false,
+} = {}) {
   const csvFields = STORE_FIELDS.filter(f => !CSV_EXCLUDE_FIELDS.has(f));
   const formatCell = (field, val) => {
     if (BOOLEAN_FIELDS.has(field) || NUMERIC_FIELDS.has(field)) {
@@ -136,11 +145,13 @@ function storesToCSV(stores, { lineEnding = '\r\n', bom = true, alwaysQuoteStrin
     .map(f => csvCell(f, { alwaysQuote: alwaysQuoteStrings }))
     .join(',');
   if (!stores || stores.length === 0) {
+    if (rowsOnly) return '';
     return (bom ? '\uFEFF' : '') + headerLine + lineEnding;
   }
   const rows = stores.map(store =>
     csvFields.map(f => formatCell(f, store[f] == null ? '' : store[f])).join(',')
   );
+  if (rowsOnly) return rows.join(lineEnding) + lineEnding;
   return (bom ? '\uFEFF' : '') + [headerLine, ...rows].join(lineEnding) + lineEnding;
 }
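
The two rowsOnly guards give the append path clean edge cases: an empty continuation batch appends nothing, while a fresh empty scrape still produces a header-only file. Illustrative expectations (stubStores is a placeholder; columns come from STORE_FIELDS):

    // Illustrative expectations for the rowsOnly option.
    storesToCSV([]);                             // BOM + header line + CRLF
    storesToCSV([], { rowsOnly: true });         // '' (nothing to append)
    storesToCSV(stubStores);                     // BOM + header + data rows
    storesToCSV(stubStores, { rowsOnly: true }); // data rows only, CRLF-terminated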