mallmaverick-store-scraper 0.1.5 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/mcp-server.js +132 -52
- package/src/storeModel.js +12 -1
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mallmaverick-store-scraper",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"description": "MCP server + CLI for scraping shopping mall store directories. Hours-first layered pipeline + image classification.",
|
|
5
5
|
"main": "src/main.js",
|
|
6
6
|
"type": "commonjs",
|
package/src/mcp-server.js
CHANGED
|
@@ -56,15 +56,22 @@ const TOOLS = [
|
|
|
56
56
|
{
|
|
57
57
|
name: 'scrape_directory',
|
|
58
58
|
description:
|
|
59
|
-
'Scrape a shopping-mall store directory
|
|
60
|
-
'
|
|
61
|
-
'when the user wants to capture a directory like ' +
|
|
59
|
+
'Scrape a shopping-mall store directory in batches. Each call processes ' +
|
|
60
|
+
'up to `max_stores` stores starting from `start_offset` (default 30 stores ' +
|
|
61
|
+
'per call). Use when the user wants to capture a directory like ' +
|
|
62
62
|
'https://grasslands.ca/store-directory/.\n\n' +
|
|
63
|
-
'
|
|
64
|
-
'
|
|
65
|
-
'
|
|
66
|
-
'
|
|
67
|
-
'
|
|
63
|
+
'BATCHING — IMPORTANT: Claude Desktop has a ~4-min tool-call timeout. ' +
|
|
64
|
+
'For large directories (>30 stores) you MUST chain multiple calls:\n' +
|
|
65
|
+
' 1. First call: scrape_directory(directory_url) — uses defaults ' +
|
|
66
|
+
' (max_stores=30, start_offset=0). Save the returned `csv_file_path`.\n' +
|
|
67
|
+
' 2. If the response has `is_complete=false`, IMMEDIATELY call again with ' +
|
|
68
|
+
' start_offset=<next_offset from response> AND append_to=<csv_file_path> ' +
|
|
69
|
+
' so all batches merge into the same CSV file.\n' +
|
|
70
|
+
' 3. Repeat until is_complete=true. Then announce the single final file.\n\n' +
|
|
71
|
+
'AFTER EACH BATCH: reply with ONE short sentence — the batch count and ' +
|
|
72
|
+
'overall progress (e.g. "Batch 2 done — 60/120 stores"). Do NOT paste ' +
|
|
73
|
+
'CSV text or JSON. The CSV is attached as a resource_link. If the tool ' +
|
|
74
|
+
'response includes an error block, surface that error verbatim.',
|
|
68
75
|
inputSchema: {
|
|
69
76
|
type: 'object',
|
|
70
77
|
properties: {
|
|
@@ -74,8 +81,17 @@ const TOOLS = [
|
|
|
74
81
|
},
|
|
75
82
|
max_stores: {
|
|
76
83
|
type: 'number',
|
|
77
|
-
description: 'Max
|
|
78
|
-
default:
|
|
84
|
+
description: 'Max stores to scrape in THIS batch (0 = scrape all in this batch — only safe for ≤30 known small dirs). Default 30.',
|
|
85
|
+
default: 30,
|
|
86
|
+
},
|
|
87
|
+
start_offset: {
|
|
88
|
+
type: 'number',
|
|
89
|
+
description: 'Skip the first N discovered stores. Use this with append_to to chain batches. Default 0.',
|
|
90
|
+
default: 0,
|
|
91
|
+
},
|
|
92
|
+
append_to: {
|
|
93
|
+
type: 'string',
|
|
94
|
+
description: 'Absolute path to a CSV file produced by a previous batch. When set, this batch\'s rows are appended (no duplicate header) so all batches merge into one file. Get this value from the previous batch\'s `csv_file_path` response field.',
|
|
79
95
|
},
|
|
80
96
|
concurrency: {
|
|
81
97
|
type: 'number',
|
|
@@ -87,11 +103,6 @@ const TOOLS = [
|
|
|
87
103
|
description: 'OpenAI model. Default gpt-5.4-mini.',
|
|
88
104
|
default: 'gpt-5.4-mini',
|
|
89
105
|
},
|
|
90
|
-
write_csv: {
|
|
91
|
-
type: 'boolean',
|
|
92
|
-
description: 'Also write a CSV + JSON to extracted_stores/. Default true.',
|
|
93
|
-
default: true,
|
|
94
|
-
},
|
|
95
106
|
},
|
|
96
107
|
required: ['directory_url'],
|
|
97
108
|
},
|
|
@@ -138,7 +149,7 @@ const TOOLS = [
|
|
|
138
149
|
},
|
|
139
150
|
];
|
|
140
151
|
|
|
141
|
-
const PACKAGE_VERSION = '0.1.5';
|
|
152
|
+
const PACKAGE_VERSION = '0.2.0';
|
|
142
153
|
|
|
143
154
|
const server = new Server(
|
|
144
155
|
{ name: 'mall-scraper-mcp', version: PACKAGE_VERSION },
|
|
@@ -167,7 +178,10 @@ server.setRequestHandler(CallToolRequestSchema, async (req) => {
|
|
|
167
178
|
// Tool implementations
|
|
168
179
|
// ---------------------------------------------------------------------------
|
|
169
180
|
|
|
170
|
-
async function handleScrapeDirectory({
|
|
181
|
+
async function handleScrapeDirectory({
|
|
182
|
+
directory_url, max_stores = 30, start_offset = 0, append_to,
|
|
183
|
+
concurrency = 2, model = 'gpt-5.4-mini',
|
|
184
|
+
}) {
|
|
171
185
|
if (!directory_url) return errorResult('directory_url is required');
|
|
172
186
|
const creds = describeCredentials();
|
|
173
187
|
if (creds.mode === 'none') {
|
|
@@ -182,21 +196,24 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren
|
|
|
182
196
|
const extractor = new StoreExtractor({ client, model, useVision: false, logger });
|
|
183
197
|
const conc = Math.min(5, Math.max(1, parseInt(concurrency, 10) || 2));
|
|
184
198
|
const max = Math.max(0, parseInt(max_stores, 10) || 0);
|
|
199
|
+
const offset = Math.max(0, parseInt(start_offset, 10) || 0);
|
|
200
|
+
const isAppending = !!append_to;
|
|
185
201
|
|
|
186
202
|
try {
|
|
187
203
|
const mallRoot = new URL(directory_url).origin;
|
|
188
204
|
const mallContext = await getMallContext(browser, mallRoot);
|
|
189
205
|
const { storeUrls: allUrls, logoMap } = await discoverStores(browser, directory_url, logger);
|
|
190
206
|
const storeCardLogos = Array.from(logoMap.values());
|
|
191
|
-
|
|
207
|
+
|
|
208
|
+
const totalAvailable = allUrls.length;
|
|
209
|
+
const sliced = allUrls.slice(offset, max > 0 ? offset + max : undefined);
|
|
192
210
|
|
|
193
211
|
const stores = [];
|
|
194
|
-
|
|
195
|
-
//
|
|
196
|
-
// less useful here than a clear per-store progress trail in the result).
|
|
212
|
+
// mm_id reflects position in the OVERALL directory (offset + index), so
|
|
213
|
+
// ids are unique across all merged batches.
|
|
197
214
|
const pLimit = require('p-limit')(conc);
|
|
198
|
-
const tasks =
|
|
199
|
-
const myId =
|
|
215
|
+
const tasks = sliced.map((url, idx) => pLimit(async () => {
|
|
216
|
+
const myId = offset + idx + 1;
|
|
200
217
|
const directoryLogoUrl = logoMap.get(url.replace(/\/+$/, '').toLowerCase()) || null;
|
|
201
218
|
const store = await scrapeOneStore({
|
|
202
219
|
url, mmId: myId, browser, client, model, extractor,
|
|
@@ -208,19 +225,28 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren
|
|
|
208
225
|
await Promise.all(tasks);
|
|
209
226
|
stores.sort((a, b) => a.mm_id - b.mm_id);
|
|
210
227
|
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
const
|
|
228
|
+
const extractedInThisCall = stores.length;
|
|
229
|
+
const nextOffset = offset + sliced.length;
|
|
230
|
+
const isComplete = nextOffset >= totalAvailable;
|
|
214
231
|
|
|
232
|
+
// Two write modes:
|
|
233
|
+
// - Appending to a prior batch's file (no BOM, no header, rows only)
|
|
234
|
+
// - Fresh file (full CSV with BOM + header)
|
|
215
235
|
let writtenPaths = null;
|
|
216
236
|
let writeError = null;
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
237
|
+
try {
|
|
238
|
+
if (isAppending) {
|
|
239
|
+
const rowsOnly = storesToCSV(stores, { rowsOnly: true });
|
|
240
|
+
appendRowsToCSV(append_to, rowsOnly);
|
|
241
|
+
const jsonPath = append_to.replace(/\.csv$/, '.json');
|
|
242
|
+
appendStoresToJSON(jsonPath, stores);
|
|
243
|
+
writtenPaths = { json: jsonPath, csv: append_to, dir: path.dirname(append_to) };
|
|
244
|
+
} else {
|
|
245
|
+
const fullCsv = storesToCSV(stores);
|
|
246
|
+
writtenPaths = writeResults(directory_url, stores, fullCsv);
|
|
223
247
|
}
|
|
248
|
+
} catch (err) {
|
|
249
|
+
writeError = err.message;
|
|
224
250
|
}
|
|
225
251
|
|
|
226
252
|
const bySource = {};
|
|
@@ -232,25 +258,22 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren
|
|
|
232
258
|
|
|
233
259
|
const summary = {
|
|
234
260
|
directory_url,
|
|
235
|
-
|
|
261
|
+
total_available: totalAvailable,
|
|
262
|
+
extracted_in_this_call: extractedInThisCall,
|
|
263
|
+
start_offset: offset,
|
|
264
|
+
next_offset: isComplete ? null : nextOffset,
|
|
265
|
+
is_complete: isComplete,
|
|
266
|
+
csv_file_path: writtenPaths ? writtenPaths.csv : null,
|
|
236
267
|
hours_layer_breakdown: bySource,
|
|
237
268
|
llm_usage: usage,
|
|
238
269
|
llm_failed: usage.errors > 0
|
|
239
|
-
? `⚠ ${usage.errors} LLM calls failed
|
|
270
|
+
? `⚠ ${usage.errors} LLM calls failed. Last error: ${usage.lastError}. Run check_status to diagnose.`
|
|
240
271
|
: null,
|
|
241
|
-
written_files: writtenPaths,
|
|
242
272
|
write_error: writeError,
|
|
243
273
|
auth_mode: creds.mode,
|
|
244
274
|
mcp_version: PACKAGE_VERSION,
|
|
245
275
|
};
|
|
246
276
|
|
|
247
|
-
// Response design:
|
|
248
|
-
// 1. Brief status line (always) — what the user sees in the chat reply
|
|
249
|
-
// 2. resource_link to the CSV — file attachment with user-priority annotations
|
|
250
|
-
// 3. ONLY on error: a loud error block so the user knows something failed
|
|
251
|
-
//
|
|
252
|
-
// No JSON dump / no inline CSV preview when things succeed — keeps the chat
|
|
253
|
-
// reply minimal.
|
|
254
277
|
const host = new URL(directory_url).hostname.replace(/^www\./, '');
|
|
255
278
|
const csvFilename = writtenPaths
|
|
256
279
|
? path.basename(writtenPaths.csv)
|
|
@@ -263,10 +286,22 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren
|
|
|
263
286
|
const hasWriteFailure = !!writeError;
|
|
264
287
|
const anyFailure = hasLlmFailure || hasWriteFailure;
|
|
265
288
|
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
289
|
+
// Brief differs depending on whether this is the final batch or a continuation
|
|
290
|
+
const progress = `${offset + extractedInThisCall}/${totalAvailable}`;
|
|
291
|
+
const versionTag = `mall-scraper-mcp v${PACKAGE_VERSION}`;
|
|
292
|
+
let brief;
|
|
293
|
+
if (isComplete) {
|
|
294
|
+
brief =
|
|
295
|
+
`✅ ${versionTag}\n` +
|
|
296
|
+
`Done — ${progress} stores from ${host}\n` +
|
|
297
|
+
(writtenPaths ? `📄 ${writtenPaths.csv}` : '⚠ Disk write failed');
|
|
298
|
+
} else {
|
|
299
|
+
brief =
|
|
300
|
+
`⏳ ${versionTag}\n` +
|
|
301
|
+
`Batch done — ${progress} stores from ${host}\n` +
|
|
302
|
+
(writtenPaths ? `📄 (in progress) ${writtenPaths.csv}` : '⚠ Disk write failed') + '\n' +
|
|
303
|
+
`→ More to scrape. Call again with start_offset=${nextOffset} and append_to=${writtenPaths ? writtenPaths.csv : '<csv path>'}`;
|
|
304
|
+
}
|
|
270
305
|
|
|
271
306
|
const content = [
|
|
272
307
|
{
|
|
@@ -276,21 +311,19 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren
|
|
|
276
311
|
},
|
|
277
312
|
];
|
|
278
313
|
|
|
279
|
-
// resource_link only if we have a real file path (file:// URI must point at
|
|
280
|
-
// an existing file for the client to do anything useful with it).
|
|
281
314
|
if (csvUri) {
|
|
282
315
|
content.push({
|
|
283
316
|
type: 'resource_link',
|
|
284
317
|
uri: csvUri,
|
|
285
318
|
name: csvFilename,
|
|
286
|
-
description:
|
|
319
|
+
description: isComplete
|
|
320
|
+
? `Final CSV — ${totalAvailable} stores from ${host}`
|
|
321
|
+
: `Partial CSV (${progress}) — more batches coming`,
|
|
287
322
|
mimeType: 'text/csv',
|
|
288
|
-
annotations: { audience: ['user'], priority: 0.
|
|
323
|
+
annotations: { audience: ['user'], priority: isComplete ? 1.0 : 0.5 },
|
|
289
324
|
});
|
|
290
325
|
}
|
|
291
326
|
|
|
292
|
-
// Loud error block — only when something failed. The user explicitly asked
|
|
293
|
-
// for nothing other than a status rundown UNLESS something broke.
|
|
294
327
|
if (anyFailure) {
|
|
295
328
|
const errLines = [];
|
|
296
329
|
if (hasLlmFailure) {
|
|
@@ -310,6 +343,20 @@ async function handleScrapeDirectory({ directory_url, max_stores = 10, concurren
|
|
|
310
343
|
});
|
|
311
344
|
}
|
|
312
345
|
|
|
346
|
+
// Include a small machine-readable hint so Claude can grab next_offset
|
|
347
|
+
// reliably without parsing the brief.
|
|
348
|
+
content.push({
|
|
349
|
+
type: 'text',
|
|
350
|
+
text: '\n' + JSON.stringify({
|
|
351
|
+
is_complete: isComplete,
|
|
352
|
+
next_offset: isComplete ? null : nextOffset,
|
|
353
|
+
csv_file_path: writtenPaths ? writtenPaths.csv : null,
|
|
354
|
+
total_available: totalAvailable,
|
|
355
|
+
extracted_in_this_call: extractedInThisCall,
|
|
356
|
+
}, null, 2),
|
|
357
|
+
annotations: { audience: ['assistant'], priority: 0.4 },
|
|
358
|
+
});
|
|
359
|
+
|
|
313
360
|
return { content };
|
|
314
361
|
} finally {
|
|
315
362
|
try { await browser.close(); } catch (_) {}
|
|
@@ -610,6 +657,39 @@ function writeResults(directoryUrl, stores, csvText) {
|
|
|
610
657
|
return { json: `${base}.json`, csv: `${base}.csv`, dir: outDir };
|
|
611
658
|
}
|
|
612
659
|
|
|
660
|
+
/**
|
|
661
|
+
* Append CSV rows (header-stripped) to an existing CSV file.
|
|
662
|
+
* Validates that the target exists — otherwise the caller chained without
|
|
663
|
+
* a prior fresh batch, which would be a usage error.
|
|
664
|
+
*/
|
|
665
|
+
function appendRowsToCSV(csvPath, rowsOnlyText) {
|
|
666
|
+
if (!fs.existsSync(csvPath)) {
|
|
667
|
+
throw new Error(`append_to path does not exist: ${csvPath}. The first batch must run without append_to to create the file.`);
|
|
668
|
+
}
|
|
669
|
+
fs.appendFileSync(csvPath, rowsOnlyText);
|
|
670
|
+
}
|
|
671
|
+
|
|
672
|
+
/**
|
|
673
|
+
* Append stores to an existing JSON array file (which holds prior batches).
|
|
674
|
+
* Reads the file, parses, concats, rewrites. OK for the sizes we deal with.
|
|
675
|
+
*/
|
|
676
|
+
function appendStoresToJSON(jsonPath, stores) {
|
|
677
|
+
let existing = [];
|
|
678
|
+
if (fs.existsSync(jsonPath)) {
|
|
679
|
+
try {
|
|
680
|
+
const raw = fs.readFileSync(jsonPath, 'utf8');
|
|
681
|
+
const parsed = JSON.parse(raw);
|
|
682
|
+
if (Array.isArray(parsed)) existing = parsed;
|
|
683
|
+
} catch (_) {
|
|
684
|
+
// If parse fails, start a sibling .partial file rather than overwriting.
|
|
685
|
+
const partial = jsonPath.replace(/\.json$/, '.partial.json');
|
|
686
|
+
fs.writeFileSync(partial, JSON.stringify(stores, null, 2));
|
|
687
|
+
return;
|
|
688
|
+
}
|
|
689
|
+
}
|
|
690
|
+
fs.writeFileSync(jsonPath, JSON.stringify([...existing, ...stores], null, 2));
|
|
691
|
+
}
|
|
692
|
+
|
|
613
693
|
function errorResult(message) {
|
|
614
694
|
return { isError: true, content: [{ type: 'text', text: message }] };
|
|
615
695
|
}
|
package/src/storeModel.js
CHANGED
|
@@ -124,7 +124,16 @@ function csvCell(val, { alwaysQuote = false } = {}) {
|
|
|
124
124
|
* Booleans and numeric fields are left unquoted so destination systems can
|
|
125
125
|
* type-detect them.
|
|
126
126
|
*/
|
|
127
|
-
|
|
127
|
+
/**
|
|
128
|
+
* Serialize stores to CSV.
|
|
129
|
+
*
|
|
130
|
+
* Options:
|
|
131
|
+
* - rowsOnly: skip the BOM + header line (for appending to an existing CSV)
|
|
132
|
+
* - lineEnding, bom, alwaysQuoteStrings: as before
|
|
133
|
+
*/
|
|
134
|
+
function storesToCSV(stores, {
|
|
135
|
+
lineEnding = '\r\n', bom = true, alwaysQuoteStrings = true, rowsOnly = false,
|
|
136
|
+
} = {}) {
|
|
128
137
|
const csvFields = STORE_FIELDS.filter(f => !CSV_EXCLUDE_FIELDS.has(f));
|
|
129
138
|
const formatCell = (field, val) => {
|
|
130
139
|
if (BOOLEAN_FIELDS.has(field) || NUMERIC_FIELDS.has(field)) {
|
|
@@ -136,11 +145,13 @@ function storesToCSV(stores, { lineEnding = '\r\n', bom = true, alwaysQuoteStrin
|
|
|
136
145
|
.map(f => csvCell(f, { alwaysQuote: alwaysQuoteStrings }))
|
|
137
146
|
.join(',');
|
|
138
147
|
if (!stores || stores.length === 0) {
|
|
148
|
+
if (rowsOnly) return '';
|
|
139
149
|
return (bom ? '\ufeff' : '') + headerLine + lineEnding;
|
|
140
150
|
}
|
|
141
151
|
const rows = stores.map(store =>
|
|
142
152
|
csvFields.map(f => formatCell(f, store[f] == null ? '' : store[f])).join(',')
|
|
143
153
|
);
|
|
154
|
+
if (rowsOnly) return rows.join(lineEnding) + lineEnding;
|
|
144
155
|
return (bom ? '\ufeff' : '') + [headerLine, ...rows].join(lineEnding) + lineEnding;
|
|
145
156
|
}
|
|
146
157
|
|