mallmaverick-store-scraper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +225 -0
- package/package.json +41 -0
- package/src/brandSiteFallback.js +272 -0
- package/src/browser.js +234 -0
- package/src/deterministic.js +235 -0
- package/src/discovery.js +298 -0
- package/src/externalFollow.js +89 -0
- package/src/hoursParser.js +313 -0
- package/src/hoursPipeline.js +151 -0
- package/src/imageExtraction.js +331 -0
- package/src/llmExtract.js +99 -0
- package/src/logoExtraction.js +130 -0
- package/src/main.js +330 -0
- package/src/mallContext.js +201 -0
- package/src/mcp-server.js +425 -0
- package/src/openai-proxy.js +52 -0
- package/src/output.js +21 -0
- package/src/retryStrategy.js +60 -0
- package/src/storeExtractor.js +239 -0
- package/src/storeModel.js +147 -0
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { URL } = require('url');
|
|
4
|
+
|
|
5
|
+
/**
 * Extract candidate logo image URLs from a store detail page.
 * Returns a ranked, de-duplicated list (highest priority first, at most 5).
 *
 * Priority:
 *   1 — img with "logo" in src/data-src/data-lazy-src/alt/class/id/title
 *   2 — og:image meta
 *   3 — img inside header/banner/store-logo style containers
 *
 * JSON-LD images are not collected by this function.
 *
 * @param {object} page - Page object exposing `evaluate(fn, arg)`; the callback
 *   reads the global `document`.
 * @param {string} storeUrl - URL of the store page; its origin prefixes root-relative paths.
 * @returns {Promise<string[]>} Absolute http(s) URLs, or [] when evaluation fails.
 */
async function extractLogoImages(page, storeUrl) {
  let origin = '';
  try {
    origin = new URL(storeUrl).origin;
  } catch (_) {
    // Unparseable storeUrl: relative paths are resolved against the document instead.
  }

  try {
    return await page.evaluate((origin) => {
      // Turn an attribute value into an absolute http(s) URL, or null if unusable.
      const resolve = (src) => {
        if (!src) return null;
        const value = String(src).trim();
        if (!value || /^data:/i.test(value)) return null;
        if (/^https?:\/\//i.test(value)) return value;
        if (value.startsWith('//')) return 'https:' + value;
        if (value.startsWith('/') && origin) return origin + value;
        // Document-relative paths (e.g. "images/logo.png") resolve against the page itself.
        try {
          const absolute = new URL(value, document.baseURI);
          return /^https?:$/.test(absolute.protocol) ? absolute.href : null;
        } catch (_) {
          return null;
        }
      };

      // Try each source attribute in turn: a lazy-loaded image may carry a data:
      // placeholder in `src` while the real URL sits in `data-src`/`data-lazy-src`.
      const SOURCE_ATTRS = ['src', 'data-src', 'data-lazy-src'];
      const firstUsableSrc = (img) => {
        for (const attr of SOURCE_ATTRS) {
          const resolved = resolve(img.getAttribute(attr));
          if (resolved) return resolved;
        }
        return null;
      };

      const candidates = [];

      // P1: img with "logo" anywhere in its identifying attributes
      const HINT_ATTRS = ['src', 'data-src', 'alt', 'class', 'id', 'title', 'data-lazy-src'];
      document.querySelectorAll('img').forEach((img) => {
        const haystack = HINT_ATTRS
          .map((attr) => img.getAttribute(attr) || '')
          .join(' ')
          .toLowerCase();
        if (!haystack.includes('logo')) return;
        const resolved = firstUsableSrc(img);
        if (resolved) candidates.push({ url: resolved, priority: 1 });
      });

      // P2: og:image
      const og = document.querySelector('meta[property="og:image"]');
      if (og) {
        const resolved = resolve(og.getAttribute('content'));
        if (resolved) candidates.push({ url: resolved, priority: 2 });
      }

      // P3: header / store-logo / brand-logo containers
      const containerSelectors = [
        'header img', '.store-logo img', '.brand-logo img',
        '[class*="logo"] img', '[id*="logo"] img',
        '.store-header img', '.store-profile img', '.store-banner img',
        '.tenant-logo img', '.retailer-logo img',
      ];
      for (const sel of containerSelectors) {
        document.querySelectorAll(sel).forEach((img) => {
          const resolved = firstUsableSrc(img);
          if (resolved) candidates.push({ url: resolved, priority: 3 });
        });
      }

      // Candidates were pushed in priority order, so keeping the first occurrence
      // of each URL keeps its best priority; the sort is then a stable no-op safeguard.
      const seen = new Set();
      return candidates
        .filter((c) => {
          if (seen.has(c.url)) return false;
          seen.add(c.url);
          return true;
        })
        .sort((a, b) => a.priority - b.priority)
        .map((c) => c.url)
        .slice(0, 5);
    }, origin);
  } catch (_) {
    // Best-effort: a failed evaluation simply yields no candidates.
    return [];
  }
}
|
|
81
|
+
|
|
82
|
+
/**
 * Pick the best logo image given (directoryLogoUrl, pageLogoCandidates, jsonLd).
 *
 * Preference order: the directory logo, then the first image found in the
 * JSON-LD node(s), then the first page candidate. A URL that
 * looksLikePlaceholder() flags is skipped at every tier.
 *
 * @param {object} [sources]
 * @param {string} [sources.directoryLogoUrl] - Logo URL taken from the directory listing.
 * @param {string[]} [sources.pageLogoCandidates] - Ranked candidate URLs from the store page.
 * @param {object|object[]} [sources.jsonLd] - Parsed JSON-LD node or list of nodes.
 * @returns {string} The chosen URL or ''.
 */
function pickBestLogo({ directoryLogoUrl, pageLogoCandidates, jsonLd } = {}) {
  if (directoryLogoUrl && !looksLikePlaceholder(directoryLogoUrl)) return directoryLogoUrl;

  // JSON-LD image (Schema.org LocalBusiness.image); accept a lone node as well as a list.
  if (jsonLd) {
    const nodes = Array.isArray(jsonLd) ? jsonLd : [jsonLd];
    for (const node of nodes) {
      const img = imageFromJsonLd(node);
      if (img && !looksLikePlaceholder(img)) return img;
    }
  }

  if (Array.isArray(pageLogoCandidates)) {
    const usable = pageLogoCandidates.find((url) => url && !looksLikePlaceholder(url));
    if (usable) return usable;
  }

  return '';
}
|
|
103
|
+
|
|
104
|
+
/**
 * Find an image URL inside a parsed JSON-LD node.
 *
 * Looks at `image` first, then `logo`, then recurses into `@graph`. Arrays of
 * nodes are searched in order and the first hit wins.
 *
 * @param {*} node - Parsed JSON-LD value (object, array of objects, or anything else).
 * @returns {string|null} The first image URL found, or null.
 */
function imageFromJsonLd(node) {
  if (!node || typeof node !== 'object') return null;
  if (Array.isArray(node)) {
    for (const child of node) {
      const found = imageFromJsonLd(child);
      if (found) return found;
    }
    return null;
  }
  return (
    jsonLdImageUrl(node.image) ||
    jsonLdImageUrl(node.logo) ||
    (node['@graph'] ? imageFromJsonLd(node['@graph']) : null)
  );
}

/**
 * Read a URL out of a JSON-LD image/logo value: a plain string, an object with
 * a string `url` or `contentUrl`, or an array of either. Returns null when no
 * usable string is present so the caller can fall through to its next source.
 */
function jsonLdImageUrl(value) {
  if (!value) return null;
  if (typeof value === 'string') return value;
  if (Array.isArray(value)) {
    for (const entry of value) {
      const url = jsonLdImageUrl(entry);
      if (url) return url;
    }
    return null;
  }
  if (typeof value === 'object') {
    if (typeof value.url === 'string' && value.url) return value.url;
    if (typeof value.contentUrl === 'string' && value.contentUrl) return value.contentUrl;
  }
  return null;
}
|
|
124
|
+
|
|
125
|
+
/**
 * Heuristic: is this URL missing, or does it contain a word that suggests a
 * placeholder/spacer/blank/default image?
 *
 * @param {string} url - Candidate image URL.
 * @returns {boolean} true for a falsy URL or one matching the placeholder pattern.
 */
function looksLikePlaceholder(url) {
  const placeholderHint = /placeholder|spacer|blank|default[-_]?(image|logo)/i;
  return !url || placeholderHint.test(url);
}
|
|
129
|
+
|
|
130
|
+
module.exports = { extractLogoImages, pickBestLogo };
|
package/src/main.js
ADDED
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
require('dotenv').config();
|
|
4
|
+
|
|
5
|
+
const fs = require('fs');
|
|
6
|
+
const path = require('path');
|
|
7
|
+
const { URL } = require('url');
|
|
8
|
+
const readline = require('readline-sync');
|
|
9
|
+
const pLimit = require('p-limit');
|
|
10
|
+
const { createOpenAIClient, describeCredentials } = require('./openai-proxy');
|
|
11
|
+
|
|
12
|
+
const {
|
|
13
|
+
launchBrowser, newPage, loadPageWithStrategy,
|
|
14
|
+
attachXhrInterceptor, captureScreenshot, sleep,
|
|
15
|
+
} = require('./browser');
|
|
16
|
+
const { discoverStores } = require('./discovery');
|
|
17
|
+
const { getMallContext } = require('./mallContext');
|
|
18
|
+
const { extractHours } = require('./hoursPipeline');
|
|
19
|
+
const { classifyImages, pickImages } = require('./imageExtraction');
|
|
20
|
+
const { fetchBrandLogo } = require('./brandSiteFallback');
|
|
21
|
+
const {
|
|
22
|
+
extractPhone, extractSocials, extractWebsite, detectStatusFlags,
|
|
23
|
+
} = require('./deterministic');
|
|
24
|
+
const { StoreExtractor } = require('./storeExtractor');
|
|
25
|
+
const { scrapeWithRetry, DEFAULT_THRESHOLD } = require('./retryStrategy');
|
|
26
|
+
const { mergeExtracted, storesToCSV } = require('./storeModel');
|
|
27
|
+
|
|
28
|
+
// Minimal console-backed logger: each level forwards its arguments unchanged
// to the matching console method.
const logger = {
  info(...args) {
    console.log(...args);
  },
  warn(...args) {
    console.warn(...args);
  },
  error(...args) {
    console.error(...args);
  },
};
|
|
33
|
+
|
|
34
|
+
/**
 * Ask a question through `readline.question` and return the trimmed answer.
 *
 * @param {string} label - Text shown before the colon.
 * @param {*} [def] - Default, shown in brackets when defined and returned when
 *   the trimmed answer is empty.
 * @returns {*} The trimmed answer, or `def` when nothing was entered.
 */
function prompt(label, def) {
  const hint = def !== undefined ? ` [${def}]` : '';
  const answer = readline.question(`${label}${hint}: `).trim();
  return answer || def;
}
|
|
38
|
+
|
|
39
|
+
/**
 * Parse the supported command-line flags into an options object.
 *
 * Recognised flags: `--url <v>`, `--model <v>`, `--max <int>`,
 * `--concurrency <int>`, `--threshold <float>`, `--vision`, `--no-vision`.
 * Unknown arguments are ignored. A numeric flag whose value does not parse to a
 * finite number is left unset rather than stored as NaN.
 *
 * @param {string[]} argv - Arguments after the script name.
 * @returns {{url?: string, model?: string, max?: number, concurrency?: number,
 *   threshold?: number, vision?: boolean}}
 */
function parseArgs(argv) {
  const out = {};
  // Only record numeric options that actually parsed.
  const setNumber = (key, raw, parse) => {
    const value = parse(raw);
    if (Number.isFinite(value)) out[key] = value;
  };

  for (let i = 0; i < argv.length; i++) {
    switch (argv[i]) {
      case '--url':
        out.url = argv[++i];
        break;
      case '--model':
        out.model = argv[++i];
        break;
      case '--max':
        setNumber('max', argv[++i], (s) => parseInt(s, 10));
        break;
      case '--concurrency':
        setNumber('concurrency', argv[++i], (s) => parseInt(s, 10));
        break;
      case '--threshold':
        setNumber('threshold', argv[++i], parseFloat);
        break;
      case '--vision':
        out.vision = true;
        break;
      case '--no-vision':
        out.vision = false;
        break;
      default:
        break;
    }
  }
  return out;
}
|
|
53
|
+
|
|
54
|
+
/**
 * CLI entry point.
 *
 * Checks credentials, gathers options (command-line flags first, interactive
 * prompts for anything missing), fetches the mall context, discovers store
 * URLs from the directory page, scrapes each store with bounded concurrency,
 * and writes JSON + CSV output. A checkpoint is written after every store.
 *
 * A store whose scrape rejects is logged and skipped so the remaining stores
 * still complete; in that case the process exit code is set to 1.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the directory URL cannot be parsed.
 */
async function main() {
  const creds = describeCredentials();
  if (creds.mode === 'none') {
    logger.error(
      'No OpenAI credentials. Set MALL_SCRAPER_PROXY_URL+MALL_SCRAPER_TOKEN ' +
      '(via Cloudflare Worker) or OPENAI_API_KEY in .env.'
    );
    process.exit(1);
  }
  logger.info(`🔑 Auth: ${creds.mode} (${creds.endpoint})`);

  const args = parseArgs(process.argv.slice(2));
  const directoryUrl = args.url || prompt('Directory URL', 'https://shopcurrents.ca/directory/a-z-listing/');
  const model = args.model || prompt('Model', 'gpt-5.4-mini');
  // A NaN from a malformed --max would otherwise collapse to 0 ("all stores"); ask instead.
  const maxStoresInput = Number.isFinite(args.max) ? String(args.max) : prompt('Max stores (0 = all)', '5');
  const maxStores = parseInt(maxStoresInput, 10) || 0;
  // Anything below 1 (or unparseable) falls back to the default of 2.
  const requestedConcurrency = parseInt(args.concurrency || prompt('Concurrency', '2'), 10);
  const concurrency = requestedConcurrency >= 1 ? requestedConcurrency : 2;
  const threshold = Number.isFinite(args.threshold) ? args.threshold
    : parseFloat(prompt('Confidence threshold', String(DEFAULT_THRESHOLD))) || DEFAULT_THRESHOLD;
  const useVision = args.vision != null ? args.vision
    : ((prompt('Use Vision (screenshots)?', 'n').toLowerCase().startsWith('y')));

  // Validate the URL and prepare output paths BEFORE launching the browser, so
  // nothing can throw between the launch and the try/finally that closes it.
  let directory;
  try {
    directory = new URL(directoryUrl);
  } catch (_) {
    throw new Error(`Invalid directory URL: ${directoryUrl}`);
  }
  const mallRoot = directory.origin;
  const mallOrigin = mallRoot;

  const client = createOpenAIClient();
  const outDir = path.join(__dirname, '..', 'extracted_stores');
  const screenshotDir = useVision ? path.join(__dirname, '..', 'screenshots') : null;
  fs.mkdirSync(outDir, { recursive: true });
  if (screenshotDir) fs.mkdirSync(screenshotDir, { recursive: true });

  const host = directory.hostname.replace(/^www\./, '');
  const ts = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
  const jsonPath = path.join(outDir, `stores_v5_${host}_${ts}.json`);
  const csvPath = path.join(outDir, `stores_v5_${host}_${ts}.csv`);

  const extractor = new StoreExtractor({ client, model, useVision, logger });

  const browser = await launchBrowser({ headless: true });
  try {
    logger.info(`\n🏬 Mall root: ${mallRoot}`);
    logger.info('→ Fetching mall context (general hours)...');
    const mallContext = await getMallContext(browser, mallRoot);
    if (mallContext.canonical) {
      logger.info(`   ✓ Mall hours via ${mallContext.layer}: ${mallContext.canonical}`);
    } else {
      logger.info('   (no mall hours found — sync-with-mall layer will be skipped)');
    }

    logger.info(`\n📋 Discovering stores at ${directoryUrl}...`);
    const { storeUrls: allUrls, logoMap } = await discoverStores(browser, directoryUrl, logger);
    const storeCardLogos = Array.from(logoMap.values());
    const urlsToScrape = maxStores > 0 ? allUrls.slice(0, maxStores) : allUrls;
    logger.info(`\n→ Scraping ${urlsToScrape.length} stores (concurrency=${concurrency}, threshold=${threshold}, vision=${useVision})\n`);

    const results = [];
    const limit = pLimit(concurrency);
    let done = 0;
    let failed = 0;

    const tasks = urlsToScrape.map((url, idx) =>
      limit(async () => {
        const mmId = idx + 1;
        const normalizedUrl = url.replace(/\/$/, '').toLowerCase();
        const directoryLogoUrl = logoMap.get(normalizedUrl) || null;

        try {
          // Writing by index preserves directory order regardless of completion order.
          results[idx] = await processStoreWithRetry({
            url, mmId, browser, client, model, extractor,
            directoryLogoUrl, mallContext, mallOrigin, storeCardLogos,
            threshold, screenshotDir, useVision, logger,
          });
        } catch (err) {
          // One bad store must not abort the whole run (Promise.all is fail-fast).
          failed++;
          logger.error(`   ✗ [${mmId}] ${url} failed: ${(err && err.message) || err}`);
        }

        done++;
        saveCheckpoint(jsonPath, csvPath, results.filter(Boolean), logger);
        logger.info(`   [done ${done}/${urlsToScrape.length}]`);
      })
    );
    await Promise.all(tasks);

    const stores = results.filter(Boolean);
    saveResults(jsonPath, csvPath, stores, logger);
    printSummary(stores, extractor, jsonPath, csvPath);

    if (failed > 0) {
      logger.warn(`⚠ ${failed} store(s) failed and were skipped`);
      process.exitCode = 1;
    }
  } finally {
    await browser.close();
  }
}
|
|
140
|
+
|
|
141
|
+
/**
 * Scrape one store page (with retries driven by `scrapeWithRetry`) and build
 * its store record.
 *
 * Each attempt opens a fresh page, loads the URL, then runs in order: image
 * classification, the hours pipeline, the deterministic extractors
 * (phone/socials/website/status flags), and the LLM extractor. Image fields
 * prefer the deterministic picks and use LLM values only for empty slots. The
 * page is always closed at the end of the attempt.
 *
 * @param {object} ctx - Everything the scrape needs: `url`, `mmId`, `browser`,
 *   `client`, `model`, `extractor`, `directoryLogoUrl`, `mallContext`,
 *   `mallOrigin`, `storeCardLogos`, `threshold`, `screenshotDir`, `useVision`,
 *   `logger`.
 * @returns {Promise<object|undefined>} The store record with a `_meta` block
 *   attached when one was produced, otherwise whatever `scrapeWithRetry`
 *   reported as `store`.
 */
async function processStoreWithRetry(ctx) {
  const {
    url, mmId, browser, client, model, extractor,
    directoryLogoUrl, mallContext, mallOrigin, storeCardLogos,
    threshold, screenshotDir, useVision, logger,
  } = ctx;

  logger.info(`\n[${mmId}] ${url}`);

  const isGif = (candidate) => /\.gif(\?|$)/i.test(candidate);
  // A missing confidence must not turn the combined score into NaN.
  const finiteOrZero = (n) => (Number.isFinite(n) ? n : 0);

  const runOnce = async (attempt) => {
    const page = await newPage(browser);
    const { interceptedJson } = await attachXhrInterceptor(page, { directoryMode: false });
    try {
      const data = await loadPageWithStrategy(page, url, { attempt });

      const links = await page.evaluate(() => {
        return Array.from(document.querySelectorAll('a[href]')).map(a => ({
          href: a.href, text: (a.innerText || '').trim(),
        })).filter(o => o.href);
      });
      const screenshotBase64 = useVision && screenshotDir
        ? await captureScreenshot(page, url, screenshotDir, attempt)
        : null;

      // Last non-empty path segment of the store URL.
      const urlSlug = (() => {
        try {
          const parts = new URL(url).pathname.replace(/\/$/, '').split('/').filter(Boolean);
          return parts[parts.length - 1] || '';
        } catch { return ''; }
      })();

      const name = data.h1 || slugToName(urlSlug) || data.title || '';

      // --- Image classification (deterministic, runs before LLM) ---
      const rawCandidates = await classifyImages(page, url, {
        storeName: name,
        mallName: mallContext.mallName || '',
        mallEcosystem: mallContext.mallEcosystemDomains || [],
        mallChromeImages: mallContext.mallChromeImages || [],
        storeCardLogos: storeCardLogos || [],
      });
      const imagePicks = pickImages(rawCandidates, { directoryLogoUrl, storeName: name });

      // --- Hours pipeline ---
      const hoursResult = await extractHours({
        url, text: data.text, html: data.html, jsonLd: data.jsonLd, metaTags: data.metaTags, links,
      }, {
        mallContext, client, model, browser, mallOrigin, logger,
      });

      // --- Deterministic extractors ---
      const phone = extractPhone(data.text, data.jsonLd);
      const socials = extractSocials(links, mallContext.mallSocials);
      const website = extractWebsite(links, mallOrigin, name, mallContext.mallEcosystemDomains || []);
      const flagsFromText = detectStatusFlags(data.text);

      // --- LLM extractor (all other fields) ---
      const llmInput = {
        url, urlSlug, h1: data.h1, title: data.title,
        textContent: data.text, jsonLd: data.jsonLd, metaTags: data.metaTags,
        interceptedJson: interceptedJson.slice(0, 3),
        screenshotBase64,
      };
      const { fields: llmFields, confidence: llmConfidence } =
        await extractor.extract(llmInput, hoursResult.canonical);

      // --- Image fields: deterministic classifier wins, LLM only fills empty slots ---
      let logoUrl = imagePicks.logo_image_url || llmFields.logo_image_url || '';

      // Brand-site fallback: if mall-picked logo is empty OR a GIF, try the
      // store's own website. Many CMSes (imgix-based ones especially) refuse
      // to ingest GIFs, so a non-GIF from the brand site is more reliable.
      const logoIsGif = isGif(logoUrl);
      if ((!logoUrl || logoIsGif) && website) {
        try {
          const fallback = await fetchBrandLogo(browser, website, name, { logger });
          if (fallback && fallback.url) {
            // Replace a GIF only with a non-GIF; an empty slot takes whatever was found.
            if (!logoUrl || (logoIsGif && !isGif(fallback.url))) {
              logger.info(`     ↳ brand-site logo: ${fallback.source} → ${fallback.url}`);
              logoUrl = fallback.url;
            }
          }
        } catch (err) {
          logger.warn(`     ⚠ brand-site fallback errored: ${err.message}`);
        }
      }

      const imageFields = {
        logo_image_url: logoUrl,
        brand_image_url: imagePicks.brand_image_url || llmFields.brand_image_url || '',
        store_front_image_url: imagePicks.store_front_image_url || llmFields.store_front_image_url || '',
        logo_image_url_alt_text: imagePicks.logo_image_url_alt_text || '',
        brand_image_url_alt_text: imagePicks.brand_image_url_alt_text || '',
        store_front_image_url_alt_text: imagePicks.store_front_image_url_alt_text || '',
      };

      // --- Merge into final store record ---
      // Later spreads win: LLM fields may override the deterministic values above
      // them, and the image/hours fields below override the LLM.
      const merged = {
        name,
        website,
        phone,
        ...socials,
        ...flagsFromText,
        ...llmFields,
        ...imageFields,
        store_hours: hoursResult.canonical,
        hours_source: hoursResult.source || '',
        hours_confidence: hoursResult.confidence,
        sync_with_centre_hours: hoursResult.sync_with_centre_hours || false,
      };
      const store = mergeExtracted(mmId, merged);

      // Combined confidence — average hours and LLM-fields halves
      const combinedConfidence =
        (finiteOrZero(hoursResult.confidence) + finiteOrZero(llmConfidence)) / 2;

      logger.info(
        `   ✓ "${store.name}" | hours ${hoursResult.source || '(none)'} ${pct(hoursResult.confidence)}` +
        ` | fields ${pct(llmConfidence)} | combined ${pct(combinedConfidence)}`
      );

      return { store, combinedConfidence, hoursResult, llmConfidence };
    } finally {
      try { await page.close(); } catch (_) {}
    }
  };

  const out = await scrapeWithRetry({ runOnce, threshold, logger });
  if (out.store) {
    out.store._meta = {
      source_url: url,
      combined_confidence: out.combinedConfidence,
      hours_confidence: out.hoursResult ? out.hoursResult.confidence : 0,
      llm_confidence: out.llmConfidence || 0,
      strategy: out.strategy,
      attempts: out.attempt,
      needs_review: out.needs_review,
    };
  }
  return out.store;
}
|
|
283
|
+
|
|
284
|
+
/**
 * Turn a URL slug into a display name: "old-navy" → "Old Navy".
 *
 * Percent-escapes are decoded when valid, hyphens and underscores both act as
 * word separators, and empty segments (from doubled or leading/trailing
 * separators) are dropped so the result never contains doubled spaces.
 *
 * @param {string} slug - Last path segment of a store URL.
 * @returns {string} Capitalised, space-separated name; '' for a falsy slug.
 */
function slugToName(slug) {
  if (!slug) return '';
  let text = String(slug);
  try {
    text = decodeURIComponent(text);
  } catch (_) {
    // Malformed escape (e.g. a literal "%"): keep the raw slug.
  }
  return text
    .split(/[-_]+/)
    .filter(Boolean)
    .map((word) => word.charAt(0).toUpperCase() + word.slice(1))
    .join(' ');
}
|
|
288
|
+
|
|
289
|
+
/**
 * Format a 0..1 fraction as a whole-number percentage string.
 * Falsy input (undefined, null, NaN, 0) is treated as 0.
 */
function pct(n) {
  const fraction = n || 0;
  const whole = Math.round(fraction * 100);
  return whole + '%';
}
|
|
290
|
+
|
|
291
|
+
/**
 * Best-effort write of the stores collected so far to the JSON and CSV files.
 *
 * A failed write never throws; instead of being dropped silently it is
 * reported through the logger's `warn` when a logger is supplied.
 *
 * @param {string} jsonPath - Destination for the pretty-printed JSON.
 * @param {string} csvPath - Destination for the CSV produced by storesToCSV.
 * @param {object[]} stores - Store records gathered so far.
 * @param {{warn: Function}} [_logger] - Optional logger for write failures.
 */
function saveCheckpoint(jsonPath, csvPath, stores, _logger) {
  try {
    fs.writeFileSync(jsonPath, JSON.stringify(stores, null, 2));
    fs.writeFileSync(csvPath, storesToCSV(stores));
  } catch (err) {
    if (_logger && typeof _logger.warn === 'function') {
      _logger.warn(`   ⚠ checkpoint write failed: ${(err && err.message) || err}`);
    }
  }
}
|
|
297
|
+
|
|
298
|
+
/**
 * Write the final store list as pretty-printed JSON and as CSV, then log the
 * count and both output paths. Write errors propagate to the caller.
 *
 * @param {string} jsonPath - Destination for the JSON file.
 * @param {string} csvPath - Destination for the CSV file.
 * @param {object[]} stores - Final store records.
 * @param {{info: Function}} logger - Receives the three summary lines.
 */
function saveResults(jsonPath, csvPath, stores, logger) {
  const jsonBody = JSON.stringify(stores, null, 2);
  fs.writeFileSync(jsonPath, jsonBody);

  const csvBody = storesToCSV(stores);
  fs.writeFileSync(csvPath, csvBody);

  const summaryLines = [
    `\n💾 Wrote ${stores.length} stores`,
    `   JSON: ${jsonPath}`,
    `   CSV:  ${csvPath}`,
  ];
  for (const line of summaryLines) {
    logger.info(line);
  }
}
|
|
305
|
+
|
|
306
|
+
/**
 * Log an end-of-run summary: how many stores got their hours from each source,
 * the average hours and field confidences, how many records are flagged for
 * review, and the LLM usage reported by the extractor.
 *
 * @param {object[]} stores - Final store records (with optional `_meta`).
 * @param {{getUsageSummary: Function}} extractor - Source of the usage figures.
 * @param {string} jsonPath - Accepted for call-site symmetry; not used here.
 * @param {string} csvPath - Accepted for call-site symmetry; not used here.
 */
function printSummary(stores, extractor, jsonPath, csvPath) {
  const usage = extractor.getUsageSummary();
  const divisor = Math.max(1, stores.length);

  let needsReview = 0;
  let hoursConfSum = 0;
  let fieldConfSum = 0;
  const bySource = {};

  for (const store of stores) {
    const meta = store._meta;
    if (meta && meta.needs_review) needsReview += 1;

    const source = store.hours_source || '(none)';
    bySource[source] = (bySource[source] || 0) + 1;

    hoursConfSum = hoursConfSum + (store.hours_confidence || 0);
    fieldConfSum = fieldConfSum + ((meta && meta.llm_confidence) || 0);
  }

  const avgHoursConf = hoursConfSum / divisor;
  const avgFieldConf = fieldConfSum / divisor;

  logger.info('\n📊 Hours-extraction layer breakdown:');
  // Most common source first.
  const ranked = Object.entries(bySource).sort((a, b) => b[1] - a[1]);
  ranked.forEach(([source, count]) => {
    logger.info(`   ${source}: ${count}`);
  });
  logger.info(`\n📈 Avg hours conf: ${pct(avgHoursConf)} | Avg field conf: ${pct(avgFieldConf)}`);
  logger.info(`⚠ Needs review: ${needsReview}/${stores.length}`);
  logger.info(`💰 LLM cost: ${usage.estimatedCost} (${usage.totalInputTokens}+${usage.totalOutputTokens} tok)`);
}
|
|
326
|
+
|
|
327
|
+
// Run the CLI. Any rejection is logged and the process exits with status 1.
// The rejection value may not be an Error (or may be null), so it is read
// defensively instead of assuming `.stack` / `.message` exist.
main().catch((err) => {
  const detail = (err && (err.stack || err.message)) || String(err);
  logger.error(detail);
  process.exit(1);
});
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { URL } = require('url');
|
|
4
|
+
const { loadPage, newPage } = require('./browser');
|
|
5
|
+
const {
|
|
6
|
+
parseJsonLdHours,
|
|
7
|
+
parseFreeFormHours,
|
|
8
|
+
canonicalize,
|
|
9
|
+
validateCanonical,
|
|
10
|
+
DAYS,
|
|
11
|
+
} = require('./hoursParser');
|
|
12
|
+
|
|
13
|
+
/**
 * Fetch mall context once per session:
 *   - mall hours (homepage, then /hours/, /mall-hours/, /contact/)
 *   - mall name (homepage meta tags, header logo alt text, page title)
 *   - mall's own social media URLs (external links on the homepage)
 *   - mall's "ecosystem" external domains (hostnames of every external homepage link)
 *   - mall "chrome" images (every image URL on the homepage, plus og:image)
 *
 * The homepage is always visited first; further candidate pages are loaded
 * only until hours have been found. Page-level failures are best-effort and
 * skipped.
 *
 * @param {object} browser - Browser handle passed to `newPage`.
 * @param {string} mallRootUrl - Absolute URL of the mall homepage.
 * @returns {Promise<object>} { canonical, sourceUrl, layer, mallName, mallSocials,
 *   mallEcosystemDomains, mallChromeImages, mallRootUrl }.
 *   `canonical` is '' (and `layer` null) if no hours found.
 */
async function getMallContext(browser, mallRootUrl) {
  const origin = new URL(mallRootUrl).origin;
  const candidates = [mallRootUrl, `${origin}/hours/`, `${origin}/mall-hours/`, `${origin}/contact/`];
  const tried = new Set();
  let hoursResult = null;

  const mallSocials = {
    facebook: '', instagram: '', twitter: '', youtube: '', tiktok: '', pinterest: '',
  };
  const mallEcosystemDomains = new Set();
  let mallName = '';
  const mallChromeImages = new Set();

  const socialPatterns = {
    facebook: /^https?:\/\/(?:www\.|m\.)?facebook\.com\/[^/?]+/i,
    instagram: /^https?:\/\/(?:www\.)?instagram\.com\/[^/?]+/i,
    twitter: /^https?:\/\/(?:www\.)?(?:twitter|x)\.com\/[^/?]+/i,
    youtube: /^https?:\/\/(?:www\.)?youtube\.com\/(?:@|c\/|channel\/|user\/)[^/?]+/i,
    tiktok: /^https?:\/\/(?:www\.)?tiktok\.com\/@[^/?]+/i,
    pinterest: /^https?:\/\/(?:www\.)?pinterest\.[a-z.]+\/[^/?]+/i,
  };

  for (const url of candidates) {
    if (tried.has(url)) continue;
    tried.add(url);
    const page = await newPage(browser);
    try {
      const data = await loadPage(page, url);

      // Hours (any layer): structured JSON-LD first, then a free-form text scan.
      if (!hoursResult) {
        const ld = parseJsonLdHours(data.jsonLd);
        if (ld) {
          const c = canonicalize(ld);
          if (validateCanonical(c).ok) hoursResult = { canonical: c, sourceUrl: data.finalUrl, layer: 'jsonld' };
        }
        if (!hoursResult) {
          const ff = parseFreeFormHours(data.text);
          if (ff) {
            const c = canonicalize(ff);
            if (validateCanonical(c).ok) hoursResult = { canonical: c, sourceUrl: data.finalUrl, layer: 'freeform' };
          }
        }
      }

      // Mall name detection — multiple sources, take the longest meaningful one.
      if (url === mallRootUrl && !mallName) {
        const nameCandidates = [];
        // 1. og:site_name / application-name meta
        if (data.metaTags && data.metaTags['og:site_name']) nameCandidates.push(data.metaTags['og:site_name']);
        if (data.metaTags && data.metaTags['application-name']) nameCandidates.push(data.metaTags['application-name']);
        // 2. Header logo alt text
        try {
          const headerAlt = await page.evaluate(() => {
            const sels = [
              'header img[alt]', 'header [class*="logo"] img[alt]',
              '.site-logo img[alt]', '.site-branding img[alt]',
              'a[class*="logo"] img[alt]', 'a[rel="home"] img[alt]',
            ];
            for (const s of sels) {
              const el = document.querySelector(s);
              if (el && el.getAttribute('alt')) return el.getAttribute('alt');
            }
            return '';
          });
          if (headerAlt) nameCandidates.push(headerAlt);
        } catch (_) {}
        // 3. Page title, suffix-stripped, only if not generic ("Home")
        const t = (data.title || '').trim();
        if (t && !/^home\b/i.test(t)) {
          nameCandidates.push(t.split(/\s*[|\-–—]\s*/)[0].trim());
        }
        // Pick the longest non-generic candidate (at least 4 characters)
        for (const c of nameCandidates) {
          const cl = String(c).trim();
          if (cl.length >= 4 && !/^home$/i.test(cl) && cl.length > mallName.length) mallName = cl;
        }
      }

      // Mall socials + ecosystem + chrome images (all images on the mall homepage are chrome)
      if (url === mallRootUrl) {
        const homepageImages = await page.evaluate(() => {
          const out = new Set();
          document.querySelectorAll('img').forEach(img => {
            const src = img.currentSrc || img.src || img.getAttribute('data-src') || img.getAttribute('data-lazy-src');
            if (src && !src.startsWith('data:')) out.add(src);
          });
          const og = document.querySelector('meta[property="og:image"]');
          if (og && og.getAttribute('content')) out.add(og.getAttribute('content'));
          return Array.from(out);
        });
        for (const u of homepageImages) mallChromeImages.add(u);

        // Every http(s) link on the homepage that points off-origin.
        const externalLinks = await page.evaluate((origin) => {
          const links = Array.from(document.querySelectorAll('a[href]'))
            .map(a => a.href).filter(h => /^https?:/.test(h));
          const externals = links.filter(h => { try { return new URL(h).origin !== origin; } catch { return false; } });
          return externals;
        }, origin);

        for (const ext of externalLinks) {
          try {
            const u = new URL(ext);
            mallEcosystemDomains.add(u.hostname.replace(/^www\./, ''));
          } catch (_) {}
          // First matching link per network wins; query string and fragment are dropped.
          for (const [k, re] of Object.entries(socialPatterns)) {
            if (!mallSocials[k] && re.test(ext)) mallSocials[k] = ext.split(/[?#]/)[0];
          }
        }
      }
    } catch (_) {
      // Best-effort: a page that fails to load or evaluate is skipped.
    } finally {
      try { await page.close(); } catch (_) {}
    }
    // The homepage is always the first candidate, so once hours are known the
    // remaining pages have nothing left to contribute.
    if (hoursResult) break;
  }

  if (!hoursResult) hoursResult = { canonical: '', sourceUrl: '', layer: null };
  return {
    ...hoursResult,
    mallName,
    mallSocials,
    mallEcosystemDomains: Array.from(mallEcosystemDomains),
    mallChromeImages: Array.from(mallChromeImages),
    mallRootUrl,
  };
}
|
|
149
|
+
|
|
150
|
+
/**
 * Load one URL and try to read opening hours from it.
 *
 * JSON-LD is tried first, then a free-form scan of the page text; the first
 * layer whose canonical form validates wins. Any failure (including a failed
 * page load) yields null, and the page is always closed — a failing close is
 * ignored so it cannot replace an already-computed result with a rejection.
 *
 * @param {object} browser - Browser handle passed to `newPage`.
 * @param {string} url - Page to inspect.
 * @returns {Promise<{canonical: *, sourceUrl: *, layer: string}|null>}
 */
async function tryHoursFromUrl(browser, url) {
  const page = await newPage(browser);
  try {
    const data = await loadPage(page, url);

    // Ordered extraction layers; each parser runs only if the previous layer failed.
    const layers = [
      ['jsonld', () => parseJsonLdHours(data.jsonLd)],
      ['freeform', () => parseFreeFormHours(data.text)],
    ];
    for (const [layer, parse] of layers) {
      const parsed = parse();
      if (!parsed) continue;
      const canonical = canonicalize(parsed);
      if (validateCanonical(canonical).ok) {
        return { canonical, sourceUrl: data.finalUrl, layer };
      }
    }
    return null;
  } catch (_) {
    return null;
  } finally {
    try {
      await page.close();
    } catch (_) {
      // Closing is cleanup only; never let it mask the outcome above.
    }
  }
}
|
|
177
|
+
|
|
178
|
+
/**
 * Phrasings on a store page that say the store keeps the mall's hours.
 * Kept deliberately narrow: a false positive is worse than a false negative,
 * because a match causes the mall hours to be applied to the store as-is.
 */
const SYNC_PATTERNS = [
  // "store hours (are) (the) same as (the) mall/centre"
  /store\s+hours?\s+(?:are\s+)?(?:the\s+)?same\s+as\s+(?:the\s+)?(?:mall|cent(?:re|er))/i,
  // "follows / same as / matches (the) mall hours"
  /(?:follows?|same as|match(?:es)?)\s+(?:the\s+)?(?:mall|cent(?:re|er))\s+hours?/i,
  // "open during (regular) mall hours"
  /open\s+during\s+(?:regular\s+)?(?:mall|cent(?:re|er))\s+hours?/i,
  // "hours (are) (the) same as mall"
  /hours?\s+(?:are\s+)?(?:the\s+)?same as (?:mall|cent(?:re|er))/i,
  // "see mall hours"
  /see\s+(?:mall|cent(?:re|er))\s+hours?/i,
  // "mall hours apply"
  /(?:mall|cent(?:re|er))\s+hours?\s+apply/i,
  // "hours follow (the) mall"
  /hours?\s+follow\s+(?:the\s+)?(?:mall|cent(?:re|er))/i,
];
|
|
192
|
+
|
|
193
|
+
/**
 * Report whether the text contains any of the SYNC_PATTERNS phrasings.
 *
 * @param {string} text - Visible text of a store page.
 * @returns {boolean} true when at least one pattern matches; false for empty input.
 */
function detectsSyncWithMall(text) {
  return Boolean(text) && SYNC_PATTERNS.some((pattern) => pattern.test(text));
}
|
|
200
|
+
|
|
201
|
+
module.exports = { getMallContext, detectsSyncWithMall, SYNC_PATTERNS };
|