mallmaverick-store-scraper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +225 -0
- package/package.json +41 -0
- package/src/brandSiteFallback.js +272 -0
- package/src/browser.js +234 -0
- package/src/deterministic.js +235 -0
- package/src/discovery.js +298 -0
- package/src/externalFollow.js +89 -0
- package/src/hoursParser.js +313 -0
- package/src/hoursPipeline.js +151 -0
- package/src/imageExtraction.js +331 -0
- package/src/llmExtract.js +99 -0
- package/src/logoExtraction.js +130 -0
- package/src/main.js +330 -0
- package/src/mallContext.js +201 -0
- package/src/mcp-server.js +425 -0
- package/src/openai-proxy.js +52 -0
- package/src/output.js +21 -0
- package/src/retryStrategy.js +60 -0
- package/src/storeExtractor.js +239 -0
- package/src/storeModel.js +147 -0
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const {
|
|
4
|
+
parseJsonLdHours,
|
|
5
|
+
parseFreeFormHours,
|
|
6
|
+
parseSchemaOpeningHours,
|
|
7
|
+
canonicalize,
|
|
8
|
+
validateCanonical,
|
|
9
|
+
} = require('./hoursParser');
|
|
10
|
+
const { detectsSyncWithMall } = require('./mallContext');
|
|
11
|
+
const { extractHoursWithLLM } = require('./llmExtract');
|
|
12
|
+
const { chooseExternalLink, extractHoursFromExternal } = require('./externalFollow');
|
|
13
|
+
|
|
14
|
+
/**
 * Layered hours extraction.
 *
 * Layers are tried in order (high-to-low confidence); the first layer that
 * produces a canonical string passing validateCanonical() wins:
 *   1. JSON-LD openingHoursSpecification      → 0.98
 *   2. Microdata itemprop="openingHours"      → 0.95
 *   3. DOM patterns (table/dl/ul stereotypes) → 0.90
 *   4. Labeled section + free-form parse      → 0.85
 *   5. "Sync with mall hours" phrase          → 0.80 (uses mallContext)
 *   6. LLM on focused snippet                 → 0.70
 *   7. External-link follow                   → 0.65 (visits retailer site)
 *   8. (Vision)                               → reserved, not implemented in slice 1
 *
 * Inputs (every field is optional; a layer is skipped when its input is missing):
 *   - pageData.jsonLd         - array of JSON-LD blocks (layer 1)
 *   - pageData.microdataHours - pre-extracted microdata value (layer 2)
 *   - pageData.domHoursBlock  - pre-extracted "hours block" text (layer 3)
 *   - pageData.text           - page text (layers 4, 5, 6)
 *   - pageData.links          - page links (layer 7)
 *   - opts.mallContext: { canonical } - mall's own hours, if known (layer 5)
 *   - opts.client, opts.model        - OpenAI client/model for layer 6
 *   - opts.browser, opts.mallOrigin  - for layer 7
 *   - opts.logger
 *
 * Returns:
 *   { canonical, source, confidence, sync_with_centre_hours }
 *   plus `externalUrl` when layer 7 supplied the hours. When no layer
 *   succeeds (or pageData is null/undefined) the result is
 *   { canonical: '', source: null, confidence: 0, sync_with_centre_hours: false }.
 */
async function extractHours(pageData, opts = {}) {
  // `opts = {}` only covers undefined; tolerate an explicit null as well.
  const { mallContext, client, model, browser, mallOrigin, logger } = opts || {};
  // A missing pageData simply means no layer has any input to work with.
  const data = pageData || {};

  // Shared tail of layers 1-4: canonicalize a parser result and keep it only
  // if it validates. Returns null when the layer should be skipped.
  const toValidCanonical = (parsed) => {
    if (!parsed) return null;
    const canonical = canonicalize(parsed);
    return validateCanonical(canonical).ok ? canonical : null;
  };

  // Layer 1: JSON-LD openingHoursSpecification
  if (data.jsonLd && data.jsonLd.length) {
    const canonical = toValidCanonical(parseJsonLdHours(data.jsonLd));
    if (canonical !== null) {
      return { canonical, source: 'jsonld', confidence: 0.98, sync_with_centre_hours: false };
    }
  }

  // Layer 2: Microdata via itemprop in DOM (or meta tags)
  if (data.microdataHours) {
    const canonical = toValidCanonical(parseSchemaOpeningHours(data.microdataHours));
    if (canonical !== null) {
      return { canonical, source: 'microdata', confidence: 0.95, sync_with_centre_hours: false };
    }
  }

  // Layer 3: DOM patterns — the caller can pre-extract a "hours block" string
  // from common selectors. If it was not pre-extracted, this layer is skipped.
  if (data.domHoursBlock) {
    const canonical = toValidCanonical(parseFreeFormHours(data.domHoursBlock));
    if (canonical !== null) {
      return { canonical, source: 'dom', confidence: 0.90, sync_with_centre_hours: false };
    }
  }

  // Layer 4: Labeled section + free-form parse on page text
  if (data.text) {
    const labeled = sliceLabeledHoursSection(data.text);
    if (labeled) {
      const canonical = toValidCanonical(parseFreeFormHours(labeled));
      if (canonical !== null) {
        // Accept if we got 5+ days; many sites omit closed-day rows.
        // NOTE(review): assumes canonical is a ';'-separated list of day entries.
        const dayCount = canonical.split(';').filter(Boolean).length;
        if (dayCount >= 5) {
          return { canonical, source: 'labeled', confidence: 0.85, sync_with_centre_hours: false };
        }
      }
    }
  }

  // Layer 5: Sync with mall hours — reuse the mall's own canonical hours.
  if (mallContext && mallContext.canonical && data.text && detectsSyncWithMall(data.text)) {
    if (logger) logger.info(' ↳ matched "syncs with mall hours" — using mall hours');
    return {
      canonical: mallContext.canonical,
      source: 'sync-with-mall',
      confidence: 0.80,
      sync_with_centre_hours: true,
    };
  }

  // Layer 6: LLM on focused snippet
  if (client && model && data.text) {
    const canonical = await extractHoursWithLLM({ client, model, pageData: data, logger });
    if (canonical) {
      return { canonical, source: 'llm', confidence: 0.70, sync_with_centre_hours: false };
    }
  }

  // Layer 7: External-link follow
  if (browser && data.links && mallOrigin) {
    const chosen = chooseExternalLink(data.links, mallOrigin);
    if (chosen) {
      if (logger) logger.info(` ↳ following external link: ${chosen.href}`);
      const ext = await extractHoursFromExternal(browser, chosen.href, { logger });
      if (ext) {
        return {
          canonical: ext.canonical,
          // Source label is whatever the external extraction reported.
          source: ext.source,
          confidence: 0.65,
          sync_with_centre_hours: false,
          externalUrl: ext.sourceUrl,
        };
      }
    }
  }

  return { canonical: '', source: null, confidence: 0, sync_with_centre_hours: false };
}
|
|
132
|
+
|
|
133
|
+
/**
 * Slice the chunk of text that starts at an "Hours" heading.
 *
 * A heading is the first non-empty line that, once trimmed, ends with one of
 * "hours of operation", "store hours", "opening hours", "business hours" or a
 * bare "hours", optionally followed by a colon (case-insensitive).
 *
 * @param {string} text - page text, one logical line per '\n'
 * @param {{ maxLines?: number }} [options] - maxLines: how many lines to
 *   return, heading included (default 16; non-positive or non-integer values
 *   fall back to 16)
 * @returns {string|null} the heading line plus the lines that follow it
 *   (joined with '\n', untrimmed), or null if no heading-like line is found
 *   or text is not a non-empty string.
 */
function sliceLabeledHoursSection(text, { maxLines = 16 } = {}) {
  if (typeof text !== 'string' || text.length === 0) return null;

  const HEADING = /\b(hours of operation|store hours|opening hours|business hours|hours)\b\s*:?$/i;
  const windowSize = Number.isInteger(maxLines) && maxLines > 0 ? maxLines : 16;

  const lines = text.split('\n');
  // Blank lines can never match HEADING, so no separate skip is needed.
  const headingIndex = lines.findIndex((raw) => HEADING.test(raw.trim()));
  if (headingIndex === -1) return null;

  return lines.slice(headingIndex, headingIndex + windowSize).join('\n');
}
|
|
150
|
+
|
|
151
|
+
module.exports = { extractHours, sliceLabeledHoursSection };
|
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { URL } = require('url');
|
|
4
|
+
|
|
5
|
+
/**
 * Enumerate every <img> on the store page and score it for three slots:
 *   - logoScore  → the store's brand mark
 *   - brandScore → hero / banner / lifestyle image
 *   - frontScore → exterior storefront photo
 *
 * Excludes obvious chrome: map tiles, transparent pixels, mall logos,
 * developer/parent-company logos, splash/popup images, social-share icons,
 * footer imagery and generic banner/hero images with no store-name link.
 * The page's og:image (if any) is appended as an extra low-score candidate.
 *
 * @param {import('puppeteer').Page} page
 * @param {string} storeUrl - used only to derive the origin for root-relative image paths
 * @param {object} ctx
 * @param {string} [ctx.storeName]
 * @param {string} [ctx.mallName]
 * @param {string[]} [ctx.mallEcosystem] - hostnames of sister properties to exclude
 * @param {string[]} [ctx.mallChromeImages] - image URLs seen on the mall homepage
 * @param {string[]} [ctx.storeCardLogos] - known store-card logo URLs; these are
 *   removed from mallChromeImages before filtering
 * @returns {Promise<Array<{url, alt, cls, parentCls, file, w, h, aspect, top, logoScore, brandScore, frontScore}>>}
 */
async function classifyImages(page, storeUrl, {
  storeName = '', mallName = '', mallEcosystem = [],
  mallChromeImages = [], storeCardLogos = [],
} = {}) {
  let origin = '';
  try {
    origin = new URL(storeUrl).origin;
  } catch (_) {
    // Best effort: with an unparsable storeUrl the origin stays empty and
    // root-relative image paths are returned without a host.
  }

  // Effective chrome = images that appear on the mall homepage MINUS those that
  // are known store-card logos (which often appear in homepage carousels).
  const cardSet = new Set(storeCardLogos || []);
  const effectiveChrome = (mallChromeImages || []).filter((u) => !cardSet.has(u));

  // Everything the callback needs is passed through the argument object and all
  // helpers are declared inside it; nothing from this outer scope is referenced.
  return page.evaluate(({ origin, storeName, mallName, mallEcosystem, mallChromeImages: effChrome }) => {
    const resolve = (src) => {
      if (!src) return null;
      if (src.startsWith('data:')) return null;
      if (src.startsWith('http')) return src;
      if (src.startsWith('//')) return `https:${src}`;
      if (src.startsWith('/')) return origin + src;
      return null;
    };

    const filenameOf = (u) => {
      try { return new URL(u).pathname.split('/').pop() || ''; } catch { return ''; }
    };
    const hostOf = (u) => {
      try { return new URL(u).hostname.replace(/^www\./, ''); } catch { return ''; }
    };
    // Lower-cased class list of an element. `className` is not a string on
    // SVG-namespace elements (it is an SVGAnimatedString), so fall back to the
    // raw attribute instead of calling .toLowerCase() on an object.
    const classOf = (el) => {
      if (!el) return '';
      const raw = typeof el.className === 'string'
        ? el.className
        : ((el.getAttribute && el.getAttribute('class')) || '');
      return raw.toLowerCase();
    };

    const cleanName = String(storeName || '').toLowerCase().replace(/[^a-z0-9]/g, '');
    // First five characters of the normalized store name: the "does this image
    // belong to the store" probe used against alt text and filenames.
    const storeLead = cleanName.slice(0, 5);
    const mallNameClean = String(mallName || '').toLowerCase().replace(/[^a-z0-9]/g, '');
    // First significant word of the mall name (e.g. "Currents of Windermere" → "currents")
    const mallNameTokens = String(mallName || '').toLowerCase()
      .split(/\W+/)
      .filter((w) => w.length >= 4 && !['mall', 'shopping', 'centre', 'center', 'plaza', 'village'].includes(w));
    const mallNameLead = mallNameTokens[0] || '';
    const ecoHosts = new Set((mallEcosystem || []).map((s) => String(s).toLowerCase()));
    const chromeSet = new Set(effChrome || []);

    // Hard-exclude these obvious non-store images.
    const HARD_EXCLUDE_URL = /(gstatic\.com|googleusercontent|maps\.googleapis|google\.com\/maps|placeholder|spacer|blank\.gif|transparent\.png|\.svg\?token=|favicon|apple-touch-icon)/i;
    const HARD_EXCLUDE_FILE = /(splash|noptin|popup|footer-logo|wordmark)/i;
    const DEVELOPER_HINTS = /(developments?|development[-_]corp|holdings|properties|realty|management|hill[-_]?company|harvard|cameron|cushman|wakefield|jll|brookfield|cadillac|fairview|ivanhoe|cambridge)/i;
    const SOCIAL_ICON_HINTS = /(facebook[-_]?icon|instagram[-_]?icon|twitter[-_]?icon|x[-_]?icon|tiktok[-_]?icon|youtube[-_]?icon|pinterest[-_]?icon|social[-_]?icon|share[-_]?icon)/i;
    const STOREFRONT_HINTS = /(storefront|store[-_]?front|exterior|outside|building|facade)/;

    const out = [];
    for (const img of document.querySelectorAll('img')) {
      const src = img.currentSrc
        || img.src
        || img.getAttribute('data-src')
        || img.getAttribute('data-lazy-src')
        || '';
      const url = resolve(src);
      if (!url) continue;
      if (HARD_EXCLUDE_URL.test(url)) continue;
      // Skip mall-chrome: any image that also appears on the mall homepage.
      if (chromeSet.has(url) || chromeSet.has(src)) continue;

      const file = filenameOf(url).toLowerCase();
      if (HARD_EXCLUDE_FILE.test(file)) continue;

      const alt = (img.getAttribute('alt') || '').trim();
      const cls = classOf(img);
      const id = (img.id || '').toLowerCase();
      const parent = img.parentElement;
      const parentCls = classOf(parent);
      const parentTag = parent ? parent.tagName.toLowerCase() : '';
      const parentId = parent ? (parent.id || '').toLowerCase() : '';

      // Skip social-icon imagery
      if (SOCIAL_ICON_HINTS.test(file) || SOCIAL_ICON_HINTS.test(cls) || SOCIAL_ICON_HINTS.test(alt)) continue;

      // Skip if the host is part of the mall ecosystem (sister property)
      if (ecoHosts.has(hostOf(url).toLowerCase())) continue;

      // Dimensions: prefer the intrinsic size, fall back to the rendered box.
      const rect = img.getBoundingClientRect();
      const useW = (img.naturalWidth || 0) || Math.round(rect.width);
      const useH = (img.naturalHeight || 0) || Math.round(rect.height);
      // Skip tiny (likely icons)
      if (useW < 24 || useH < 24) continue;

      const aspect = useH ? useW / useH : 1;
      const altClean = alt.toLowerCase().replace(/[^a-z0-9]/g, '');
      const inFooter = !!img.closest('footer') || /footer/.test(parentCls) || /footer/.test(parentId);

      // Detect "this is mall chrome by alt text"
      //   - full mall name in alt (e.g. "Currents of Windermere")
      //   - the leading distinctive token of the mall name (e.g. "Banner - Currents - Events")
      const altIsMallName = mallNameClean.length >= 4 && altClean.includes(mallNameClean);
      const altHasMallLead = mallNameLead.length >= 4 && altClean.includes(mallNameLead);
      if (altIsMallName || altHasMallLead) continue;
      // Same check on the filename (catches `banner-currents-events-1750x900.jpg` etc.)
      if (mallNameLead.length >= 4 && file.includes(mallNameLead)) continue;

      // Detect "this is a developer / parent company logo"
      if (DEVELOPER_HINTS.test(alt) || DEVELOPER_HINTS.test(file)) continue;

      // Footer-positioned imagery is rarely a store-relevant image (it's site chrome).
      if (inFooter) continue;

      // Hard chrome skip: images in banner/hero containers OR with generic
      // chrome filenames ("WEbsite-Background", "page-banner") that have NO
      // store-name connection are almost always mall chrome.
      const inBannerContainer = /banner|hero|masthead|background/.test(cls)
        || /banner|hero|masthead|background/.test(parentCls)
        || !!(img.closest && img.closest('.banner, .hero, .masthead, [class*="banner"], [class*="hero"]'));
      const genericChromeFile = /(^|[-_])(banner|background|hero|masthead|cover|website[-_]?(bg|background)|page[-_]?banner|default)([-_]|\.|$)/i.test(file);
      const genericChromeAlt = /^(banner|background|hero|cover|image|photo|picture)$/i.test((alt || '').trim());
      const altMatchesStore = cleanName.length >= 5
        && altClean.length > 0 && altClean.includes(storeLead);
      const fileMatchesStore = cleanName.length >= 5 && file.includes(storeLead);
      if ((inBannerContainer || genericChromeFile || genericChromeAlt)
          && !altMatchesStore && !fileMatchesStore) {
        continue;
      }

      // ---------- score logo ----------
      let logoScore = 0;
      let brandScore = 0;
      let frontScore = 0;

      // Strongest possible logo signal: filename contains BOTH "logo" and the
      // store name (e.g. "Barcelos-Logo_website.png", "booster-juice-logo.png").
      // This beats class="store-logo" because mall CMS templates sometimes
      // mistakenly tag a placeholder with class="store-logo".
      if (cleanName.length >= 5 && file.includes(storeLead) && /logo/.test(file)) {
        logoScore += 130;
      }

      // Downstream CMSes (imgix-based image services in particular) often have
      // trouble with GIFs. Penalize so when a PNG/JPG/webp alternative exists
      // it wins, but a GIF can still be picked if it's the only option.
      const isGif = /\.gif(\?|$)/i.test(url) || /\.gif$/i.test(file);
      if (isGif) {
        logoScore -= 50;
        brandScore -= 30;
      }
      if (/store-logo|tenant-logo|retailer-logo|brand-logo/.test(cls)) logoScore += 100;
      if (/store-logo|tenant-logo|retailer-logo|brand-logo/.test(parentCls)) logoScore += 80;
      if (/\blogo\b/.test(cls) || /\blogo\b/.test(id) || /\blogo\b/.test(parentCls)) logoScore += 30;
      if (/logo/.test(file)) logoScore += 40;
      if (cleanName.length >= 3 && altClean.includes(cleanName.slice(0, Math.min(cleanName.length, 8)))) {
        logoScore += 60;
      }
      // Aspect: logos are roughly square (0.5–2.0)
      if (aspect >= 0.5 && aspect <= 2.0) logoScore += 15;
      else logoScore -= 10;
      // Size sweet spot
      if (useW >= 60 && useW <= 600 && useH >= 60 && useH <= 600) logoScore += 15;
      if (useW > 1000 || useH > 800) logoScore -= 15;
      // In content area is a bonus
      if (parent && parent.closest && parent.closest('article, main, .content, [class*="content"], [class*="store"]')) logoScore += 10;

      // ---------- score brand / hero ----------
      if (/banner|hero|feature|cover|masthead|brand-image|brand_image/.test(cls)) brandScore += 80;
      if (/banner|hero|feature|cover|masthead/.test(parentCls)) brandScore += 80;
      if (/banner|hero|feature|cover|background/.test(file)) brandScore += 40;
      if (aspect > 1.8) brandScore += 25;
      if (useW > 800) brandScore += 15;
      if (parent && parent.closest && parent.closest('[class*="banner"], [class*="hero"], [class*="feature"]')) brandScore += 20;
      // Brand images are usually NOT square — penalize small square images so we don't pick a logo here
      if (aspect >= 0.85 && aspect <= 1.15 && useW <= 400) brandScore -= 30;

      // ---------- score storefront ----------
      if (STOREFRONT_HINTS.test(cls) || STOREFRONT_HINTS.test(parentCls)) frontScore += 100;
      if (STOREFRONT_HINTS.test(file)) frontScore += 80;
      if (/exterior|outside|store ?front|front (of|view)/i.test(alt)) frontScore += 60;
      if (parentTag === 'figure') frontScore += 10;
      if (aspect >= 1.2 && aspect <= 1.9) frontScore += 8;

      out.push({
        url,
        alt,
        cls,
        parentCls,
        file,
        w: useW,
        h: useH,
        aspect: Math.round(aspect * 100) / 100,
        top: Math.round(rect.top),
        logoScore,
        brandScore,
        frontScore,
      });
    }

    // og:image as a candidate (usually a hero/feature image for a store page).
    // It gets fixed low scores and only the URL-level hard exclusion.
    const og = document.querySelector('meta[property="og:image"]');
    if (og) {
      const ogUrl = resolve(og.getAttribute('content'));
      if (ogUrl && !HARD_EXCLUDE_URL.test(ogUrl)) {
        out.push({
          url: ogUrl, alt: 'og:image', cls: '', parentCls: '', file: '',
          w: 0, h: 0, aspect: 0, top: -1,
          logoScore: 5, brandScore: 30, frontScore: 5,
        });
      }
    }

    return out;
  }, { origin, storeName, mallName, mallEcosystem, mallChromeImages: effectiveChrome });
}
|
|
231
|
+
|
|
232
|
+
/**
 * Given the classifier output and the directoryLogoUrl, choose final URLs for
 * { logo_image_url, brand_image_url, store_front_image_url }.
 *
 * Logic:
 *   - Candidates are de-duplicated by URL, keeping the highest score per slot.
 *   - directoryLogoUrl is injected as a candidate with logoScore 75 when it is
 *     not already present; when it is present, that entry's logoScore gets +25.
 *   - LOGO: among candidates with logoScore >= 40, prefer the highest-scoring
 *     one whose URL contains both "logo" and the first five characters of the
 *     normalized store name (only when that name has 5+ characters); otherwise
 *     the highest logoScore.
 *   - BRAND: highest brandScore among remaining candidates with brandScore >= 40.
 *   - FRONT: highest frontScore among remaining candidates with frontScore >= 50.
 *
 * A URL fills at most one slot: logos never get reused in brand/front slots,
 * and vice versa.
 *
 * @param {Array<object>} candidates - classifier output; null/undefined, null
 *   entries and entries without a url are ignored, non-numeric scores count as 0
 * @param {{ directoryLogoUrl?: string, storeName?: string }} [options]
 * @returns {object} the three URLs, their alt texts, and `_imageDecisions`
 *   holding the winning score per slot (empty strings / 0 when a slot is unfilled)
 */
function pickImages(candidates, { directoryLogoUrl, storeName = '' } = {}) {
  const LOGO_MIN = 40;
  const BRAND_MIN = 40;
  const FRONT_MIN = 50;

  const cleanName = String(storeName || '').toLowerCase().replace(/[^a-z0-9]/g, '');
  const score = (v) => (Number.isFinite(v) ? v : 0);

  // Dedup by URL (keep highest scores across duplicates). Entries are copied so
  // the caller's candidate objects are never mutated.
  const byUrl = new Map();
  for (const c of Array.isArray(candidates) ? candidates : []) {
    if (!c || !c.url) continue;
    const existing = byUrl.get(c.url);
    if (!existing) {
      byUrl.set(c.url, {
        ...c,
        logoScore: score(c.logoScore),
        brandScore: score(c.brandScore),
        frontScore: score(c.frontScore),
      });
      continue;
    }
    existing.logoScore = Math.max(existing.logoScore, score(c.logoScore));
    existing.brandScore = Math.max(existing.brandScore, score(c.brandScore));
    existing.frontScore = Math.max(existing.frontScore, score(c.frontScore));
  }

  if (directoryLogoUrl) {
    const listed = byUrl.get(directoryLogoUrl);
    if (listed) {
      // Boost the existing entry's logoScore — it appeared on the listing card.
      listed.logoScore += 25;
    } else {
      // Inject directoryLogoUrl as a candidate if not already present.
      byUrl.set(directoryLogoUrl, {
        url: directoryLogoUrl,
        alt: '', cls: '', parentCls: '', file: '',
        w: 0, h: 0, aspect: 0, top: 0,
        logoScore: 75, // baseline trust for the listing card image
        brandScore: 0,
        frontScore: 0,
      });
    }
  }

  const all = Array.from(byUrl.values());
  const used = new Set();

  // Highest-scoring unused candidate for a slot, or null if none reaches `min`.
  // Array#sort is stable, so ties keep first-seen order.
  const best = (key, min) => all
    .filter((c) => !used.has(c.url) && c[key] >= min)
    .sort((a, b) => b[key] - a[key])[0] || null;

  // LOGO selection — descriptive filename beats class-based signal.
  const logoQualified = all
    .filter((c) => c.logoScore >= LOGO_MIN)
    .sort((a, b) => b.logoScore - a.logoScore);
  let logo = null;
  if (logoQualified.length > 0) {
    if (cleanName.length >= 5) {
      const lead = cleanName.slice(0, 5);
      // The list is sorted, so find() yields the highest-scoring named logo.
      logo = logoQualified.find((c) => /logo/i.test(c.url) && c.url.toLowerCase().includes(lead)) || null;
    }
    if (!logo) logo = logoQualified[0];
  }
  if (logo) used.add(logo.url);

  // BRAND
  const brand = best('brandScore', BRAND_MIN);
  if (brand) used.add(brand.url);

  // STOREFRONT
  const front = best('frontScore', FRONT_MIN);

  return {
    logo_image_url: logo ? logo.url : '',
    brand_image_url: brand ? brand.url : '',
    store_front_image_url: front ? front.url : '',
    logo_image_url_alt_text: logo ? logo.alt : '',
    brand_image_url_alt_text: brand ? brand.alt : '',
    store_front_image_url_alt_text: front ? front.alt : '',
    _imageDecisions: {
      logoScore: logo ? logo.logoScore : 0,
      brandScore: brand ? brand.brandScore : 0,
      frontScore: front ? front.frontScore : 0,
    },
  };
}
|
|
330
|
+
|
|
331
|
+
module.exports = { classifyImages, pickImages };
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const {
|
|
4
|
+
parseFreeFormHours,
|
|
5
|
+
canonicalize,
|
|
6
|
+
validateCanonical,
|
|
7
|
+
} = require('./hoursParser');
|
|
8
|
+
|
|
9
|
+
// A line is "hours-relevant" if it contains an hours/open/closed word or a
// (possibly abbreviated) day name, case-insensitively.
const HOURS_KEYWORDS = /\b(hours?|open(?:ing)?|closed|monday|tuesday|wednesday|thursday|friday|saturday|sunday|mon\b|tue\b|wed\b|thu\b|fri\b|sat\b|sun\b)/i;

/**
 * Locate hours-relevant snippets in raw page text.
 *
 * For each line matching HOURS_KEYWORDS, takes the 3 lines before it through
 * the 7 lines after it, trims the result, and keeps it if it is at least 8
 * characters long and not identical to an earlier window. Each snippet is
 * truncated to 2 × `window` characters. Scanning stops once `maxSnippets`
 * snippets have been collected.
 *
 * @param {string} text - raw page text ('\r' characters are stripped)
 * @param {{ window?: number, maxSnippets?: number }} [options]
 *   window: half of the per-snippet character cap (default 400);
 *   maxSnippets: maximum number of snippets to return (default 3)
 * @returns {string[]} snippets in document order; [] for empty or non-string
 *   text, or when maxSnippets is not positive
 */
function findHoursSnippets(text, { window: w = 400, maxSnippets = 3 } = {}) {
  if (typeof text !== 'string' || text.length === 0) return [];
  // Asking for zero (or fewer) snippets yields none.
  if (maxSnippets <= 0) return [];

  const lines = text.replace(/\r/g, '').split('\n');
  const maxChars = w * 2;
  const snippets = [];
  const seen = new Set();

  for (let i = 0; i < lines.length; i++) {
    if (!HOURS_KEYWORDS.test(lines[i])) continue;
    // Gather surrounding lines
    const start = Math.max(0, i - 3);
    const end = Math.min(lines.length, i + 8);
    const slice = lines.slice(start, end).join('\n').trim();
    if (!slice || slice.length < 8) continue;
    // De-duplicate on the full window, before truncation.
    if (seen.has(slice)) continue;
    seen.add(slice);
    snippets.push(slice.slice(0, maxChars));
    if (snippets.length >= maxSnippets) break;
  }
  return snippets;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Ask the LLM to extract hours from a focused snippet.
 *
 * The model is asked for JSON of the form { "hours": { "<Day>": "<value>" } }.
 * A non-empty answer is flattened to "Day: value" lines and round-tripped
 * through parseFreeFormHours/canonicalize.
 *
 * @param {object} client - OpenAI-style client exposing chat.completions.create
 * @param {string} model - model name; gpt-5 family models get
 *   max_completion_tokens, all others get max_tokens plus temperature 0
 * @param {string} snippet - text to extract hours from
 * @param {object} [logger] - if given, request/parse failures are logged via warn()
 * @returns {Promise<string>} canonical string, or '' when the snippet is blank,
 *   the model reports no hours, parsing fails, or the request throws
 */
async function llmExtractHoursFromSnippet(client, model, snippet, logger) {
  const hasText = Boolean(snippet) && Boolean(snippet.trim());
  if (!hasText) return '';

  const systemPrompt =
`You extract retail store opening hours from short text snippets and return ONLY JSON.
Rules:
- Output JSON: { "hours": { "Monday": "10:00 AM - 9:00 PM", "Tuesday": "...", ..., "Sunday": "..." } }
- Use full day names as keys (Monday..Sunday).
- Use exactly "H:MM AM/PM - H:MM AM/PM" for open/close.
- Use "Closed" for closed days.
- Omit a day if the snippet does not state hours for it. Do not invent.
- If the snippet does not contain any operating hours, return { "hours": {} }.`;

  const userPrompt = `Snippet:\n"""\n${snippet}\n"""\n\nExtract the hours as JSON.`;

  // The gpt-5 family takes a different token-limit parameter and gets no temperature.
  const usesCompletionTokens = /^gpt-5(\.|-|$)/i.test(model);
  const limits = usesCompletionTokens
    ? { max_completion_tokens: 600 }
    : { max_tokens: 600, temperature: 0.0 };

  const request = {
    model,
    messages: [
      { role: 'system', content: systemPrompt },
      { role: 'user', content: userPrompt },
    ],
    response_format: { type: 'json_object' },
    ...limits,
  };

  try {
    const response = await client.chat.completions.create(request);
    const payload = JSON.parse(response.choices[0].message.content);
    const hours = payload && payload.hours;

    const usable = Boolean(hours) && typeof hours === 'object' && Object.keys(hours).length > 0;
    if (!usable) return '';

    // Round-trip through parser to canonicalize and validate format
    const dayLines = [];
    for (const [day, value] of Object.entries(hours)) {
      dayLines.push(`${day}: ${value}`);
    }
    const parsed = parseFreeFormHours(dayLines.join('\n'));
    return parsed ? canonicalize(parsed) : '';
  } catch (err) {
    if (logger) logger.warn(` ⚠ LLM hours extract failed: ${err.message}`);
    return '';
  }
}
|
|
83
|
+
|
|
84
|
+
/**
 * Run LLM extraction over the snippets found in pageData.text.
 *
 * Up to three snippets from findHoursSnippets() are sent to the model one at
 * a time, in order; the loop stops at the first canonical string that passes
 * validateCanonical().
 *
 * @param {{ client: object, model: string, pageData: { text?: string }, logger?: object }} args
 * @returns {Promise<string>} the first valid canonical string, or '' if there
 *   are no snippets or none of them yields valid hours
 */
async function extractHoursWithLLM({ client, model, pageData, logger }) {
  const candidates = findHoursSnippets(pageData.text || '', { maxSnippets: 3 });

  // Sequential on purpose: later snippets are only sent if earlier ones fail.
  for (const candidate of candidates) {
    const canonical = await llmExtractHoursFromSnippet(client, model, candidate, logger);
    const accepted = Boolean(canonical) && validateCanonical(canonical).ok;
    if (accepted) return canonical;
  }

  return '';
}
|
|
98
|
+
|
|
99
|
+
module.exports = { findHoursSnippets, llmExtractHoursFromSnippet, extractHoursWithLLM };
|