mallmaverick-store-scraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,151 @@
1
+ 'use strict';
2
+
3
+ const {
4
+ parseJsonLdHours,
5
+ parseFreeFormHours,
6
+ parseSchemaOpeningHours,
7
+ canonicalize,
8
+ validateCanonical,
9
+ } = require('./hoursParser');
10
+ const { detectsSyncWithMall } = require('./mallContext');
11
+ const { extractHoursWithLLM } = require('./llmExtract');
12
+ const { chooseExternalLink, extractHoursFromExternal } = require('./externalFollow');
13
+
14
/**
 * Layered hours extraction.
 *
 * Layers are tried in order (high-to-low confidence); the first layer that
 * yields a result wins and the remaining layers are not run:
 *   1. JSON-LD openingHoursSpecification   → 0.98  (pageData.jsonLd)
 *   2. Microdata itemprop="openingHours"   → 0.95  (pageData.microdataHours)
 *   3. DOM patterns (pre-extracted block)  → 0.90  (pageData.domHoursBlock)
 *   4. Labeled section + free-form parse   → 0.85  (pageData.text)
 *   5. "Sync with mall hours" phrase       → 0.80  (opts.mallContext + pageData.text)
 *   6. LLM on focused snippet              → 0.70  (opts.client + opts.model + pageData.text)
 *   7. External-link follow                → 0.65  (opts.browser + opts.mallOrigin + pageData.links)
 *   8. (Vision)                            → reserved, not implemented
 *
 * @param {object} [pageData] - Scraped page. Fields read here: jsonLd,
 *   microdataHours, domHoursBlock, text, links. A null/undefined value is
 *   treated as an empty page.
 * @param {object} [opts]
 * @param {{canonical: string}} [opts.mallContext] - the mall's own hours, if known
 * @param {object} [opts.client] - LLM client handed to extractHoursWithLLM (layer 6)
 * @param {string} [opts.model] - model name for layer 6
 * @param {object} [opts.browser] - browser handle for layer 7
 * @param {string} [opts.mallOrigin] - used to choose an external link in layer 7
 * @param {object} [opts.logger] - optional; only `info` is called here
 * @returns {Promise<{canonical: string, source: (string|null), confidence: number,
 *   sync_with_centre_hours: boolean, externalUrl?: string}>}
 *   `externalUrl` is only present for layer-7 results. When no layer matches,
 *   `canonical` is '' and `source` is null with confidence 0.
 */
async function extractHours(pageData, opts = {}) {
  // Guard against callers that pass null/undefined for a page that failed to load.
  const page = pageData || {};
  const { mallContext, client, model, browser, mallOrigin, logger } = opts || {};

  // Layer 1: JSON-LD openingHoursSpecification
  if (page.jsonLd && page.jsonLd.length) {
    const hit = resultIfValid(parseJsonLdHours(page.jsonLd), 'jsonld', 0.98);
    if (hit) return hit;
  }

  // Layer 2: Microdata via itemprop in DOM (or meta tags)
  if (page.microdataHours) {
    const hit = resultIfValid(parseSchemaOpeningHours(page.microdataHours), 'microdata', 0.95);
    if (hit) return hit;
  }

  // Layer 3: DOM patterns — the caller pre-extracts an "hours block" string
  // from common selectors. If it is not provided, this layer is skipped.
  if (page.domHoursBlock) {
    const hit = resultIfValid(parseFreeFormHours(page.domHoursBlock), 'dom', 0.90);
    if (hit) return hit;
  }

  // Layer 4: Labeled section + free-form parse on the page text
  if (page.text) {
    const labeled = sliceLabeledHoursSection(page.text);
    const parsed = labeled ? parseFreeFormHours(labeled) : null;
    if (parsed) {
      const canonical = canonicalize(parsed);
      const verdict = validateCanonical(canonical);
      // Accept only if we got 5+ days; many sites omit closed-day rows, so a
      // full 7 is not required.
      const dayCount = canonical.split(';').filter(Boolean).length;
      if (verdict.ok && dayCount >= 5) {
        return { canonical, source: 'labeled', confidence: 0.85, sync_with_centre_hours: false };
      }
    }
  }

  // Layer 5: Sync with mall hours
  if (mallContext && mallContext.canonical && page.text && detectsSyncWithMall(page.text)) {
    if (logger) logger.info(' ↳ matched "syncs with mall hours" — using mall hours');
    return {
      canonical: mallContext.canonical,
      source: 'sync-with-mall',
      confidence: 0.80,
      sync_with_centre_hours: true,
    };
  }

  // Layer 6: LLM on focused snippet
  if (client && model && page.text) {
    const canonical = await extractHoursWithLLM({ client, model, pageData: page, logger });
    if (canonical) {
      return { canonical, source: 'llm', confidence: 0.70, sync_with_centre_hours: false };
    }
  }

  // Layer 7: External-link follow
  if (browser && page.links && mallOrigin) {
    const chosen = chooseExternalLink(page.links, mallOrigin);
    if (chosen) {
      if (logger) logger.info(` ↳ following external link: ${chosen.href}`);
      const ext = await extractHoursFromExternal(browser, chosen.href, { logger });
      if (ext) {
        return {
          canonical: ext.canonical,
          source: ext.source,
          confidence: 0.65,
          sync_with_centre_hours: false,
          externalUrl: ext.sourceUrl,
        };
      }
    }
  }

  return { canonical: '', source: null, confidence: 0, sync_with_centre_hours: false };
}

/**
 * Shared tail of layers 1–3: canonicalize a parser result and wrap it in a
 * result object, or return null when the parser produced nothing or the
 * canonical form fails validation.
 */
function resultIfValid(parsed, source, confidence) {
  if (!parsed) return null;
  const canonical = canonicalize(parsed);
  if (!validateCanonical(canonical).ok) return null;
  return { canonical, source, confidence, sync_with_centre_hours: false };
}
132
+
133
/**
 * Slice the chunk of text that starts at an "Hours" heading.
 *
 * A heading is the first line that, once trimmed, ENDS with one of
 * "hours of operation", "store hours", "opening hours", "business hours" or
 * plain "hours" (case-insensitive), optionally followed by a colon. Lines
 * that merely contain the word mid-sentence do not count.
 *
 * @param {string} text - page text, one logical row per "\n"-separated line
 * @returns {string|null} the heading line plus up to the next 15 lines
 *   (16 lines total, untrimmed, joined with "\n"), or null when the input is
 *   empty, not a string, or has no heading-like line.
 */
function sliceLabeledHoursSection(text) {
  // Non-string input (e.g. a number or object from a bad scrape) has no lines.
  if (typeof text !== 'string' || text.length === 0) return null;

  const HEADING = /\b(hours of operation|store hours|opening hours|business hours|hours)\b\s*:?$/i;
  const WINDOW_LINES = 16;

  const lines = text.split('\n');
  // Blank lines can never match HEADING, so no separate empty-line skip is needed.
  const headingIndex = lines.findIndex((raw) => HEADING.test(raw.trim()));
  if (headingIndex === -1) return null;

  return lines.slice(headingIndex, headingIndex + WINDOW_LINES).join('\n');
}
150
+
151
+ module.exports = { extractHours, sliceLabeledHoursSection };
@@ -0,0 +1,331 @@
1
+ 'use strict';
2
+
3
+ const { URL } = require('url');
4
+
5
/**
 * Enumerate every <img> on the store page and score it for three slots:
 *   - logoScore  → the store's brand mark
 *   - brandScore → hero / banner / lifestyle image
 *   - frontScore → exterior storefront photo
 *
 * Excludes obvious chrome before scoring: map tiles, transparent pixels,
 * images also seen on the mall homepage, mall-named and developer/parent-company
 * imagery, splash/popup images, social-share icons, footer imagery, tiny icons
 * (< 24px on either side) and generic banner/hero images with no store-name
 * connection. The page's og:image, if any, is appended as a low-scored
 * candidate.
 *
 * All scoring runs inside `page.evaluate`, so the callback must stay
 * self-contained (no references to outer-scope helpers).
 *
 * @param {import('puppeteer').Page} page
 * @param {string} storeUrl - used only to resolve root-relative image srcs
 * @param {object} [ctx]
 * @param {string} [ctx.storeName]
 * @param {string} [ctx.mallName]
 * @param {string[]} [ctx.mallEcosystem] - hostnames whose images are skipped
 * @param {string[]} [ctx.mallChromeImages] - image URLs seen on the mall homepage
 * @param {string[]} [ctx.storeCardLogos] - store-card logo URLs; these are
 *   removed from mallChromeImages before filtering
 * @returns {Promise<Array<{url, alt, cls, parentCls, file, w, h, aspect, top,
 *   logoScore, brandScore, frontScore}>>}
 */
async function classifyImages(page, storeUrl, {
  storeName = '', mallName = '', mallEcosystem = [],
  mallChromeImages = [], storeCardLogos = [],
} = {}) {
  // Best-effort: with an unparsable storeUrl, origin stays '' and root-relative
  // srcs are returned without a host.
  let origin = '';
  try { origin = new URL(storeUrl).origin; } catch (_) {}

  // Effective chrome = images that appear on the mall homepage MINUS those that
  // are known store-card logos (which often appear in homepage carousels).
  const cardSet = new Set(storeCardLogos || []);
  const effectiveChrome = (mallChromeImages || []).filter(u => !cardSet.has(u));

  return await page.evaluate(({ origin, storeName, mallName, mallEcosystem, mallChromeImages: effChrome }) => {
    // Absolute, protocol-relative and root-relative srcs only; data: URIs and
    // path-relative srcs are dropped.
    const resolve = (src) => {
      if (!src) return null;
      if (src.startsWith('data:')) return null;
      if (src.startsWith('http')) return src;
      if (src.startsWith('//')) return 'https:' + src;
      if (src.startsWith('/')) return origin + src;
      return null;
    };

    const filenameOf = (u) => {
      try { return new URL(u).pathname.split('/').pop() || ''; } catch { return ''; }
    };
    const hostOf = (u) => {
      try { return new URL(u).hostname.replace(/^www\./, ''); } catch { return ''; }
    };
    // Lower-cased class string of an element. SVG elements expose `className`
    // as an SVGAnimatedString object rather than a string; calling
    // .toLowerCase() on it directly would throw and abort the whole evaluate.
    const classOf = (el) => {
      const raw = el ? el.className : '';
      if (typeof raw === 'string') return raw.toLowerCase();
      if (raw && typeof raw.baseVal === 'string') return raw.baseVal.toLowerCase();
      return '';
    };

    const cleanName = String(storeName || '').toLowerCase().replace(/[^a-z0-9]/g, '');
    const mallNameClean = String(mallName || '').toLowerCase().replace(/[^a-z0-9]/g, '');
    // First significant word of the mall name (e.g. "Currents of Windermere" → "currents")
    const mallNameTokens = String(mallName || '').toLowerCase()
      .split(/\W+/).filter(w => w.length >= 4 && !['mall','shopping','centre','center','plaza','village'].includes(w));
    const mallNameLead = mallNameTokens[0] || '';
    const ecoHosts = new Set((mallEcosystem || []).map(s => String(s).toLowerCase()));
    const chromeSet = new Set(effChrome || []);

    // Hard-exclude these obvious non-store images.
    const HARD_EXCLUDE_URL = /(gstatic\.com|googleusercontent|maps\.googleapis|google\.com\/maps|placeholder|spacer|blank\.gif|transparent\.png|\.svg\?token=|favicon|apple-touch-icon)/i;
    const HARD_EXCLUDE_FILE = /(splash|noptin|popup|footer-logo|wordmark)/i;
    const DEVELOPER_HINTS = /(developments?|development[-_]corp|holdings|properties|realty|management|hill[-_]?company|harvard|cameron|cushman|wakefield|jll|brookfield|cadillac|fairview|ivanhoe|cambridge)/i;
    const SOCIAL_ICON_HINTS = /(facebook[-_]?icon|instagram[-_]?icon|twitter[-_]?icon|x[-_]?icon|tiktok[-_]?icon|youtube[-_]?icon|pinterest[-_]?icon|social[-_]?icon|share[-_]?icon)/i;

    const out = [];
    document.querySelectorAll('img').forEach(img => {
      const src = img.currentSrc
        || img.src
        || img.getAttribute('data-src')
        || img.getAttribute('data-lazy-src')
        || '';
      const url = resolve(src);
      if (!url) return;
      if (HARD_EXCLUDE_URL.test(url)) return;
      // Skip mall-chrome: any image that also appears on the mall homepage.
      if (chromeSet.has(url) || chromeSet.has(src)) return;

      const file = filenameOf(url).toLowerCase();
      if (HARD_EXCLUDE_FILE.test(file)) return;

      const alt = (img.getAttribute('alt') || '').trim();
      const cls = classOf(img);
      const id = (img.id || '').toLowerCase();
      const parent = img.parentElement;
      const parentCls = classOf(parent);
      const parentTag = parent ? parent.tagName.toLowerCase() : '';
      const parentId = parent ? (parent.id || '').toLowerCase() : '';

      // Skip social-icon imagery
      if (SOCIAL_ICON_HINTS.test(file) || SOCIAL_ICON_HINTS.test(cls) || SOCIAL_ICON_HINTS.test(alt)) return;

      // Skip if the host is part of the mall ecosystem (sister property)
      if (ecoHosts.has(hostOf(url).toLowerCase())) return;

      // Dimensions: prefer the intrinsic size, fall back to the rendered box.
      const w = img.naturalWidth || 0;
      const h = img.naturalHeight || 0;
      const rect = img.getBoundingClientRect();
      const rW = Math.round(rect.width);
      const rH = Math.round(rect.height);
      const useW = w || rW;
      const useH = h || rH;
      // Skip tiny (likely icons)
      if (useW < 24 || useH < 24) return;

      const aspect = useH ? useW / useH : 1;
      const altClean = alt.toLowerCase().replace(/[^a-z0-9]/g, '');
      const inFooter = !!img.closest('footer') || /footer/.test(parentCls) || /footer/.test(parentId);

      // Detect "this is mall chrome by alt text"
      //   - full mall name in alt (e.g. "Currents of Windermere")
      //   - the leading distinctive token of the mall name (e.g. "Banner - Currents - Events")
      const altIsMallName = mallNameClean.length >= 4 && altClean.includes(mallNameClean);
      const altHasMallLead = mallNameLead.length >= 4 && altClean.includes(mallNameLead);
      if (altIsMallName || altHasMallLead) return;
      // Same check on the filename (catches `banner-currents-events-1750x900.jpg` etc.)
      if (mallNameLead.length >= 4 && file.includes(mallNameLead)) return;

      // Detect "this is a developer / parent company logo"
      if (DEVELOPER_HINTS.test(alt) || DEVELOPER_HINTS.test(file)) return;

      // Footer-positioned imagery is rarely a store-relevant image (it's site chrome).
      if (inFooter) return;

      // Hard chrome skip: images in banner/hero containers OR with generic
      // chrome filenames ("WEbsite-Background", "page-banner") that have NO
      // store-name connection are almost always mall chrome.
      const inBannerContainerEarly = /banner|hero|masthead|background/.test(cls)
        || /banner|hero|masthead|background/.test(parentCls)
        || !!(img.closest && img.closest('.banner, .hero, .masthead, [class*="banner"], [class*="hero"]'));
      const genericChromeFileEarly = /(^|[-_])(banner|background|hero|masthead|cover|website[-_]?(bg|background)|page[-_]?banner|default)([-_]|\.|$)/i.test(file);
      const genericChromeAltEarly = /^(banner|background|hero|cover|image|photo|picture)$/i.test(alt);
      // "Store-name connection" = the first 5 chars of the normalised store
      // name appear in the alt text or the filename.
      const altMatchesStoreEarly = cleanName.length >= 5
        && altClean.length > 0 && altClean.includes(cleanName.slice(0, 5));
      const fileMatchesStoreEarly = cleanName.length >= 5
        && file.includes(cleanName.slice(0, 5));
      if ((inBannerContainerEarly || genericChromeFileEarly || genericChromeAltEarly)
        && !altMatchesStoreEarly && !fileMatchesStoreEarly) {
        return;
      }

      // ---------- score all three slots ----------
      let logoScore = 0, brandScore = 0, frontScore = 0;

      // Strongest possible logo signal: filename contains BOTH "logo" and the
      // store name (e.g. "Barcelos-Logo_website.png", "booster-juice-logo.png").
      // This beats class="store-logo" because mall CMS templates sometimes
      // mistakenly tag a placeholder with class="store-logo".
      if (cleanName.length >= 5 && file.includes(cleanName.slice(0, 5)) && /logo/.test(file)) {
        logoScore += 130;
      }

      // Downstream CMSes (imgix-based image services in particular) often have
      // trouble with GIFs. Penalize so when a PNG/JPG/webp alternative exists
      // it wins, but a GIF can still be picked if it's the only option.
      const isGif = /\.gif(\?|$)/i.test(url) || /\.gif$/i.test(file);
      if (isGif) {
        logoScore -= 50;
        brandScore -= 30;
      }
      if (/store-logo|tenant-logo|retailer-logo|brand-logo/.test(cls)) logoScore += 100;
      if (/store-logo|tenant-logo|retailer-logo|brand-logo/.test(parentCls)) logoScore += 80;
      if (/\blogo\b/.test(cls) || /\blogo\b/.test(id) || /\blogo\b/.test(parentCls)) logoScore += 30;
      if (/logo/.test(file)) logoScore += 40;
      if (cleanName.length >= 3 && altClean.includes(cleanName.slice(0, Math.min(cleanName.length, 8)))) {
        logoScore += 60;
      }
      // Aspect: logos are roughly square (0.5–2.0)
      if (aspect >= 0.5 && aspect <= 2.0) logoScore += 15;
      else logoScore -= 10;
      // Size sweet spot
      if (useW >= 60 && useW <= 600 && useH >= 60 && useH <= 600) logoScore += 15;
      if (useW > 1000 || useH > 800) logoScore -= 15;
      // In content area is a bonus
      if (parent && parent.closest && parent.closest('article, main, .content, [class*="content"], [class*="store"]')) logoScore += 10;

      // ---------- score brand / hero ----------
      if (/banner|hero|feature|cover|masthead|brand-image|brand_image/.test(cls)) brandScore += 80;
      if (/banner|hero|feature|cover|masthead/.test(parentCls)) brandScore += 80;
      if (/banner|hero|feature|cover|background/.test(file)) brandScore += 40;
      if (aspect > 1.8) brandScore += 25;
      if (useW > 800) brandScore += 15;
      if (parent && parent.closest && parent.closest('[class*="banner"], [class*="hero"], [class*="feature"]')) brandScore += 20;
      // Brand images are usually NOT square — penalize small (<= 400px wide),
      // near-square images so we don't pick a logo here.
      if (aspect >= 0.85 && aspect <= 1.15 && useW <= 400) brandScore -= 30;

      // ---------- score storefront ----------
      if (/(storefront|store[-_]?front|exterior|outside|building|facade)/.test(cls)
        || /(storefront|store[-_]?front|exterior|outside|building|facade)/.test(parentCls)) frontScore += 100;
      if (/(storefront|store[-_]?front|exterior|outside|building|facade)/.test(file)) frontScore += 80;
      if (/exterior|outside|store ?front|front (of|view)/i.test(alt)) frontScore += 60;
      if (parentTag === 'figure') frontScore += 10;
      if (aspect >= 1.2 && aspect <= 1.9) frontScore += 8;

      out.push({
        url,
        alt,
        cls,
        parentCls,
        file,
        w: useW,
        h: useH,
        aspect: Math.round(aspect * 100) / 100,
        top: Math.round(rect.top),
        logoScore,
        brandScore,
        frontScore,
      });
    });

    // og:image as a candidate (usually a hero/feature image for a store page).
    // It only passes the URL hard-exclude filter and gets fixed, low scores.
    const og = document.querySelector('meta[property="og:image"]');
    if (og) {
      const ogUrl = resolve(og.getAttribute('content'));
      if (ogUrl && !HARD_EXCLUDE_URL.test(ogUrl)) {
        out.push({
          url: ogUrl, alt: 'og:image', cls: '', parentCls: '', file: '',
          w: 0, h: 0, aspect: 0, top: -1,
          logoScore: 5, brandScore: 30, frontScore: 5,
        });
      }
    }

    return out;
  }, { origin, storeName, mallName, mallEcosystem, mallChromeImages: effectiveChrome });
}
231
+
232
/**
 * Given the classifier output and the directoryLogoUrl, choose final URLs for
 * { logo_image_url, brand_image_url, store_front_image_url }.
 *
 * Logic:
 *   - Candidates are de-duplicated by URL; for duplicates the highest score
 *     per slot is kept (other fields come from the first occurrence).
 *   - directoryLogoUrl is injected as a candidate with logoScore 75, or, if it
 *     is already a candidate, that candidate's logoScore is boosted by 25.
 *   - LOGO: among candidates with logoScore >= 40, prefer the highest-scoring
 *     one whose URL contains both "logo" and the first 5 characters of the
 *     normalised store name; otherwise take the highest logoScore.
 *   - BRAND: highest brandScore >= 40 among candidates not already used.
 *   - FRONT: highest frontScore >= 50 among candidates not already used.
 *
 * A URL is never reused across slots.
 *
 * @param {Iterable<object>|null|undefined} candidates - classifyImages output;
 *   null/undefined is treated as empty, entries without a url are ignored and
 *   missing or non-numeric scores count as 0.
 * @param {object} [opts]
 * @param {string} [opts.directoryLogoUrl] - logo URL from the directory listing card
 * @param {string} [opts.storeName]
 * @returns {{logo_image_url: string, brand_image_url: string,
 *   store_front_image_url: string, logo_image_url_alt_text: string,
 *   brand_image_url_alt_text: string, store_front_image_url_alt_text: string,
 *   _imageDecisions: {logoScore: number, brandScore: number, frontScore: number}}}
 *   Unfilled slots are '' with a decision score of 0.
 */
function pickImages(candidates, { directoryLogoUrl, storeName = '' } = {}) {
  const LOGO_MIN = 40;
  const BRAND_MIN = 40;
  const FRONT_MIN = 50;

  const cleanName = String(storeName || '').toLowerCase().replace(/[^a-z0-9]/g, '');
  // A missing score must not poison Math.max / sort with NaN.
  const toScore = (value) => {
    const n = Number(value);
    return Number.isFinite(n) ? n : 0;
  };

  // Dedup by URL (keep highest scores across duplicates)
  const byUrl = new Map();
  for (const c of candidates || []) {
    if (!c || !c.url) continue;
    const existing = byUrl.get(c.url);
    if (!existing) {
      byUrl.set(c.url, {
        ...c,
        logoScore: toScore(c.logoScore),
        brandScore: toScore(c.brandScore),
        frontScore: toScore(c.frontScore),
      });
      continue;
    }
    existing.logoScore = Math.max(existing.logoScore, toScore(c.logoScore));
    existing.brandScore = Math.max(existing.brandScore, toScore(c.brandScore));
    existing.frontScore = Math.max(existing.frontScore, toScore(c.frontScore));
  }

  if (directoryLogoUrl) {
    const listed = byUrl.get(directoryLogoUrl);
    if (listed) {
      // Boost the existing entry's logoScore — it appeared on the listing card.
      listed.logoScore += 25;
    } else {
      byUrl.set(directoryLogoUrl, {
        url: directoryLogoUrl,
        alt: '', cls: '', parentCls: '', file: '',
        w: 0, h: 0, aspect: 0, top: 0,
        logoScore: 75, // baseline trust for the listing card image
        brandScore: 0,
        frontScore: 0,
      });
    }
  }

  const all = Array.from(byUrl.values());
  const used = new Set();

  // LOGO selection — a descriptive filename beats class-based signals.
  const logoQualified = [...all]
    .sort((a, b) => b.logoScore - a.logoScore)
    .filter(c => c.logoScore >= LOGO_MIN);
  let logo = null;
  if (logoQualified.length > 0) {
    if (cleanName.length >= 5) {
      const namePrefix = cleanName.slice(0, 5);
      // logoQualified is sorted, so find() yields the highest-scoring named logo.
      logo = logoQualified.find(c =>
        /logo/i.test(c.url) && String(c.url).toLowerCase().includes(namePrefix)
      ) || null;
    }
    if (!logo) logo = logoQualified[0];
  }
  if (logo) used.add(logo.url);

  // BRAND
  const brand = all
    .filter(c => !used.has(c.url))
    .sort((a, b) => b.brandScore - a.brandScore)
    .find(c => c.brandScore >= BRAND_MIN) || null;
  if (brand) used.add(brand.url);

  // STOREFRONT
  const front = all
    .filter(c => !used.has(c.url))
    .sort((a, b) => b.frontScore - a.frontScore)
    .find(c => c.frontScore >= FRONT_MIN) || null;

  return {
    logo_image_url: logo ? logo.url : '',
    brand_image_url: brand ? brand.url : '',
    store_front_image_url: front ? front.url : '',
    logo_image_url_alt_text: logo ? logo.alt : '',
    brand_image_url_alt_text: brand ? brand.alt : '',
    store_front_image_url_alt_text: front ? front.alt : '',
    _imageDecisions: {
      logoScore: logo ? logo.logoScore : 0,
      brandScore: brand ? brand.brandScore : 0,
      frontScore: front ? front.frontScore : 0,
    },
  };
}
330
+
331
+ module.exports = { classifyImages, pickImages };
@@ -0,0 +1,99 @@
1
+ 'use strict';
2
+
3
+ const {
4
+ parseFreeFormHours,
5
+ canonicalize,
6
+ validateCanonical,
7
+ } = require('./hoursParser');
8
+
9
// Any line containing one of these words (or a 3-letter day abbreviation as a
// whole word) is treated as a potential anchor for an hours snippet.
const HOURS_KEYWORDS = /\b(hours?|open(?:ing)?|closed|monday|tuesday|wednesday|thursday|friday|saturday|sunday|mon\b|tue\b|wed\b|thu\b|fri\b|sat\b|sun\b)/i;

/**
 * Locate hours-relevant snippets in raw page text.
 *
 * For every line that matches HOURS_KEYWORDS, takes the 3 lines before it and
 * the 7 lines after it (clamped to the text), trims the joined result and
 * keeps it unless it is shorter than 8 characters or an exact duplicate of an
 * earlier window. Windows from neighbouring hits may overlap heavily; only
 * identical windows are de-duplicated.
 *
 * @param {string} text - raw page text; "\r" characters are stripped first
 * @param {object} [options]
 * @param {number} [options.window=400] - each snippet is cut to 2 × window characters
 * @param {number} [options.maxSnippets=3] - scanning stops once this many
 *   snippets were collected (checked after each addition)
 * @returns {string[]} snippets in document order; [] for empty text
 */
function findHoursSnippets(text, { window: w = 400, maxSnippets = 3 } = {}) {
  if (!text) return [];

  const rows = text.replace(/\r/g, '').split('\n');
  const charCap = w * 2;
  const alreadyTaken = new Set();
  const found = [];

  for (const [idx, row] of rows.entries()) {
    if (!HOURS_KEYWORDS.test(row)) continue;

    // Gather surrounding lines
    const from = idx < 3 ? 0 : idx - 3;
    const to = Math.min(rows.length, idx + 8);
    const chunk = rows.slice(from, to).join('\n').trim();

    const usable = chunk.length >= 8 && !alreadyTaken.has(chunk);
    if (!usable) continue;

    alreadyTaken.add(chunk);
    found.push(chunk.slice(0, charCap));
    if (found.length >= maxSnippets) break;
  }

  return found;
}
36
+
37
/**
 * Ask the LLM to extract hours from a focused snippet.
 *
 * Sends a chat-completions request in JSON mode, then feeds the returned
 * day → hours map back through parseFreeFormHours/canonicalize so the result
 * is in the same canonical form as the non-LLM layers.
 *
 * @param {object} client - must expose chat.completions.create(params)
 * @param {string} model - names matching /^gpt-5(\.|-|$)/i get
 *   `max_completion_tokens`; every other model gets `max_tokens` plus
 *   `temperature: 0`
 * @param {string} snippet - text to extract from
 * @param {object} [logger] - optional; `warn` is called when the request or
 *   JSON parsing fails
 * @returns {Promise<string>} canonical hours string, or '' when the snippet is
 *   blank, the model returns no hours, the reply cannot be parsed, or the
 *   request throws. Never rejects for errors raised inside the request step.
 */
async function llmExtractHoursFromSnippet(client, model, snippet, logger) {
  const hasContent = Boolean(snippet) && Boolean(snippet.trim());
  if (!hasContent) return '';

  const systemPrompt =
`You extract retail store opening hours from short text snippets and return ONLY JSON.
Rules:
- Output JSON: { "hours": { "Monday": "10:00 AM - 9:00 PM", "Tuesday": "...", ..., "Sunday": "..." } }
- Use full day names as keys (Monday..Sunday).
- Use exactly "H:MM AM/PM - H:MM AM/PM" for open/close.
- Use "Closed" for closed days.
- Omit a day if the snippet does not state hours for it. Do not invent.
- If the snippet does not contain any operating hours, return { "hours": {} }.`;

  const userPrompt = `Snippet:\n"""\n${snippet}\n"""\n\nExtract the hours as JSON.`;

  // The two model families take different token-limit parameters.
  const tokenSettings = /^gpt-5(\.|-|$)/i.test(model)
    ? { max_completion_tokens: 600 }
    : { max_tokens: 600, temperature: 0.0 };

  const request = {
    model,
    messages: [
      { role: 'system', content: systemPrompt },
      { role: 'user', content: userPrompt },
    ],
    response_format: { type: 'json_object' },
    ...tokenSettings,
  };

  try {
    const completion = await client.chat.completions.create(request);
    const payload = JSON.parse(completion.choices[0].message.content);
    const hours = payload && payload.hours;

    const hasHours = Boolean(hours)
      && typeof hours === 'object'
      && Object.keys(hours).length > 0;
    if (!hasHours) return '';

    // Round-trip through parser to canonicalize and validate format
    const asText = Object.entries(hours)
      .map(([day, value]) => `${day}: ${value}`)
      .join('\n');
    const parsed = parseFreeFormHours(asText);
    return parsed ? canonicalize(parsed) : '';
  } catch (err) {
    if (logger) logger.warn(` ⚠ LLM hours extract failed: ${err.message}`);
    return '';
  }
}
83
+
84
/**
 * Run LLM extraction over the snippets found in pageData.text.
 *
 * Up to 3 snippets are tried one at a time, in the order findHoursSnippets
 * returns them, so no further LLM calls are made once a snippet yields a
 * canonical string that passes validateCanonical.
 *
 * @param {object} args
 * @param {object} args.client - forwarded to llmExtractHoursFromSnippet
 * @param {string} args.model - forwarded to llmExtractHoursFromSnippet
 * @param {{text?: string}} args.pageData - a missing text is treated as ''
 * @param {object} [args.logger] - forwarded to llmExtractHoursFromSnippet
 * @returns {Promise<string>} the first validated canonical string, or '' when
 *   there are no snippets or none produces a valid result
 */
async function extractHoursWithLLM({ client, model, pageData, logger }) {
  const candidates = findHoursSnippets(pageData.text || '', { maxSnippets: 3 });

  for (let idx = 0; idx < candidates.length; idx += 1) {
    const attempt = await llmExtractHoursFromSnippet(client, model, candidates[idx], logger);
    if (!attempt) continue;
    if (validateCanonical(attempt).ok) return attempt;
  }

  return '';
}
98
+
99
+ module.exports = { findHoursSnippets, llmExtractHoursFromSnippet, extractHoursWithLLM };