mallmaverick-store-scraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,298 @@
1
+ 'use strict';
2
+
3
+ const { URL } = require('url');
4
+ const http = require('http');
5
+ const https = require('https');
6
+ const {
7
+ newPage, autoScroll, attachXhrInterceptor, sleep, XHR_CAPTURE_PATTERNS,
8
+ } = require('./browser');
9
+
10
/** Path fragments that hint a URL points at an individual store page. */
const STORE_URL_KEYWORDS = [
  '/store/',
  '/stores/',
  '/shop/',
  '/shops/',
  '/tenant/',
  '/tenants/',
  '/directory/',
  '/listing/',
  '/listings/',
  '/retailer/',
  '/restaurant/',
  '/brand/',
  '/location/',
  '/vendor/',
  '/merchant/',
];

/** Pathname patterns for pages that are never store pages. */
const NON_STORE_PATTERNS = [
  // Generic site sections.
  /\/(about|contact|faq|privacy|terms|careers|news|blog|events|deals|gift|parking|directions|hours|jobs)\b/i,
  // Category indexes.
  /\/(categories?|category)(\/|$)/i,
  // Maps.
  /\/(shopping-?map|site-?map|map)(\/|$)/i,
  // A-Z / "all stores" listings.
  /\/(a-?z-?listing|a-?z-?list|all-?stores?)(\/|$)/i,
  // Pagination, tags, filters, search.
  /\/(page|tag|filter|search)(\/|$)/i,
  // Static assets and data files.
  /\.(pdf|jpg|jpeg|png|gif|svg|css|js|xml|json)$/i,
];

/** Final path segments (lower-case) that name a non-store page. */
const NON_STORE_SLUGS = new Set([
  // Directory / listing pages
  'shopping-map',
  'map',
  'directory',
  'a-z-listing',
  'listing',
  'list',
  'all',
  'index',
  'category',
  'categories',
  // Mall information pages
  'about',
  'contact',
  'hours',
  'parking',
  'careers',
  'leasing',
  'news',
  'events',
  'promotions',
  'deals',
  'gift-cards',
  'gift',
  'jobs',
  'blog',
  'faq',
  'help',
  // Account / commerce / navigation
  'login',
  'register',
  'cart',
  'checkout',
  'search',
  'filter',
  'page',
]);
32
+
33
/**
 * Multi-strategy store URL discovery:
 *   1. XHR/fetch interception (directory APIs returning store JSON)
 *   2. Directory listing card scan (anchor+img pairs → logoMap)
 *   3. DOM link scan
 *   4. sitemap.xml + common variants
 *   5. /api/stores etc. probing
 *
 * @param {object} browser - browser handle, passed straight to newPage().
 * @param {string} directoryUrl - absolute URL of the mall directory page.
 * @param {{info: Function}|null} [logger] - optional logger; only info() is used.
 * @returns {Promise<{storeUrls: string[], logoMap: Map<string, string>}>}
 *   logoMap is keyed by normalized store URL (lower-case, no trailing slash).
 */
async function discoverStores(browser, directoryUrl, logger) {
  // `origin` keeps a non-default port (e.g. http://localhost:3000), which
  // `protocol//hostname` would silently drop.
  const baseUrl = new URL(directoryUrl).origin;
  const info = (message) => {
    if (logger) logger.info(message);
  };

  const page = await newPage(browser);
  try {
    const { interceptedRaw } = await attachXhrInterceptor(page, { directoryMode: true });
    await page.goto(directoryUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });
    // Scroll twice with pauses so lazy-loaded listings get a chance to render.
    await sleep(3000);
    await autoScroll(page);
    await sleep(1500);
    await autoScroll(page, { distance: 300, delay: 100 });
    await sleep(500);

    const logoMap = await collectDirectoryLogoMap(page, baseUrl);
    info(` 🖼️ Directory logos collected: ${logoMap.size}`);

    const pageUrls = await extractLinksFromPage(page, baseUrl);
    info(` 📄 DOM links: ${pageUrls.length}`);

    const sitemapUrls = await fetchSitemapUrls(baseUrl, logger);
    info(` 🗺️ Sitemap URLs: ${sitemapUrls.length}`);

    const apiUrls = await probeApiEndpoints(baseUrl, logger);
    info(` 📑 API URLs: ${apiUrls.length}`);

    const all = new Set([...logoMap.keys(), ...pageUrls, ...sitemapUrls, ...apiUrls]);
    for (const { url: src, text } of interceptedRaw) {
      const extracted = extractUrlsFromJson(text, baseUrl);
      if (extracted.length) info(` 📦 +${extracted.length} URLs from XHR (${src})`);
      for (const u of extracted) all.add(u);
    }

    const filtered = filterStoreUrls(Array.from(all), directoryUrl, logoMap);

    // Final dedup: normalize trailing slash and case so /aw and /aw/ are one URL.
    // The first spelling seen is the one that is kept.
    const seen = new Set();
    const storeUrls = [];
    for (const u of filtered) {
      const norm = u.replace(/\/+$/, '').toLowerCase();
      if (seen.has(norm)) continue;
      seen.add(norm);
      storeUrls.push(u);
    }
    info(`✅ Total store URLs: ${storeUrls.length}`);
    return { storeUrls, logoMap };
  } finally {
    // A failing close must not replace the error (or result) from above.
    try { await page.close(); } catch (_) { /* page already gone */ }
  }
}
94
+
95
/**
 * Scan the directory page for link + image pairs and build a logo lookup.
 *
 * Two passes run inside the page: every anchor that wraps an <img>, then the
 * first link and first image inside each "card"-like element. Both passes
 * share one resolver, so they apply the same rules:
 *   - absolute (http...), protocol-relative (//host/...) and root-relative
 *     (/path) references are accepted; anything else is skipped;
 *   - image sources shorter than 5 characters, or whose URL contains
 *     placeholder/spacer/blank, are ignored.
 *
 * @param {object} page - page handle exposing evaluate(fn, arg).
 * @param {string} baseUrl - site origin used to resolve root-relative references.
 * @returns {Promise<Map<string, string>>} normalized store URL (lower-case, no
 *   trailing slashes) → logo URL. The first logo seen for a URL wins. Returns
 *   an empty Map if the page cannot be evaluated (best effort).
 */
async function collectDirectoryLogoMap(page, baseUrl) {
  try {
    const entries = await page.evaluate((origin) => {
      // Everything in here runs in the page context, so helpers must be local.
      const schemeMatch = /^[a-z][a-z0-9+.-]*:/i.exec(origin);
      const scheme = schemeMatch ? schemeMatch[0] : 'https:';

      const absolutize = (ref) => {
        if (ref.startsWith('http')) return ref;
        // Protocol-relative: reuse the site's scheme rather than prefixing the origin.
        if (ref.startsWith('//')) return scheme + ref;
        if (ref.startsWith('/')) return origin + ref;
        return null;
      };

      const toEntry = (link, img) => {
        if (!link || !img) return null;
        const url = absolutize(link.getAttribute('href') || '');
        if (!url) return null;
        const src = img.getAttribute('src')
          || img.getAttribute('data-src')
          || img.getAttribute('data-lazy-src')
          || '';
        if (src.length < 5 || /placeholder|spacer|blank/i.test(src)) return null;
        const logo = absolutize(src);
        return logo ? { url, logo } : null;
      };

      const out = [];
      const add = (entry) => {
        if (entry) out.push(entry);
      };

      // Pass 1: anchors that directly wrap an image.
      for (const a of Array.from(document.querySelectorAll('a[href]'))) {
        add(toEntry(a, a.querySelector('img')));
      }

      // Pass 2: card containers holding a link and an image side by side.
      const cardSelectors = [
        'li', 'article',
        '[class*="store-item"]', '[class*="store-card"]', '[class*="tenant"]',
        '[class*="listing-item"]', '[class*="directory-item"]', '[class*="retailer"]',
        '[class*="brand-item"]', '[class*="shop-item"]',
      ];
      for (const sel of cardSelectors) {
        document.querySelectorAll(sel).forEach((card) => {
          add(toEntry(card.querySelector('a[href]'), card.querySelector('img')));
        });
      }
      return out;
    }, baseUrl);

    const map = new Map();
    for (const { url, logo } of entries) {
      // Same normalization as the store-URL dedup: strip ALL trailing slashes.
      const normalized = url.replace(/\/+$/, '').toLowerCase();
      if (!map.has(normalized)) map.set(normalized, logo);
    }
    return map;
  } catch (_) {
    // Logos are optional; discovery continues without them.
    return new Map();
  }
}
145
+
146
/**
 * Collect every anchor href on the page as an absolute URL.
 *
 * Absolute (http...), protocol-relative (//host/...) and root-relative (/path)
 * hrefs are kept; fragments, mailto:, javascript: and document-relative hrefs
 * are skipped.
 *
 * @param {object} page - page handle exposing evaluate(fn, arg).
 * @param {string} baseUrl - site origin used to resolve root-relative hrefs.
 * @returns {Promise<string[]>} absolute URLs in document order (duplicates kept);
 *   an empty array if the page cannot be evaluated.
 */
async function extractLinksFromPage(page, baseUrl) {
  try {
    return await page.evaluate((origin) => {
      const schemeMatch = /^[a-z][a-z0-9+.-]*:/i.exec(origin);
      const scheme = schemeMatch ? schemeMatch[0] : 'https:';
      const links = [];
      for (const a of Array.from(document.querySelectorAll('a[href]'))) {
        const href = a.getAttribute('href');
        if (!href) continue;
        if (href.startsWith('http')) links.push(href);
        // "//host/path" names another host; it must not be glued onto the origin.
        else if (href.startsWith('//')) links.push(scheme + href);
        else if (href.startsWith('/')) links.push(origin + href);
      }
      return links;
    }, baseUrl);
  } catch (_) {
    // Best effort: other discovery strategies can still find stores.
    return [];
  }
}
161
+
162
/**
 * Pull the text of every <loc> element out of a sitemap document.
 * Handles values that span lines, CDATA-wrapped values and the five
 * predefined XML entities.
 */
function extractSitemapLocs(xml) {
  const locs = [];
  const pattern = /<loc\b[^>]*>([\s\S]*?)<\/loc>/gi;
  let match;
  while ((match = pattern.exec(xml)) !== null) {
    const rawValue = match[1].trim();
    const cdata = /^<!\[CDATA\[([\s\S]*)\]\]>$/.exec(rawValue);
    const value = cdata
      ? cdata[1].trim()
      : rawValue
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        .replace(/&quot;/g, '"')
        .replace(/&apos;/g, "'")
        .replace(/&amp;/g, '&'); // last, so "&amp;lt;" stays "&lt;"
    if (value) locs.push(value);
  }
  return locs;
}

/**
 * Try the common sitemap locations and return the URLs of the first one that
 * yields any <loc> entries.
 *
 * If that document is a sitemap index, its same-host child sitemaps (capped
 * at 10) are fetched as well and their URLs are appended; the child sitemap
 * URLs themselves stay in the result, as before.
 *
 * @param {string} baseUrl - site origin, e.g. "https://mall.example".
 * @param {{info: Function}|null} [logger] - optional logger; only info() is used.
 * @returns {Promise<string[]>} de-duplicated URLs, possibly empty.
 */
async function fetchSitemapUrls(baseUrl, logger) {
  const MAX_CHILD_SITEMAPS = 10;
  const locations = [
    `${baseUrl}/sitemap.xml`,
    `${baseUrl}/sitemap_index.xml`,
    `${baseUrl}/sitemap-stores.xml`,
    `${baseUrl}/sitemap-listings.xml`,
    `${baseUrl}/sitemaps/sitemap.xml`,
  ];

  const isSameHost = (candidate) => {
    try {
      return new URL(candidate).hostname === new URL(baseUrl).hostname;
    } catch (_) {
      return false;
    }
  };

  for (const loc of locations) {
    const text = await fetchText(loc);
    if (!text) continue;

    const urls = new Set(extractSitemapLocs(text));
    if (urls.size === 0) continue;

    if (/<sitemapindex[\s>]/i.test(text)) {
      // An index only lists other sitemaps; the page URLs live one level down.
      // Child URLs come from the remote document, so stay on the same host.
      const children = [...urls].filter(isSameHost).slice(0, MAX_CHILD_SITEMAPS);
      const bodies = await Promise.all(children.map((child) => fetchText(child)));
      for (const body of bodies) {
        if (!body) continue;
        for (const u of extractSitemapLocs(body)) urls.add(u);
      }
    }

    if (logger) logger.info(` 🗺️ Sitemap: ${loc}`);
    return Array.from(urls);
  }
  return [];
}
186
+
187
/**
 * Probe a fixed list of conventional JSON endpoints and return the URLs found
 * in the first endpoint that yields any.
 *
 * Each endpoint is fetched with a 5 second timeout. Responses that are not
 * valid JSON are skipped; the body is re-serialized before scanning so that
 * escaped slashes ("\/") do not hide URLs.
 *
 * @param {string} baseUrl - site origin, e.g. "https://mall.example".
 * @param {{info: Function}|null} [logger] - optional logger; only info() is used.
 * @returns {Promise<string[]>} URLs from the first productive endpoint, else [].
 */
async function probeApiEndpoints(baseUrl, logger) {
  const endpointPaths = [
    '/api/stores',
    '/api/v1/stores',
    '/api/listings',
    '/api/directory',
    '/wp-json/wp/v2/stores',
    '/stores.json',
    '/directory.json',
    '/listings.json',
  ];

  for (const path of endpointPaths) {
    const endpoint = `${baseUrl}${path}`;
    const body = await fetchText(endpoint, 5000);
    if (!body) continue;
    try {
      const extracted = extractUrlsFromJson(JSON.stringify(JSON.parse(body)), baseUrl);
      if (extracted.length > 0) {
        if (logger) logger.info(` 📑 API: ${endpoint} (${extracted.length})`);
        return Array.from(extracted);
      }
    } catch (_) {
      // Not JSON (typically an HTML 200 page) - try the next endpoint.
    }
  }
  return [];
}
214
+
215
/**
 * Extract candidate URLs from a JSON (or JSON-like) text blob.
 *
 * Finds every absolute http(s) URL, plus root-relative values of the keys
 * url/link/href/permalink/slug, which are prefixed with baseUrl.
 *
 * The text is first normalized: valid JSON is re-serialized so that escapes
 * such as "\/" are removed; text that is not valid JSON only has its "\/"
 * sequences unescaped. Absolute URLs stop at a backslash so an escaped quote
 * (\") that follows a URL is not captured as part of it.
 *
 * @param {string} jsonText - raw response body.
 * @param {string} baseUrl - origin prepended to root-relative values.
 * @returns {string[]} unique URLs, absolute matches first, in order of appearance.
 */
function extractUrlsFromJson(jsonText, baseUrl) {
  let text = String(jsonText);
  try {
    text = JSON.stringify(JSON.parse(text));
  } catch (_) {
    text = text.replace(/\\\//g, '/');
  }

  const found = new Set();
  for (const match of text.matchAll(/https?:\/\/[^\s"'<>\\]+/g)) {
    found.add(match[0]);
  }
  for (const match of text.matchAll(/"(?:url|link|href|permalink|slug)"\s*:\s*"(\/[^"]+)"/gi)) {
    found.add(baseUrl + match[1]);
  }
  return [...found];
}
224
+
225
/**
 * Keep only the URLs that look like individual store pages of this mall.
 *
 * A URL is rejected when it is on a different hostname, matches a
 * NON_STORE_PATTERNS entry, is the directory page (or its parent), has a path
 * shorter than 4 characters, or ends in a NON_STORE_SLUGS segment.
 *
 * A surviving URL is accepted when any of these holds:
 *   - it is a key of logoMap (the directory showed a logo for it);
 *   - it sits one or two segments below the directory path or its parent, and
 *     the first of those segments contains a letter and is not a non-store slug;
 *   - its path contains a STORE_URL_KEYWORDS fragment and its last segment
 *     contains a letter.
 *
 * All path checks use the path with trailing slashes removed, so
 * "/stores/nike/" and "/stores/nike" are treated the same.
 *
 * @param {string[]} urls - candidate absolute URLs; unparseable ones are dropped.
 * @param {string} directoryUrl - absolute URL of the directory page.
 * @param {Map<string, string>|null} [logoMap] - map keyed by store URL.
 * @returns {string[]} accepted URLs, in input order.
 */
function filterStoreUrls(urls, directoryUrl, logoMap = null) {
  const stripTrailingSlashes = (value) => value.replace(/\/+$/, '');

  const parsedDir = new URL(directoryUrl);
  const dirHostname = parsedDir.hostname;
  const dirPath = stripTrailingSlashes(parsedDir.pathname);
  const dirSegs = dirPath.split('/').filter(Boolean);
  const parentPath = dirSegs.length > 1 ? `/${dirSegs.slice(0, -1).join('/')}` : null;

  const logoMapUrls = new Set();
  if (logoMap) {
    for (const key of logoMap.keys()) logoMapUrls.add(stripTrailingSlashes(key).toLowerCase());
  }

  const isChildOf = (path, base) => Boolean(base) && path.startsWith(`${base}/`);

  // True when what follows `base` in `path` is 1-2 segments starting with a
  // plausible store slug.
  const acceptUnder = (path, base) => {
    const segs = path.slice(base.length + 1).split('/').filter(Boolean);
    if (segs.length === 0 || segs.length > 2) return false;
    if (!/[a-z]/i.test(segs[0])) return false;
    return !NON_STORE_SLUGS.has(segs[0].toLowerCase());
  };

  return urls.filter((u) => {
    try {
      const parsed = new URL(u);
      if (parsed.hostname !== dirHostname) return false;
      if (NON_STORE_PATTERNS.some((p) => p.test(parsed.pathname))) return false;
      if (u === directoryUrl) return false;

      const path = stripTrailingSlashes(parsed.pathname);
      if (path.length < 4) return false;
      if (path === dirPath) return false;
      if (parentPath && path === parentPath) return false;

      const slug = path.split('/').pop();
      if (NON_STORE_SLUGS.has((slug || '').toLowerCase())) return false;

      if (
        logoMapUrls.has(path.toLowerCase())
        || logoMapUrls.has(stripTrailingSlashes(u).toLowerCase())
      ) {
        return true;
      }

      if (isChildOf(path, dirPath) && acceptUnder(path, dirPath)) return true;
      if (parentPath && isChildOf(path, parentPath) && acceptUnder(path, parentPath)) return true;

      // Use the slash-stripped path here: taking the base from the raw pathname
      // of "/stores/nike/" leaves an empty remainder and rejects the URL.
      const lowerPath = path.toLowerCase();
      if (STORE_URL_KEYWORDS.some((kw) => lowerPath.includes(kw))) {
        return acceptUnder(path, path.replace(/\/[^/]*$/, ''));
      }
      return false;
    } catch (_) {
      // Unparseable or non-string candidate.
      return false;
    }
  });
}
271
+
272
/**
 * GET a URL and resolve with the response body as text.
 *
 * Never rejects: resolves with null on a non-200 status, network error,
 * timeout, aborted response, or a URL that http/https refuses outright
 * (malformed URL, unsupported protocol). Redirects are not followed.
 *
 * @param {string} url - absolute http(s) URL.
 * @param {number} [timeout=8000] - socket idle timeout in milliseconds.
 * @returns {Promise<string|null>}
 */
function fetchText(url, timeout = 8000) {
  return new Promise((resolve) => {
    let req;
    try {
      const mod = String(url).startsWith('https:') ? https : http;
      req = mod.get(url, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; StoreScraperV5/1.0)',
          'Accept': 'text/html,application/json,*/*',
        },
        timeout,
      }, (res) => {
        if (res.statusCode !== 200) {
          // Drain the body so the socket is released.
          res.resume();
          resolve(null);
          return;
        }
        // Decode as a stream so multi-byte characters split across chunks survive.
        res.setEncoding('utf8');
        let data = '';
        res.on('data', (chunk) => { data += chunk; });
        res.on('end', () => resolve(data));
        res.on('error', () => resolve(null));
        // If the connection dies mid-body, 'end' never fires. After a normal
        // 'end' this extra resolve is a no-op.
        res.on('close', () => resolve(null));
      });
    } catch (_) {
      // http.get throws synchronously for invalid URLs / protocols.
      resolve(null);
      return;
    }
    req.on('error', () => resolve(null));
    req.on('timeout', () => {
      req.destroy();
      resolve(null);
    });
  });
}
291
+
292
/**
 * Backwards-compatible entry point for first-slice callers: runs
 * discoverStores() without a logger and returns only the store URL list.
 *
 * @param {object} browser - browser handle forwarded to discoverStores().
 * @param {string} directoryUrl - absolute URL of the mall directory page.
 * @returns {Promise<string[]>}
 */
async function discoverStoreLinks(browser, directoryUrl) {
  const discovery = await discoverStores(browser, directoryUrl, null);
  return discovery.storeUrls;
}
297
+
298
+ module.exports = { discoverStores, discoverStoreLinks, collectDirectoryLogoMap };
@@ -0,0 +1,89 @@
1
+ 'use strict';
2
+
3
+ const { URL } = require('url');
4
+ const { loadPage, newPage } = require('./browser');
5
+ const {
6
+ parseJsonLdHours,
7
+ parseFreeFormHours,
8
+ canonicalize,
9
+ validateCanonical,
10
+ } = require('./hoursParser');
11
+
12
/**
 * Choose which external link to follow when the mall page itself has no hours.
 *
 * Only http(s) links whose origin differs from the mall's are considered.
 * Each is scored:
 *   +100  text mentions business/store hours ("click here for ... hours")
 *   +30   text contains the word "hour(s)"
 *   +50   href looks like a store-locator / location path
 *   +25   href contains "locator"
 *   -100  href mentions a social network, google, or a maps. host
 * The highest score wins; on a tie the link that appears first wins.
 *
 * @param {Array<{href:string, text:string}>} links - links scraped from the store page
 * @param {string} mallOrigin - the mall site origin to exclude
 * @returns {object|null} a copy of the winning link with its `_score` added,
 *   or null when there is no external link.
 */
function chooseExternalLink(links, mallOrigin) {
  if (!links || links.length === 0) return null;

  const isExternal = (link) => {
    try {
      const parsed = new URL(link.href);
      return parsed.origin !== mallOrigin && /^https?:/.test(parsed.protocol);
    } catch {
      return false;
    }
  };

  const scoreOf = (link) => {
    const text = (link.text || '').toLowerCase();
    const href = (link.href || '').toLowerCase();
    const rules = [
      [/business hours|store hours|hours of operation|click here for.*hours/, text, 100],
      [/\bhours?\b/, text, 30],
      [/store-locator|storelocation|store\/[^/]+\/?$|locations?\/|location\/.+/, href, 50],
      [/locator/, href, 25],
      [/instagram|facebook|twitter|tiktok|youtube|pinterest|google|maps\./, href, -100],
    ];
    let total = 0;
    for (const [pattern, subject, points] of rules) {
      if (pattern.test(subject)) total += points;
    }
    return total;
  };

  // Single pass; the strict ">" keeps the earliest link on ties.
  let best = null;
  for (const link of links) {
    if (!isExternal(link)) continue;
    const candidate = { ...link, _score: scoreOf(link) };
    if (best === null || candidate._score > best._score) best = candidate;
  }
  return best;
}
54
+
55
/**
 * Load an external URL in a fresh page and try to read opening hours from it,
 * first from JSON-LD and then from the page's free-form text.
 *
 * The first layer whose parsed result canonicalizes to a string accepted by
 * validateCanonical() wins. Load or parse failures are logged via
 * logger.warn (when a logger is given) and yield null. The page is always
 * closed.
 *
 * @param {object} browser - browser handle passed to newPage().
 * @param {string} url - external URL to visit; falsy values return null at once.
 * @param {{logger?: {warn: Function}}} [options]
 * @returns {Promise<{canonical: string, source: 'external-jsonld'|'external-freeform', sourceUrl: string}|null>}
 */
async function extractHoursFromExternal(browser, url, { logger } = {}) {
  if (!url) return null;

  const page = await newPage(browser);
  try {
    const data = await loadPage(page, url, { waitUntil: 'networkidle2', timeout: 30000 });

    // Layers are thunks so the free-form parse only runs when JSON-LD fails.
    const layers = [
      ['external-jsonld', () => parseJsonLdHours(data.jsonLd)],
      ['external-freeform', () => parseFreeFormHours(data.text)],
    ];
    for (const [source, parse] of layers) {
      const parsed = parse();
      if (!parsed) continue;
      const canonical = canonicalize(parsed);
      if (validateCanonical(canonical).ok) {
        return { canonical, source, sourceUrl: data.finalUrl };
      }
    }
    return null;
  } catch (err) {
    if (logger) logger.warn(` ⚠ External fetch failed (${url}): ${err.message}`);
    return null;
  } finally {
    try { await page.close(); } catch (_) {}
  }
}
88
+
89
+ module.exports = { chooseExternalLink, extractHoursFromExternal };
@@ -0,0 +1,313 @@
1
+ 'use strict';
2
+
3
/**
 * Hours parser: accepts varied free-form and structured inputs, emits canonical
 * "Monday = 10:00 AM - 9:00 PM;Tuesday = ..." string (v4-compatible).
 *
 * The parse* functions return a dayMap ({ Monday: "10:00 AM - 9:00 PM", ... });
 * pass it to canonicalize() to obtain the canonical string.
 *
 * Public API:
 *   parseFreeFormHours(text)      → dayMap | null
 *   parseJsonLdHours(jsonLdArr)   → dayMap | null
 *   parseSchemaOpeningHours(s)    → dayMap | null  (legacy "Mo-Fr 10:00-21:00" form)
 *   canonicalize(dayMap)          → canonical string
 *   validateCanonical(s)          → { ok, missing[], reason }
 *   normalizeDayName(s)           → "Monday" | null
 *   normalizeTime(s)              → "10:00 AM" | "Closed" | "Open 24 Hours" | null
 */
16
+
17
/** Canonical day names, Monday first. */
const DAYS = [
  'Monday',
  'Tuesday',
  'Wednesday',
  'Thursday',
  'Friday',
  'Saturday',
  'Sunday',
];

/** Day name → position in DAYS (Monday = 0 ... Sunday = 6). */
const DAY_INDEX = Object.fromEntries(Array.from(DAYS.entries(), ([i, day]) => [day, i]));

// Generous day-name lookup including 2-3 letter abbrevs and Schema.org full forms.
// Each row lists the accepted lower-case spellings for one canonical day.
const DAY_LOOKUP = Object.fromEntries(
  [
    ['Monday', 'mon monday mo'],
    ['Tuesday', 'tue tues tuesday tu'],
    ['Wednesday', 'wed weds wednesday we'],
    ['Thursday', 'thu thur thurs thursday th'],
    ['Friday', 'fri friday fr'],
    ['Saturday', 'sat saturday sa'],
    ['Sunday', 'sun sunday su'],
  ].flatMap(([day, spellings]) => spellings.split(' ').map((alias) => [alias, day])),
);
30
+
31
/**
 * Map a day spelling to its canonical name.
 *
 * Input is trimmed, lower-cased and stripped of dots and whitespace, so
 * "Mon.", " tues " and "http://schema.org/Monday" are all accepted.
 *
 * @param {*} raw - day name, abbreviation, or Schema.org day URI.
 * @returns {string|null} canonical name ("Monday" ... "Sunday"), or null when
 *   the input is empty or not a known spelling.
 */
function normalizeDayName(raw) {
  if (!raw) return null;
  const key = String(raw).trim().toLowerCase().replace(/[.\s]/g, '');
  // Handle Schema.org URI form: http://schema.org/Monday
  const tail = key.split('/').pop();
  // Own-property lookup only: a plain `DAY_LOOKUP[k]` would return inherited
  // members for inputs such as "constructor" or "__proto__".
  const lookup = (k) => (
    Object.prototype.hasOwnProperty.call(DAY_LOOKUP, k) ? DAY_LOOKUP[k] : null
  );
  return lookup(tail) || lookup(key) || null;
}
38
+
39
/**
 * Parse a single time token to "H:MM AM/PM", "Closed" or "Open 24 Hours".
 *
 * Accepts: "10", "10am", "10 am", "10 a.m.", "10:00am", "10:00 AM", "10:00",
 *          "21:00", "09:00:00" (seconds are ignored), "noon", "midnight",
 *          "closed", "by appointment", "24 hours", "open 24...".
 *
 * Tokens without am/pm are read as 24-hour times ("24:00" means midnight).
 * With am/pm the hour must be 1-12, with two tolerated sloppy forms:
 * "0:30 am" (read as 12:30 AM) and a redundant "pm" on a 24-hour value
 * ("21:00 pm" → 9:00 PM). Anything else, such as "15:30 am", is rejected.
 *
 * @param {*} raw - time token.
 * @returns {string|null} normalized time, or null when it cannot be parsed.
 */
function normalizeTime(raw) {
  if (!raw) return null;
  const s = String(raw).trim().toLowerCase();
  if (!s) return null;

  if (s === 'closed' || s === 'by appointment') return 'Closed';
  // Both alternatives are anchored at both ends so "noonish" is not noon.
  if (/^(?:noon|12:?00\s*(?:p\.?m\.?)?)$/.test(s)) return '12:00 PM';
  if (/^(?:midnight|12:?00\s*a\.?m\.?)$/.test(s)) return '12:00 AM';
  if (/^24\s*h(ou)?rs?$|^open\s*24/.test(s)) return 'Open 24 Hours';

  // "H[:MM[:SS]][ ]am|pm", then 24h "HH:MM[:SS]", then bare "HH".
  const m = s.match(/^(\d{1,2})(?:[:.](\d{2})(?::\d{2})?)?\s*([ap])\.?\s*m?\.?$/)
    || s.match(/^(\d{1,2})[:.](\d{2})(?::\d{2}(?:\.\d+)?)?$/)
    || s.match(/^(\d{1,2})$/);
  if (!m) return null;

  let h = parseInt(m[1], 10);
  const min = m[2] ? parseInt(m[2], 10) : 0;
  const suffix = m[3];

  if (Number.isNaN(h) || Number.isNaN(min) || h > 24 || min > 59) return null;

  if (suffix) {
    const isPm = suffix === 'p';
    if (h >= 1 && h <= 12) {
      // 12am → 0, 12pm → 12, otherwise add 12 for pm.
      h = (h % 12) + (isPm ? 12 : 0);
    } else if (h === 0 && !isPm) {
      // "0:30 am": already midnight-based.
    } else if (isPm && h >= 13 && h <= 23) {
      // "21:00 pm": 24-hour value with a redundant suffix.
    } else {
      return null;
    }
  } else if (h === 24) {
    // No am/pm → 24-hour input; 24:00 is midnight.
    h = 0;
  }

  const h12 = h % 12 === 0 ? 12 : h % 12;
  const ampmOut = h < 12 ? 'AM' : 'PM';
  return `${h12}:${String(min).padStart(2, '0')} ${ampmOut}`;
}
82
+
83
/**
 * Expand a day range such as "mon"-"fri" into the list of canonical day names
 * it covers, inclusive at both ends. Ranges wrap around the week, so
 * "sat"-"tue" yields Saturday, Sunday, Monday, Tuesday; a range whose ends are
 * the same day yields just that day.
 *
 * @param {*} startRaw - first day, in any spelling normalizeDayName accepts.
 * @param {*} endRaw - last day, same rules.
 * @returns {string[]|null} day names in order, or null if either end is
 *   unparseable.
 */
function expandDayRange(startRaw, endRaw) {
  const firstDay = normalizeDayName(startRaw);
  const lastDay = normalizeDayName(endRaw);
  if (!firstDay || !lastDay) return null;

  const from = DAY_INDEX[firstDay];
  const to = DAY_INDEX[lastDay];
  if (from === undefined || to === undefined) return null;

  // Number of days walked going forward from `from` until `to` is reached.
  const week = DAYS.length;
  const span = ((to - from + week) % week) + 1;
  return Array.from({ length: span }, (_, offset) => DAYS[(from + offset) % week]);
}
105
+
106
/**
 * Parse one entry like "Mon-Fri 10am-9pm", "Saturday 10:00 AM - 9:00 PM",
 * "Sunday: noon to 6pm" or "Sunday: Closed".
 *
 * Before matching, the text is normalized: Unicode hyphen/dash variants and
 * the minus sign become "-", " to " becomes " - ", " through " becomes "-",
 * whitespace runs collapse, and a trailing "(...)" note or "* ..." footnote is
 * dropped.
 *
 * @param {*} text - a single hours line.
 * @returns {{days: string[], open: string, close: string}
 *          |{days: string[], closed: true}|null}
 *   null when the line is not recognized, names an unknown day, or has a time
 *   that normalizeTime() rejects.
 */
function parseHoursEntry(text) {
  if (!text) return null;

  const s = String(text)
    .trim()
    // U+2010..U+2015 (hyphen, figure/en/em dash, horizontal bar) and U+2212
    // (minus), written as escapes so the pattern survives encoding damage.
    .replace(/[\u2010-\u2015\u2212]/g, '-')
    .replace(/\s+to\s+/gi, ' - ')
    .replace(/\s+through\s+/gi, '-') // for day ranges
    .replace(/\s+/g, ' ')
    // Trailing junk like "(appointment only)", "* by appointment",
    // "(walk-ins welcome)" confuses the anchored patterns below.
    .replace(/\s*\([^)]*\)\s*$/, '')
    .replace(/\s*\*.*$/, '')
    .trim();

  // One day, or an inclusive range when an end day was captured.
  const resolveDays = (startDay, endDay) => {
    const days = endDay
      ? expandDayRange(startDay, endDay)
      : [normalizeDayName(startDay)].filter(Boolean);
    return days && days.length > 0 ? days : null;
  };

  // Closed line: "Sunday: Closed", "Monday Closed", "Mon-Fri: Closed"
  const closedMatch = s.match(
    /^([a-z]+)(?:\s*-\s*([a-z]+))?\s*[:-]?\s*(closed|by appointment(?: only)?)$/i,
  );
  if (closedMatch) {
    const days = resolveDays(closedMatch[1], closedMatch[2]);
    return days ? { days, closed: true } : null;
  }

  // Day(-Day)? : open - close
  // Examples:
  //   "Monday: 6:00 AM to 8:00 PM" (already normalized "to" → "-")
  //   "Mon-Fri 10am-9pm"
  //   "Saturday 10:00 - 18:00"
  //   "Sunday noon - 6pm"
  const m = s.match(
    /^([a-z]+)(?:\s*-\s*([a-z]+))?\s*[:-]?\s*(noon|midnight|[0-9][0-9:.\sapm]*?)\s*-\s*(noon|midnight|[0-9][0-9:.\sapm]*)$/i,
  );
  if (!m) return null;

  const days = resolveDays(m[1], m[2]);
  if (!days) return null;

  const open = normalizeTime(m[3]);
  const close = normalizeTime(m[4]);
  if (!open || !close) return null;

  return { days, open, close };
}
161
+
162
/**
 * Parse a free-form hours block into a dayMap.
 *
 * The text is split on newlines, semicolons and " | ", and each piece is
 * split again on a comma when a day name follows it. Every resulting
 * candidate goes through parseHoursEntry(); unrecognized candidates are
 * ignored. When a day appears more than once, the later entry wins.
 *
 * @param {*} text - multi-line or delimiter-separated hours text.
 * @returns {Object<string, string>|null} e.g. { Monday: "10:00 AM - 9:00 PM",
 *   Sunday: "Closed" }, or null if no candidate was recognized.
 */
function parseFreeFormHours(text) {
  if (!text) return null;

  // A comma only separates entries when the next thing is a day name;
  // otherwise it is left alone.
  const commaBeforeDay = /,\s*(?=(?:mon|tue|wed|thu|fri|sat|sun|mo|tu|we|th|fr|sa|su)[a-z]*\b)/i;

  const dayMap = {};
  let recognized = false;
  for (const chunk of String(text).split(/\n+|;+|\s*\|\s*/)) {
    for (const piece of chunk.split(commaBeforeDay)) {
      const candidate = piece.trim();
      if (!candidate) continue;

      const entry = parseHoursEntry(candidate);
      if (!entry) continue;

      recognized = true;
      const hours = entry.closed ? 'Closed' : `${entry.open} - ${entry.close}`;
      for (const day of entry.days) dayMap[day] = hours;
    }
  }
  return recognized ? dayMap : null;
}
195
+
196
/**
 * Parse a Schema.org-style "openingHours" value into a dayMap.
 * Format: "Mo-Fr 10:00-21:00, Sa 09:00-19:00, Su 11:00-18:00"
 *
 * A string (or any other non-array value) is stringified and handed to
 * parseFreeFormHours(). An array is parsed element by element and the results
 * are merged; when a day repeats, the later element wins.
 *
 * @param {*} value - openingHours string or array of strings.
 * @returns {Object<string, string>|null} dayMap, or null if nothing parsed.
 */
function parseSchemaOpeningHours(value) {
  if (!value) return null;
  if (!Array.isArray(value)) return parseFreeFormHours(String(value));

  const merged = value.reduce((acc, item) => {
    const part = parseSchemaOpeningHours(item);
    return part ? Object.assign(acc, part) : acc;
  }, {});
  return Object.keys(merged).length > 0 ? merged : null;
}
213
+
214
/**
 * Walk parsed JSON-LD and collect opening hours into a dayMap.
 *
 * Supports, at any nesting depth (including inside @graph):
 *   - openingHoursSpecification: one or many { dayOfWeek, opens, closes }
 *   - openingHours: string or array of strings, Schema.org compact form
 *
 * For one specification entry, a day is recorded as:
 *   - "Closed" when neither opens nor closes is present, when both are
 *     "00:00" (the structured-data convention for closed all day), or when
 *     either normalizes to "Closed";
 *   - "<open> - <close>" when both times parse;
 *   - otherwise the day is left untouched.
 *
 * Within a node, openingHours overrides openingHoursSpecification, and values
 * found in nested objects override the node's own.
 *
 * @param {*} jsonLdArr - parsed JSON-LD object or array of objects.
 * @returns {Object<string, string>|null} dayMap such as
 *   { Monday: "10:00 AM - 9:00 PM" }, or null if no hours were found.
 */
function parseJsonLdHours(jsonLdArr) {
  if (!jsonLdArr) return null;
  const roots = Array.isArray(jsonLdArr) ? jsonLdArr : [jsonLdArr];
  const dayMap = {};

  const isZeroTime = (value) => /^0?0:00(?::00)?$/.test(String(value).trim());

  // Hours text for one specification entry, or null to leave the day unset.
  const describeSpec = (spec) => {
    if (spec.opens === undefined && spec.closes === undefined) return 'Closed';
    const open = normalizeTime(spec.opens);
    const close = normalizeTime(spec.closes);
    if (!open || !close) return null;
    if (open === 'Closed' || close === 'Closed') return 'Closed';
    // Compare the raw values: "00:00"-"24:00" also normalizes to two equal
    // times but means open all day, not closed.
    if (isZeroTime(spec.opens) && isZeroTime(spec.closes)) return 'Closed';
    return `${open} - ${close}`;
  };

  const visit = (node) => {
    if (!node || typeof node !== 'object') return;
    if (Array.isArray(node)) {
      node.forEach(visit);
      return;
    }

    if (node.openingHoursSpecification) {
      const specs = [].concat(node.openingHoursSpecification);
      for (const spec of specs) {
        if (!spec) continue;
        const dayList = spec.dayOfWeek ? [].concat(spec.dayOfWeek) : [];
        for (const rawDay of dayList) {
          const dayName = normalizeDayName(rawDay);
          if (!dayName) continue;
          const hours = describeSpec(spec);
          if (hours) dayMap[dayName] = hours;
        }
      }
    }

    if (node.openingHours) {
      const compact = parseSchemaOpeningHours(node.openingHours);
      if (compact) Object.assign(dayMap, compact);
    }

    // Recurse into every other object-valued property. This covers @graph,
    // so it needs no separate visit.
    for (const key of Object.keys(node)) {
      if (key === 'openingHoursSpecification' || key === 'openingHours') continue;
      const child = node[key];
      if (child && typeof child === 'object') visit(child);
    }
  };

  roots.forEach(visit);
  return Object.keys(dayMap).length ? dayMap : null;
}
267
+
268
/**
 * Convert a dayMap to the canonical "Monday = 10:00 AM - 9:00 PM;..." string.
 *
 * Days are emitted in DAYS order. A day with no (or an empty) entry is
 * omitted, unless fillClosed is set, in which case it is emitted as
 * "<Day> = Closed".
 *
 * @param {Object<string, string>|null} dayMap - day name → hours text.
 * @param {{fillClosed?: boolean}} [options]
 * @returns {string} canonical string; '' when dayMap is falsy.
 */
function canonicalize(dayMap, { fillClosed = false } = {}) {
  if (!dayMap) return '';
  return DAYS
    .map((day) => {
      if (dayMap[day]) return `${day} = ${dayMap[day]}`;
      return fillClosed ? `${day} = Closed` : null;
    })
    .filter((part) => part !== null)
    .join(';');
}
284
+
285
/**
 * Check a canonical hours string ("Monday = ...;Tuesday = ...").
 *
 * By default (strict = false) the string is valid as soon as at least one day
 * is present. In strict mode all 7 days must be present.
 *
 * @param {*} canonical - canonical string; non-strings are stringified.
 * @param {{strict?: boolean}} [options]
 * @returns {{ok: boolean, missing: string[], reason: string|null}}
 *   `missing` lists the days not found and is always a fresh array.
 *   `reason` is null when ok, else 'empty', 'missing-days' or 'no-days-parsed'.
 */
function validateCanonical(canonical, { strict = false } = {}) {
  // Copy DAYS so a caller mutating `missing` cannot corrupt the shared list.
  if (!canonical) return { ok: false, missing: [...DAYS], reason: 'empty' };

  const present = new Set();
  for (const seg of String(canonical).split(';')) {
    // Tolerate "; " separators by trimming each segment first.
    const m = seg.trim().match(/^(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s*=/);
    if (m) present.add(m[1]);
  }

  const missing = DAYS.filter((day) => !present.has(day));
  if (strict && missing.length > 0) return { ok: false, missing, reason: 'missing-days' };
  if (present.size === 0) return { ok: false, missing, reason: 'no-days-parsed' };
  return { ok: true, missing, reason: null };
}
301
+
302
+ module.exports = {
303
+ DAYS,
304
+ normalizeDayName,
305
+ normalizeTime,
306
+ expandDayRange,
307
+ parseHoursEntry,
308
+ parseFreeFormHours,
309
+ parseSchemaOpeningHours,
310
+ parseJsonLdHours,
311
+ canonicalize,
312
+ validateCanonical,
313
+ };