mallmaverick-store-scraper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +225 -0
- package/package.json +41 -0
- package/src/brandSiteFallback.js +272 -0
- package/src/browser.js +234 -0
- package/src/deterministic.js +235 -0
- package/src/discovery.js +298 -0
- package/src/externalFollow.js +89 -0
- package/src/hoursParser.js +313 -0
- package/src/hoursPipeline.js +151 -0
- package/src/imageExtraction.js +331 -0
- package/src/llmExtract.js +99 -0
- package/src/logoExtraction.js +130 -0
- package/src/main.js +330 -0
- package/src/mallContext.js +201 -0
- package/src/mcp-server.js +425 -0
- package/src/openai-proxy.js +52 -0
- package/src/output.js +21 -0
- package/src/retryStrategy.js +60 -0
- package/src/storeExtractor.js +239 -0
- package/src/storeModel.js +147 -0
package/src/discovery.js
ADDED
|
@@ -0,0 +1,298 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { URL } = require('url');
|
|
4
|
+
const http = require('http');
|
|
5
|
+
const https = require('https');
|
|
6
|
+
const {
|
|
7
|
+
newPage, autoScroll, attachXhrInterceptor, sleep, XHR_CAPTURE_PATTERNS,
|
|
8
|
+
} = require('./browser');
|
|
9
|
+
|
|
10
|
+
// Path fragments that mark a URL as a likely individual store page.
const STORE_URL_KEYWORDS = [
  '/store/',
  '/stores/',
  '/shop/',
  '/shops/',
  '/tenant/',
  '/tenants/',
  '/directory/',
  '/listing/',
  '/listings/',
  '/retailer/',
  '/restaurant/',
  '/brand/',
  '/location/',
  '/vendor/',
  '/merchant/',
];

// Pathname patterns that disqualify a URL from being treated as a store page.
const NON_STORE_PATTERNS = [
  // informational / marketing sections
  /\/(about|contact|faq|privacy|terms|careers|news|blog|events|deals|gift|parking|directions|hours|jobs)\b/i,
  // category indexes
  /\/(categories?|category)(\/|$)/i,
  // maps
  /\/(shopping-?map|site-?map|map)(\/|$)/i,
  // A-Z / "all stores" listings
  /\/(a-?z-?listing|a-?z-?list|all-?stores?)(\/|$)/i,
  // pagination, tags, filters, search
  /\/(page|tag|filter|search)(\/|$)/i,
  // static assets and data files
  /\.(pdf|jpg|jpeg|png|gif|svg|css|js|xml|json)$/i,
];

// Final path segments that never name a store.
const NON_STORE_SLUGS = new Set([
  // listing / navigation pages
  'shopping-map',
  'map',
  'directory',
  'a-z-listing',
  'listing',
  'list',
  'all',
  'index',
  'category',
  'categories',
  // informational pages
  'about',
  'contact',
  'hours',
  'parking',
  'careers',
  'leasing',
  'news',
  'events',
  'promotions',
  'deals',
  'gift-cards',
  'gift',
  'jobs',
  'blog',
  'faq',
  'help',
  // account / commerce / utility pages
  'login',
  'register',
  'cart',
  'checkout',
  'search',
  'filter',
  'page',
]);
|
|
32
|
+
|
|
33
|
+
/**
 * Multi-strategy store URL discovery:
 *   1. XHR/fetch interception (directory APIs returning store JSON)
 *   2. Directory listing card scan (anchor+img pairs → logoMap)
 *   3. DOM link scan
 *   4. sitemap.xml + common variants
 *   5. /api/stores etc. probing
 *
 * Candidates from every strategy are merged, passed through filterStoreUrls,
 * and finally de-duplicated ignoring case and trailing slashes.
 *
 * @param {object} browser - handed to newPage() to open the directory page
 * @param {string} directoryUrl - absolute URL of the store directory page
 * @param {object} [logger] - optional; only `logger.info(message)` is called
 * @returns {Promise<{storeUrls: string[], logoMap: Map<string, string>}>}
 *   where logoMap is Map<normalizedStoreUrl, logoUrl>.
 */
async function discoverStores(browser, directoryUrl, logger) {
  // `origin` keeps a non-default port; protocol + hostname alone would drop it
  // and send the sitemap/API probes to the wrong server.
  const baseUrl = new URL(directoryUrl).origin;
  const log = (message) => {
    if (logger) logger.info(message);
  };

  const page = await newPage(browser);
  try {
    // Attach before navigating so responses fired during page load are captured.
    const { interceptedRaw } = await attachXhrInterceptor(page, { directoryMode: true });
    await page.goto(directoryUrl, { waitUntil: 'domcontentloaded', timeout: 60000 });

    // Two scroll passes separated by fixed pauses
    // (presumably to let lazily rendered listings appear).
    await sleep(3000);
    await autoScroll(page);
    await sleep(1500);
    await autoScroll(page, { distance: 300, delay: 100 });
    await sleep(500);

    const logoMap = await collectDirectoryLogoMap(page, baseUrl);
    log(` 🖼️ Directory logos collected: ${logoMap.size}`);

    const pageUrls = await extractLinksFromPage(page, baseUrl);
    log(` 🔗 DOM links: ${pageUrls.length}`);

    const sitemapUrls = await fetchSitemapUrls(baseUrl, logger);
    log(` 🗺️ Sitemap URLs: ${sitemapUrls.length}`);

    const apiUrls = await probeApiEndpoints(baseUrl, logger);
    log(` 📡 API URLs: ${apiUrls.length}`);

    const all = new Set([...logoMap.keys(), ...pageUrls, ...sitemapUrls, ...apiUrls]);
    for (const { url: src, text } of interceptedRaw) {
      const extracted = extractUrlsFromJson(text, baseUrl);
      if (extracted.length) log(` 📦 +${extracted.length} URLs from XHR (${src})`);
      for (const u of extracted) all.add(u);
    }

    const filtered = filterStoreUrls(Array.from(all), directoryUrl, logoMap);

    // Final dedup: normalize trailing slash so /aw and /aw/ are one URL.
    const seen = new Set();
    const storeUrls = filtered.filter((u) => {
      const key = u.replace(/\/+$/, '').toLowerCase();
      if (seen.has(key)) return false;
      seen.add(key);
      return true;
    });
    log(`✅ Total store URLs: ${storeUrls.length}`);

    return { storeUrls, logoMap };
  } finally {
    // A failing close must not replace the result or mask the original error.
    try { await page.close(); } catch (_) {}
  }
}
|
|
94
|
+
|
|
95
|
+
/**
 * Scan the loaded directory page for link+image pairs and build a map from
 * store URL to logo URL.
 *
 * Two passes run inside the page:
 *   - every <a href> that contains an <img>
 *   - the first link and first image inside each "card"-like element
 * The first logo seen for a URL wins. Keys are lower-cased with one trailing
 * slash removed.
 *
 * @param {object} page - object exposing `evaluate(fn, arg)`
 * @param {string} baseUrl - fallback base for relative URLs when the document
 *   reports no base URI
 * @returns {Promise<Map<string, string>>} empty Map if evaluation fails
 */
async function collectDirectoryLogoMap(page, baseUrl) {
  try {
    const entries = await page.evaluate((fallbackBase) => {
      // Runs in the page context: everything used here must be defined here.
      const base = document.baseURI || fallbackBase;

      // Resolve like the browser does, so relative and protocol-relative
      // references land on the right host; non-http(s) schemes are dropped.
      const toAbsolute = (raw) => {
        const value = (raw || '').trim();
        if (!value || value.startsWith('#')) return null;
        try {
          const resolved = new URL(value, base);
          return resolved.protocol === 'http:' || resolved.protocol === 'https:'
            ? resolved.href
            : null;
        } catch (_) {
          return null;
        }
      };

      const logoFrom = (img, minLength, skipPattern) => {
        const src = img.getAttribute('src')
          || img.getAttribute('data-src')
          || img.getAttribute('data-lazy-src')
          || '';
        if (src.length < minLength || skipPattern.test(src)) return null;
        return toAbsolute(src);
      };

      const out = [];
      const record = (link, img, minLength, skipPattern) => {
        const url = toAbsolute(link.getAttribute('href'));
        if (!url) return;
        const logo = logoFrom(img, minLength, skipPattern);
        if (logo) out.push({ url, logo });
      };

      // Pass 1: anchors that directly wrap an image.
      for (const a of Array.from(document.querySelectorAll('a[href]'))) {
        const img = a.querySelector('img');
        if (img) record(a, img, 5, /placeholder|spacer|blank/i);
      }

      // Pass 2: card containers holding a link and an image side by side.
      const cardSelectors = [
        'li', 'article',
        '[class*="store-item"]', '[class*="store-card"]', '[class*="tenant"]',
        '[class*="listing-item"]', '[class*="directory-item"]', '[class*="retailer"]',
        '[class*="brand-item"]', '[class*="shop-item"]',
      ];
      for (const sel of cardSelectors) {
        for (const card of Array.from(document.querySelectorAll(sel))) {
          const link = card.querySelector('a[href]');
          const img = card.querySelector('img');
          if (link && img) record(link, img, 1, /placeholder|spacer/i);
        }
      }
      return out;
    }, baseUrl);

    const map = new Map();
    for (const { url, logo } of entries) {
      const normalized = url.replace(/\/$/, '').toLowerCase();
      if (!map.has(normalized)) map.set(normalized, logo);
    }
    return map;
  } catch (_) {
    // Best effort: logos are optional, so a failed scan yields an empty map.
    return new Map();
  }
}
|
|
145
|
+
|
|
146
|
+
/**
 * Collect every http(s) link on the loaded page as an absolute URL.
 *
 * hrefs are resolved against the document's base URI, so relative and
 * protocol-relative links are kept and attributed to the correct host.
 * Fragment-only links and non-http(s) schemes are skipped.
 *
 * @param {object} page - object exposing `evaluate(fn, arg)`
 * @param {string} baseUrl - fallback base when the document reports no base URI
 * @returns {Promise<string[]>} empty array if evaluation fails
 */
async function extractLinksFromPage(page, baseUrl) {
  try {
    return await page.evaluate((fallbackBase) => {
      // Runs in the page context: must not reference outer variables.
      const base = document.baseURI || fallbackBase;
      const links = [];
      for (const a of Array.from(document.querySelectorAll('a[href]'))) {
        const href = (a.getAttribute('href') || '').trim();
        if (!href || href.startsWith('#')) continue;
        try {
          const resolved = new URL(href, base);
          if (resolved.protocol === 'http:' || resolved.protocol === 'https:') {
            links.push(resolved.href);
          }
        } catch (_) {
          // Unparseable href: not a usable link.
        }
      }
      return links;
    }, baseUrl);
  } catch (_) {
    // Best effort: the other discovery strategies still run.
    return [];
  }
}
|
|
161
|
+
|
|
162
|
+
/**
 * Try a list of conventional sitemap locations and return the page URLs from
 * the first one that yields any.
 *
 * <loc> values are read across line breaks, unwrapped from CDATA and have the
 * five predefined XML entities decoded. When a document is a sitemap index,
 * its same-host child sitemaps (capped) are fetched and their <loc> values
 * are returned instead of the child sitemap URLs themselves.
 *
 * @param {string} baseUrl - site origin the sitemap paths are appended to
 * @param {object} [logger] - optional; only `logger.info(message)` is called
 * @returns {Promise<string[]>} unique URLs, possibly empty
 */
async function fetchSitemapUrls(baseUrl, logger) {
  const MAX_CHILD_SITEMAPS = 10;
  const XML_ENTITIES = { amp: '&', lt: '<', gt: '>', quot: '"', apos: "'" };

  const extractLocs = (xml) => {
    const locs = [];
    const pattern = /<loc>([\s\S]*?)<\/loc>/g;
    let m;
    while ((m = pattern.exec(xml)) !== null) {
      const value = m[1]
        .replace(/^\s*<!\[CDATA\[([\s\S]*?)\]\]>\s*$/, '$1')
        .replace(/&(amp|lt|gt|quot|apos);/g, (_, name) => XML_ENTITIES[name])
        .trim();
      if (value) locs.push(value);
    }
    return locs;
  };

  // Child sitemap URLs come from a remote document: only follow http(s)
  // URLs on the site's own host.
  const isSameHostHttp = (candidate) => {
    try {
      const parsed = new URL(candidate);
      return /^https?:$/.test(parsed.protocol) && parsed.hostname === new URL(baseUrl).hostname;
    } catch (_) {
      return false;
    }
  };

  const urls = new Set();
  const locations = [
    `${baseUrl}/sitemap.xml`,
    `${baseUrl}/sitemap_index.xml`,
    `${baseUrl}/sitemap-stores.xml`,
    `${baseUrl}/sitemap-listings.xml`,
    `${baseUrl}/sitemaps/sitemap.xml`,
  ];

  for (const loc of locations) {
    const text = await fetchText(loc);
    if (!text) continue;

    const locs = extractLocs(text);
    if (/<sitemapindex[\s>]/i.test(text)) {
      const children = locs.filter(isSameHostHttp).slice(0, MAX_CHILD_SITEMAPS);
      for (const child of children) {
        let childText = null;
        try {
          childText = await fetchText(child);
        } catch (_) {
          childText = null; // one unreachable child must not abort discovery
        }
        if (!childText) continue;
        for (const u of extractLocs(childText)) urls.add(u);
      }
    } else {
      for (const u of locs) urls.add(u);
    }

    if (urls.size > 0) {
      if (logger) logger.info(` 🗺️ Sitemap: ${loc}`);
      break;
    }
  }
  return Array.from(urls);
}
|
|
186
|
+
|
|
187
|
+
/**
 * Probe a fixed list of conventional JSON endpoints, one after another, and
 * return the URLs extracted from the first endpoint that yields any.
 *
 * Bodies that are empty or not valid JSON are skipped silently.
 *
 * @param {string} baseUrl - site origin the endpoint paths are appended to
 * @param {object} [logger] - optional; only `logger.info(message)` is called
 * @returns {Promise<string[]>} extracted URLs, or an empty array
 */
async function probeApiEndpoints(baseUrl, logger) {
  const candidatePaths = [
    '/api/stores',
    '/api/v1/stores',
    '/api/listings',
    '/api/directory',
    '/wp-json/wp/v2/stores',
    '/stores.json',
    '/directory.json',
    '/listings.json',
  ];

  for (const path of candidatePaths) {
    const endpoint = `${baseUrl}${path}`;
    const body = await fetchText(endpoint, 5000);
    if (!body) continue;

    try {
      // Parse then re-serialize: rejects non-JSON bodies and normalizes escapes.
      const found = extractUrlsFromJson(JSON.stringify(JSON.parse(body)), baseUrl);
      if (found.length === 0) continue;
      if (logger) logger.info(` π‘ API: ${endpoint} (${found.length})`);
      return [...found];
    } catch (_) {}
  }
  return [];
}
|
|
214
|
+
|
|
215
|
+
/**
 * Pull candidate URLs out of a JSON (or JSON-like) text blob.
 *
 * Finds absolute http(s) URLs anywhere in the text, plus root-relative values
 * of "url" / "link" / "href" / "permalink" / "slug" keys, which are prefixed
 * with baseUrl. JSON string escapes for "/" and \uXXXX are undone first so
 * raw, unparsed response bodies are handled as well as re-serialized ones.
 *
 * @param {string} jsonText - text to scan
 * @param {string} baseUrl - origin prepended to root-relative matches
 * @returns {string[]} unique URLs in first-seen order (absolute matches first)
 */
function extractUrlsFromJson(jsonText, baseUrl) {
  if (!jsonText) return [];

  const text = String(jsonText)
    .replace(/\\\//g, '/')
    .replace(/\\u([0-9a-fA-F]{4})/g, (_, hex) => String.fromCharCode(parseInt(hex, 16)));

  // A backslash ends a URL: it can only be the start of a JSON escape such as \".
  const absolutePattern = /https?:\/\/[^\s"'<>\\]+/g;
  const relativePattern = /"(?:url|link|href|permalink|slug)"\s*:\s*"(\/[^"\\]+)"/gi;

  const urls = new Set();
  let m;
  while ((m = absolutePattern.exec(text)) !== null) urls.add(m[0]);
  while ((m = relativePattern.exec(text)) !== null) urls.add(baseUrl + m[1]);
  return [...urls];
}
|
|
224
|
+
|
|
225
|
+
/**
 * Keep only the URLs that look like individual store pages of the directory.
 *
 * A URL is rejected when it is on another hostname, matches a
 * NON_STORE_PATTERNS entry, is the directory page (or its parent path), has a
 * very short path, or ends in a NON_STORE_SLUGS segment. Of the remainder, a
 * URL is accepted when:
 *   - it is a key of logoMap (it appeared as a logo card), or
 *   - it sits one or two segments below the directory path or its parent, or
 *   - its path contains a STORE_URL_KEYWORDS fragment followed by a valid slug.
 *
 * Trailing slashes are ignored throughout, so /stores/nike and /stores/nike/
 * are classified the same way.
 *
 * @param {string[]} urls - candidate absolute URLs
 * @param {string} directoryUrl - absolute URL of the directory page
 * @param {Map<string, string>|null} [logoMap] - optional store URL → logo map
 * @returns {string[]} the accepted URLs, in input order
 */
function filterStoreUrls(urls, directoryUrl, logoMap = null) {
  const stripSlashes = (value) => value.replace(/\/+$/, '');

  const parsedDir = new URL(directoryUrl);
  const dirHostname = parsedDir.hostname;
  const dirPath = stripSlashes(parsedDir.pathname);
  const dirSegs = dirPath.split('/').filter(Boolean);
  const parentPath = dirSegs.length > 1 ? `/${dirSegs.slice(0, -1).join('/')}` : null;

  const logoMapUrls = new Set();
  if (logoMap) {
    for (const key of logoMap.keys()) logoMapUrls.add(stripSlashes(key).toLowerCase());
  }

  const isChildOf = (path, base) => Boolean(base) && path.startsWith(`${base}/`) && path !== base;

  // The part of `path` below `base` must be 1-2 segments whose first segment
  // contains a letter and is not a known non-store slug.
  const acceptUnder = (path, base) => {
    const segs = path.slice(base.length + 1).split('/').filter(Boolean);
    if (segs.length === 0 || segs.length > 2) return false;
    if (!/[a-z]/i.test(segs[0])) return false;
    return !NON_STORE_SLUGS.has(segs[0].toLowerCase());
  };

  return urls.filter((u) => {
    try {
      const parsed = new URL(u);
      if (parsed.hostname !== dirHostname) return false;
      if (NON_STORE_PATTERNS.some((p) => p.test(parsed.pathname))) return false;
      if (u === directoryUrl) return false;

      const path = stripSlashes(parsed.pathname);
      if (path.length < 4) return false;
      if (path === dirPath) return false;
      if (parentPath && path === parentPath) return false;

      const slug = path.split('/').pop();
      if (NON_STORE_SLUGS.has((slug || '').toLowerCase())) return false;

      if (logoMapUrls.has(path.toLowerCase()) || logoMapUrls.has(stripSlashes(u).toLowerCase())) {
        return true;
      }

      if (isChildOf(path, dirPath) && acceptUnder(path, dirPath)) return true;
      if (parentPath && isChildOf(path, parentPath) && acceptUnder(path, parentPath)) return true;

      // Use the slash-stripped path for both the keyword test and the parent
      // derivation; deriving the parent from the raw pathname left an empty
      // slug for URLs ending in "/" and rejected them.
      const lowerPath = path.toLowerCase();
      if (STORE_URL_KEYWORDS.some((kw) => lowerPath.includes(kw))) {
        // Final slug sanity check
        return acceptUnder(path, path.replace(/\/[^/]*$/, ''));
      }
      return false;
    } catch {
      return false;
    }
  });
}
|
|
271
|
+
|
|
272
|
+
/**
 * GET a URL and resolve with the response body as text.
 *
 * Never rejects: resolves with `null` on an invalid URL, a request error, a
 * timeout, a non-200 status, or a response that ends before completion.
 * Redirects are not followed.
 *
 * @param {string} url - absolute http(s) URL
 * @param {number} [timeout=8000] - socket timeout in milliseconds
 * @returns {Promise<string|null>}
 */
function fetchText(url, timeout = 8000) {
  return new Promise((resolve) => {
    let req;
    try {
      const mod = url.startsWith('https') ? https : http;
      req = mod.get(url, {
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; StoreScraperV5/1.0)',
          'Accept': 'text/html,application/json,*/*',
        },
        timeout,
      }, (res) => {
        if (res.statusCode !== 200) {
          res.resume(); // discard the body so the socket is released
          resolve(null);
          return;
        }
        // Decode as a stream so multi-byte characters split across chunks survive.
        res.setEncoding('utf8');
        let data = '';
        res.on('data', (chunk) => { data += chunk; });
        res.on('end', () => resolve(data));
        res.on('error', () => resolve(null));
        // 'close' without a prior 'end' means the body was cut short; after
        // 'end' this call is a no-op because the promise is already settled.
        res.on('close', () => resolve(null));
      });
    } catch (_) {
      // http.get throws synchronously for malformed URLs.
      resolve(null);
      return;
    }
    req.on('error', () => resolve(null));
    req.on('timeout', () => { req.destroy(); resolve(null); });
  });
}
|
|
291
|
+
|
|
292
|
+
/**
 * Backwards-compat for first-slice callers: runs discoverStores without a
 * logger and returns only the list of store URLs.
 */
async function discoverStoreLinks(browser, directoryUrl) {
  const discovery = await discoverStores(browser, directoryUrl, null);
  return discovery.storeUrls;
}
|
|
297
|
+
|
|
298
|
+
module.exports = { discoverStores, discoverStoreLinks, collectDirectoryLogoMap };
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { URL } = require('url');
|
|
4
|
+
const { loadPage, newPage } = require('./browser');
|
|
5
|
+
const {
|
|
6
|
+
parseJsonLdHours,
|
|
7
|
+
parseFreeFormHours,
|
|
8
|
+
canonicalize,
|
|
9
|
+
validateCanonical,
|
|
10
|
+
} = require('./hoursParser');
|
|
11
|
+
|
|
12
|
+
/**
 * Pick the best external link to follow when the mall page has no hours.
 *
 * Only http(s) links whose origin differs from mallOrigin are considered.
 * Each one is scored from its lower-cased text and href:
 *   +100  text names business/store hours explicitly
 *   +30   text contains the word "hour(s)"
 *   +50   href looks like a store-locator / location path
 *   +25   href contains "locator"
 *   -100  href points at a social network, Google or a maps host
 * The highest score wins; on a tie the earlier link is kept.
 *
 * @param {Array<{href:string, text:string}>} links - links scraped from the store page
 * @param {string} mallOrigin - the mall site origin to exclude
 * @returns {object|null} a copy of the winning link with an added `_score`,
 *   or null when there is no external candidate
 */
function chooseExternalLink(links, mallOrigin) {
  if (!links || links.length === 0) return null;

  const isExternalHttp = (link) => {
    try {
      const parsed = new URL(link.href);
      return parsed.origin !== mallOrigin && /^https?:/.test(parsed.protocol);
    } catch {
      return false;
    }
  };

  // [field, pattern, points]
  const rules = [
    ['text', /business hours|store hours|hours of operation|click here for.*hours/, 100],
    ['text', /\bhours?\b/, 30],
    ['href', /store-locator|storelocation|store\/[^/]+\/?$|locations?\/|location\/.+/, 50],
    ['href', /locator/, 25],
    ['href', /instagram|facebook|twitter|tiktok|youtube|pinterest|google|maps\./, -100],
  ];

  const scoreOf = (link) => {
    const haystack = {
      text: (link.text || '').toLowerCase(),
      href: (link.href || '').toLowerCase(),
    };
    let total = 0;
    for (const [field, pattern, points] of rules) {
      if (pattern.test(haystack[field])) total += points;
    }
    return total;
  };

  let best = null;
  for (const link of links) {
    if (!isExternalHttp(link)) continue;
    const candidate = { ...link, _score: scoreOf(link) };
    // Strictly greater keeps the earliest link among equal scores.
    if (best === null || candidate._score > best._score) best = candidate;
  }
  return best;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Visit an external URL and try to extract hours via layers 1-2 (JSON-LD,
 * then free-form text). The first layer whose canonical form validates wins.
 *
 * @param {object} browser - handed to newPage()
 * @param {string} url - external page to load; falsy returns null immediately
 * @param {{logger?: object}} [options] - `logger.warn(message)` is called on failure
 * @returns {Promise<{canonical: string, source: 'external-jsonld'|'external-freeform', sourceUrl: *}|null>}
 *   null when nothing validates or the page fails to load
 */
async function extractHoursFromExternal(browser, url, { logger } = {}) {
  if (!url) return null;

  const page = await newPage(browser);
  try {
    const data = await loadPage(page, url, { waitUntil: 'networkidle2', timeout: 30000 });

    // Parsers are wrapped in thunks so the free-form pass only runs when the
    // JSON-LD pass did not produce a valid result.
    const layers = [
      ['external-jsonld', () => parseJsonLdHours(data.jsonLd)],
      ['external-freeform', () => parseFreeFormHours(data.text)],
    ];

    for (const [source, parse] of layers) {
      const parsed = parse();
      if (!parsed) continue;
      const canonical = canonicalize(parsed);
      if (validateCanonical(canonical).ok) {
        return { canonical, source, sourceUrl: data.finalUrl };
      }
    }
    return null;
  } catch (err) {
    if (logger) logger.warn(` β External fetch failed (${url}): ${err.message}`);
    return null;
  } finally {
    try { await page.close(); } catch (_) {}
  }
}
|
|
88
|
+
|
|
89
|
+
module.exports = { chooseExternalLink, extractHoursFromExternal };
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Hours parser: accepts varied free-form and structured inputs, emits canonical
|
|
5
|
+
* "Monday = 10:00 AM - 9:00 PM;Tuesday = ..." string (v4-compatible).
|
|
6
|
+
*
|
|
7
|
+
* Public API:
|
|
8
|
+
 *   parseFreeFormHours(text)      → dayMap { Monday: "10:00 AM - 9:00 PM", ... } | null
|
|
9
|
+
 *   parseJsonLdHours(jsonLdArr)   → dayMap | null
|
|
10
|
+
 *   parseSchemaOpeningHours(s)    → dayMap | null (legacy "Mo-Fr 10:00-21:00" form)
|
|
11
|
+
 *   canonicalize(dayMap)          → canonical string
|
|
12
|
+
 *   validateCanonical(s)          → { ok, missing[], reason }
|
|
13
|
+
 *   normalizeDayName(s)           → "Monday" | null
|
|
14
|
+
 *   normalizeTime(s)              → "10:00 AM" | "Closed" | "Open 24 Hours" | null
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
// Canonical day names, Monday first; the order drives range expansion and output.
const DAYS = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'];
// Canonical day name → position in DAYS.
const DAY_INDEX = Object.fromEntries(DAYS.map((d, i) => [d, i]));

// Generous day-name lookup including 2-3 letter abbrevs and Schema.org full forms.
const DAY_LOOKUP = {
  mon: 'Monday', monday: 'Monday', mo: 'Monday',
  tue: 'Tuesday', tues: 'Tuesday', tuesday: 'Tuesday', tu: 'Tuesday',
  wed: 'Wednesday', weds: 'Wednesday', wednesday: 'Wednesday', we: 'Wednesday',
  thu: 'Thursday', thur: 'Thursday', thurs: 'Thursday', thursday: 'Thursday', th: 'Thursday',
  fri: 'Friday', friday: 'Friday', fr: 'Friday',
  sat: 'Saturday', saturday: 'Saturday', sa: 'Saturday',
  sun: 'Sunday', sunday: 'Sunday', su: 'Sunday',
};

/**
 * Map a day token to its canonical name.
 *
 * Case, dots and whitespace are ignored. For URI-style values such as
 * "http://schema.org/Monday" the part after the last "/" is tried first.
 *
 * @param {*} raw - day token; falsy values return null
 * @returns {string|null} one of DAYS, or null when unrecognized
 */
function normalizeDayName(raw) {
  if (!raw) return null;
  const key = String(raw).trim().toLowerCase().replace(/[.\s]/g, '');
  // Handle Schema.org URI form: http://schema.org/Monday
  const tail = key.split('/').pop();

  // Own-property lookup only: "constructor" or "__proto__" must not resolve
  // to members inherited from Object.prototype.
  const lookup = (k) => (Object.prototype.hasOwnProperty.call(DAY_LOOKUP, k) ? DAY_LOOKUP[k] : null);
  return lookup(tail) || lookup(key);
}
|
|
38
|
+
|
|
39
|
+
/**
 * Parse a single time token to "H:MM AM/PM", "Closed" or "Open 24 Hours".
 *
 * Accepts: "10", "10am", "10 am", "10:00am", "10:00 AM", "10:00", "21:00",
 *          "09:00:00" (seconds are ignored), "noon", "midnight", "closed",
 *          "by appointment", "24 hours".
 *
 * Tokens without am/pm are read as 24-hour times ("24:00" wraps to midnight).
 * An hour above 12 combined with "pm" is read as a 24-hour time with a
 * redundant suffix ("15:00 pm" → "3:00 PM"); combined with "am" it is
 * contradictory and rejected.
 *
 * @param {*} raw - time token; falsy values return null
 * @returns {string|null} normalized time, or null when unparseable
 */
function normalizeTime(raw) {
  if (!raw) return null;
  const s = String(raw).trim().toLowerCase();
  if (!s) return null;

  if (/^closed$/.test(s) || /^by appointment$/.test(s)) return 'Closed';
  if (/^noon|^12:?00\s*(p\.?m\.?)?$/.test(s)) return '12:00 PM';
  if (/^midnight|^12:?00\s*a\.?m\.?$/.test(s)) return '12:00 AM';
  if (/^24\s*h(ou)?rs?$|^open\s*24/.test(s)) return 'Open 24 Hours';

  // "H[:MM[:SS]][ ]am|pm", then 24h "HH:MM[:SS]", then bare "HH".
  const m = s.match(/^(\d{1,2})(?:[:.](\d{2})(?:[:.]\d{2})?)?\s*([ap])\.?\s*m?\.?$/)
    || s.match(/^(\d{1,2})[:.](\d{2})(?:[:.]\d{2})?$/)
    || s.match(/^(\d{1,2})$/);
  if (!m) return null;

  let hour = parseInt(m[1], 10);
  const minute = m[2] ? parseInt(m[2], 10) : 0;
  const meridiem = m[3];

  if (Number.isNaN(hour) || Number.isNaN(minute) || hour > 24 || minute > 59) return null;

  if (meridiem) {
    if (hour > 12) {
      // Already a 24-hour value: "am" contradicts it, and 24 is out of range.
      if (meridiem === 'a' || hour > 23) return null;
    } else if (hour === 12) {
      hour = meridiem === 'a' ? 0 : 12; // 12am = 00, 12pm = 12
    } else if (meridiem === 'p') {
      hour += 12;
    }
  } else if (hour === 24) {
    hour = 0; // 24:xx wraps to midnight
  }

  const hour12 = hour % 12 === 0 ? 12 : hour % 12;
  const suffix = hour < 12 ? 'AM' : 'PM';
  return `${hour12}:${String(minute).padStart(2, '0')} ${suffix}`;
}
|
|
82
|
+
|
|
83
|
+
/**
 * Expand a day range such as "mon-fri" into the list of canonical day names
 * it covers, both ends included. Ranges may wrap past Sunday (e.g. Sat-Mon).
 *
 * @param {*} startRaw - first day of the range, in any form normalizeDayName accepts
 * @param {*} endRaw - last day of the range
 * @returns {string[]|null} day names in order, or null if either end is unparseable
 */
function expandDayRange(startRaw, endRaw) {
  const firstDay = normalizeDayName(startRaw);
  const lastDay = normalizeDayName(endRaw);
  if (!firstDay || !lastDay) return null;

  const from = DAY_INDEX[firstDay];
  const to = DAY_INDEX[lastDay];
  if (from === undefined || to === undefined) return null;

  // Number of forward steps from `from` to `to`, wrapping around the week.
  const steps = (to - from + 7) % 7;
  return Array.from({ length: steps + 1 }, (_, offset) => DAYS[(from + offset) % 7]);
}
|
|
105
|
+
|
|
106
|
+
/**
 * Parse one entry like "Mon-Fri 10am-9pm", "Saturday 10:00 AM - 9:00 PM",
 * "Sun. noon - 6pm" or "Sunday: Closed".
 *
 * Before matching, Unicode hyphen/dash variants (U+2010–U+2015) and the minus
 * sign (U+2212) become "-", " to " becomes " - ", " through " becomes "-",
 * whitespace runs collapse, and a trailing "(...)" note or "* ..." footnote
 * is removed.
 *
 * @param {*} text - a single line/fragment of hours text
 * @returns {{days: string[], open: string, close: string}|{days: string[], closed: true}|null}
 *   null when the text is not a recognizable entry
 */
function parseHoursEntry(text) {
  if (!text) return null;

  const s = String(text)
    .trim()
    // Written as escapes so the pattern does not depend on the file's encoding.
    .replace(/[\u2010-\u2015\u2212]/g, '-')
    .replace(/\s+to\s+/gi, ' - ')
    .replace(/\s+through\s+/gi, '-') // for day ranges
    .replace(/\s+/g, ' ')
    // Strip trailing parenthetical/suffix junk like "(appointment only)",
    // "* by appointment", "(walk-ins welcome)": it breaks the end anchor
    // but does not affect the times themselves.
    .replace(/\s*\([^)]*\)\s*$/g, '')
    .replace(/\s*\*.*$/g, '')
    .trim();

  const resolveDays = (first, last) => (
    last ? expandDayRange(first, last) : [normalizeDayName(first)].filter(Boolean)
  );

  // Closed line: "Sunday: Closed", "Monday Closed", "Mon-Fri: Closed"
  const closedMatch = s.match(
    /^([a-z]+)\.?(?:\s*-\s*([a-z]+)\.?)?\s*[:\-]?\s*(closed|by appointment(?: only)?)$/i
  );
  if (closedMatch) {
    const days = resolveDays(closedMatch[1], closedMatch[2]);
    if (!days || days.length === 0) return null;
    return { days, closed: true };
  }

  // Day(-Day)? : open - close
  // Examples:
  //   "Monday: 6:00 AM to 8:00 PM"   ("to" already normalized to "-")
  //   "Mon-Fri 10am-9pm"
  //   "Saturday 10:00 - 18:00"
  //   "Sunday noon - midnight"
  const m = s.match(
    /^([a-z]+)\.?(?:\s*-\s*([a-z]+)\.?)?\s*[:\-]?\s*([0-9][0-9:.\sapm]*?|noon|midnight)\s*-\s*([0-9][0-9:.\sapm]*|noon|midnight)$/i
  );
  if (!m) return null;

  const days = resolveDays(m[1], m[2]);
  if (!days || days.length === 0) return null;

  const open = normalizeTime(m[3]);
  const close = normalizeTime(m[4]);
  if (!open || !close) return null;

  return { days, open, close };
}
|
|
161
|
+
|
|
162
|
+
/**
 * Parse a free-form hours block: multi-line, or separated by ";", "|" or
 * commas that precede a day name.
 *
 * Within one line, day names listed without hours are carried forward to the
 * next entry that has hours, so "Mon, Wed, Fri 10am-5pm" and the Schema.org
 * form "Mo,Tu,We 09:00-17:00" assign the hours to every listed day.
 * When a day appears more than once, the later entry wins.
 *
 * @param {*} text - raw hours text
 * @returns {Object<string, string>|null} dayMap such as
 *   { Monday: "10:00 AM - 9:00 PM", Sunday: "Closed" }, or null if nothing
 *   was recognized
 */
function parseFreeFormHours(text) {
  if (!text) return null;

  // Split on a comma only when a day name (or abbreviation) follows it.
  const commaBeforeDay = /,\s*(?=(?:mon|tue|wed|thu|fri|sat|sun|mo|tu|we|th|fr|sa|su)[a-z]*\b)/i;
  // A fragment that is only a day or a day range, e.g. "Mon", "Tue.", "Sat-Sun:".
  const bareDays = /^([a-z]+)\.?(?:\s*[-\u2010-\u2015]\s*([a-z]+)\.?)?:?$/i;

  const dayMap = {};
  let parsedAny = false;

  for (const line of String(text).split(/\n+|;+|\s*\|\s*/)) {
    // Days seen without hours; never carried across lines.
    let pendingDays = [];

    const parts = line.split(commaBeforeDay).map((p) => p.trim()).filter(Boolean);
    for (const part of parts) {
      const entry = parseHoursEntry(part);
      if (entry) {
        parsedAny = true;
        const value = entry.closed ? 'Closed' : `${entry.open} - ${entry.close}`;
        for (const day of [...pendingDays, ...entry.days]) dayMap[day] = value;
        pendingDays = [];
        continue;
      }

      const bare = part.match(bareDays);
      const days = bare
        ? (bare[2] ? expandDayRange(bare[1], bare[2]) : [normalizeDayName(bare[1])].filter(Boolean))
        : null;
      // Anything that is neither an entry nor a bare day breaks the list.
      pendingDays = days && days.length > 0 ? pendingDays.concat(days) : [];
    }
  }

  return parsedAny ? dayMap : null;
}
|
|
195
|
+
|
|
196
|
+
/**
 * Parse a Schema.org-style "openingHours" property.
 * Format: "Mo-Fr 10:00-21:00, Sa 09:00-19:00, Su 11:00-18:00"
 *
 * An array is parsed element by element and merged, later elements
 * overriding earlier ones for the same day. Anything else is stringified and
 * handed to parseFreeFormHours.
 *
 * @param {*} value - string, array of strings (arrays may nest), or falsy
 * @returns {Object<string, string>|null} dayMap, or null if nothing parsed
 */
function parseSchemaOpeningHours(value) {
  if (!value) return null;
  if (!Array.isArray(value)) return parseFreeFormHours(String(value));

  const merged = value.reduce((acc, item) => {
    const partial = parseSchemaOpeningHours(item);
    return partial ? Object.assign(acc, partial) : acc;
  }, {});
  return Object.keys(merged).length > 0 ? merged : null;
}
|
|
213
|
+
|
|
214
|
+
/**
 * Walk parsed JSON-LD and collect opening hours into a dayMap.
 * Supports:
 *   - openingHoursSpecification (object or array of {dayOfWeek, opens, closes})
 *   - openingHours (string or array of strings, Schema.org compact form)
 *   - any nesting, including @graph (every object-valued property is walked)
 *
 * A specification with neither `opens` nor `closes`, or with both set to
 * "00:00", marks its days as "Closed". Within one node, `openingHours`
 * overrides `openingHoursSpecification` for the same day.
 *
 * @param {*} jsonLdArr - one parsed JSON-LD value or an array of them
 * @returns {Object<string, string>|null} dayMap, or null if no hours found
 */
function parseJsonLdHours(jsonLdArr) {
  if (!jsonLdArr) return null;
  const roots = Array.isArray(jsonLdArr) ? jsonLdArr : [jsonLdArr];
  const dayMap = {};

  // Only the literal 00:00 counts: "24:00" also normalizes to 12:00 AM but
  // is a closing time, not a closed marker.
  const isZeroTime = (value) => /^0{1,2}:00(?::00)?$/.test(String(value).trim());

  const applySpec = (spec) => {
    if (!spec) return;
    const rawDays = spec.dayOfWeek;
    const dayList = Array.isArray(rawDays) ? rawDays : rawDays ? [rawDays] : [];

    // The hours belong to the spec, not the day: work them out once.
    const open = normalizeTime(spec.opens);
    const close = normalizeTime(spec.closes);
    let value = null;
    if (open && close) {
      value = isZeroTime(spec.opens) && isZeroTime(spec.closes)
        ? 'Closed'
        : `${open} - ${close}`;
    } else if (spec.closes === undefined && spec.opens === undefined) {
      value = 'Closed';
    }
    if (value === null) return;

    for (const rawDay of dayList) {
      const dayName = normalizeDayName(rawDay);
      if (dayName) dayMap[dayName] = value;
    }
  };

  const visit = (node) => {
    if (!node || typeof node !== 'object') return;
    if (Array.isArray(node)) {
      node.forEach(visit);
      return;
    }

    if (node.openingHoursSpecification) {
      const specs = Array.isArray(node.openingHoursSpecification)
        ? node.openingHoursSpecification
        : [node.openingHoursSpecification];
      specs.forEach(applySpec);
    }

    if (node.openingHours) {
      const compact = parseSchemaOpeningHours(node.openingHours);
      if (compact) Object.assign(dayMap, compact);
    }

    // Recurse into every other object-valued property; this covers @graph
    // too, so it needs no separate visit.
    for (const key of Object.keys(node)) {
      if (key === 'openingHoursSpecification' || key === 'openingHours') continue;
      const child = node[key];
      if (child && typeof child === 'object') visit(child);
    }
  };

  roots.forEach(visit);
  return Object.keys(dayMap).length > 0 ? dayMap : null;
}
|
|
267
|
+
|
|
268
|
+
/**
 * Convert a dayMap to the canonical "Monday = 10:00 AM - 9:00 PM;..." string.
 *
 * Days are emitted in DAYS order. A day without a (truthy) entry is left out,
 * unless `fillClosed` is set, in which case it is emitted as "Closed".
 *
 * @param {Object<string, string>} dayMap - canonical day name → hours text
 * @param {{fillClosed?: boolean}} [options]
 * @returns {string} ";"-joined segments; '' for a falsy dayMap
 */
function canonicalize(dayMap, { fillClosed = false } = {}) {
  if (!dayMap) return '';
  return DAYS
    .filter((day) => dayMap[day] || fillClosed)
    .map((day) => `${day} = ${dayMap[day] || 'Closed'}`)
    .join(';');
}
|
|
284
|
+
|
|
285
|
+
/**
 * Check a canonical hours string ("Monday = ...;Tuesday = ...").
 *
 * Lenient by default: valid as soon as at least one day segment is present.
 * With `strict: true`, all 7 days must be represented.
 *
 * @param {string} canonical - string as produced by canonicalize()
 * @param {{strict?: boolean}} [options]
 * @returns {{ok: boolean, missing: string[], reason: string|null}}
 *   `missing` lists the days with no segment; `reason` is 'empty',
 *   'missing-days', 'no-days-parsed', or null when ok.
 */
function validateCanonical(canonical, { strict = false } = {}) {
  // Return a copy so callers cannot mutate the shared DAYS constant.
  if (!canonical) return { ok: false, missing: [...DAYS], reason: 'empty' };

  const present = new Set();
  for (const seg of canonical.split(';')) {
    // Tolerate whitespace around the ";" separators.
    const m = seg.trim().match(/^(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday)\s*=/);
    if (m) present.add(m[1]);
  }

  const missing = DAYS.filter((d) => !present.has(d));
  if (strict && missing.length > 0) return { ok: false, missing, reason: 'missing-days' };
  if (present.size === 0) return { ok: false, missing, reason: 'no-days-parsed' };
  return { ok: true, missing, reason: null };
}
|
|
301
|
+
|
|
302
|
+
module.exports = {
|
|
303
|
+
DAYS,
|
|
304
|
+
normalizeDayName,
|
|
305
|
+
normalizeTime,
|
|
306
|
+
expandDayRange,
|
|
307
|
+
parseHoursEntry,
|
|
308
|
+
parseFreeFormHours,
|
|
309
|
+
parseSchemaOpeningHours,
|
|
310
|
+
parseJsonLdHours,
|
|
311
|
+
canonicalize,
|
|
312
|
+
validateCanonical,
|
|
313
|
+
};
|