mallmaverick-store-scraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/browser.js ADDED
@@ -0,0 +1,234 @@
1
+ 'use strict';
2
+
3
+ const puppeteer = require('puppeteer');
4
+ const path = require('path');
5
+ const fs = require('fs');
6
+
7
/** User-Agent string applied to every page created by newPage(). */
const DEFAULT_UA = [
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
  '(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
].join(' ');

/** Options spread into `puppeteer.launch()` by launchBrowser(). */
const BROWSER_LAUNCH_OPTS = {
  // Chromium command-line flags.
  args: [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    '--disable-dev-shm-usage',
    '--disable-accelerated-2d-canvas',
    '--no-first-run',
    '--no-zygote',
    '--disable-gpu',
    '--window-size=1440,900',
  ],
};

/**
 * URL patterns identifying directory-listing JSON endpoints. A response URL
 * matching any one of them is captured in directory mode.
 */
const XHR_CAPTURE_PATTERNS = [
  /stores?\.json/i,
  /\/api\/(stores?|listings?|tenants?|directory|merchants?|retailers?)/i,
  /\/wp-json\/.*\/(stores?|listings?)/i,
  /directory\.json/i,
];
30
+
31
/**
 * Launch a Puppeteer browser using the shared BROWSER_LAUNCH_OPTS.
 *
 * @param {{ headless?: boolean }} [options] - `headless` defaults to true;
 *   true is passed to Puppeteer as 'new', false is passed through as false.
 * @returns {Promise<object>} Whatever `puppeteer.launch` resolves to.
 */
async function launchBrowser({ headless = true } = {}) {
  const headlessMode = headless ? 'new' : false;
  const launchOptions = { headless: headlessMode, ...BROWSER_LAUNCH_OPTS };
  return puppeteer.launch(launchOptions);
}
37
+
38
/**
 * Open a new page on the given browser and apply the scraper defaults:
 * user agent, 1440x900 viewport, extra request headers and 60s timeouts.
 *
 * @param {object} browser - Object exposing an async `newPage()`.
 * @returns {Promise<object>} The configured page.
 */
async function newPage(browser) {
  const viewport = { width: 1440, height: 900 };
  const extraHeaders = {
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
  };
  const timeoutMs = 60000;

  const page = await browser.newPage();
  await page.setUserAgent(DEFAULT_UA);
  await page.setViewport(viewport);
  await page.setExtraHTTPHeaders(extraHeaders);
  page.setDefaultNavigationTimeout(timeoutMs);
  page.setDefaultTimeout(timeoutMs);
  return page;
}
50
+
51
/**
 * Attach request/response interception to a page and collect JSON bodies.
 *
 * Only responses whose content-type contains `application/json` are
 * considered:
 *   - directoryMode=true  → only URLs matching XHR_CAPTURE_PATTERNS
 *     (directory listing APIs), with no size filter
 *   - directoryMode=false → any JSON body longer than 50 and shorter than
 *     200000 characters (store detail XHRs)
 *
 * Both returned arrays are live: they fill as responses arrive.
 *
 * @param {object} page - Puppeteer-style page (`setRequestInterception`, `on`).
 * @param {{ directoryMode?: boolean }} [options]
 * @returns {Promise<{ interceptedJson: string[],
 *   interceptedRaw: Array<{ url: string, text: string }> }>}
 */
async function attachXhrInterceptor(page, { directoryMode = false } = {}) {
  const interceptedJson = [];
  const interceptedRaw = [];

  const record = (url, text) => {
    interceptedRaw.push({ url, text });
    interceptedJson.push(text);
  };

  await page.setRequestInterception(true);

  page.on('request', (req) => {
    try {
      // continue() is asynchronous: a rejection (request already handled,
      // page closed, ...) has to be handled on the promise itself. A plain
      // try/catch only sees synchronous throws and would leave an unhandled
      // rejection behind.
      Promise.resolve(req.continue()).catch(() => {});
    } catch (_) {
      // Best effort: a request that cannot be continued is ignored.
    }
  });

  page.on('response', async (response) => {
    try {
      const url = response.url();
      // Media types are case-insensitive, so normalise before matching.
      const contentType = (response.headers()['content-type'] || '').toLowerCase();
      if (!contentType.includes('application/json')) return;

      if (directoryMode) {
        // Check the URL first so non-matching bodies are never read.
        if (!XHR_CAPTURE_PATTERNS.some((p) => p.test(url))) return;
        record(url, await response.text());
        return;
      }

      const text = await response.text();
      if (text.length > 50 && text.length < 200000) record(url, text);
    } catch (_) {
      // Best effort: bodies that cannot be read (redirects, closed page)
      // are skipped.
    }
  });

  return { interceptedJson, interceptedRaw };
}
86
+
87
/**
 * Navigate to a URL and collect page data, escalating effort per attempt.
 *   attempt 1: domcontentloaded, default scroll, 1s settle
 *   attempt 2: domcontentloaded, slower scroll, clickExpandables, 2s settle
 *   any other: slowest scroll, clickExpandables, 3s settle
 * Navigation waits for networkidle2 once attempt >= 3, otherwise for
 * domcontentloaded.
 *
 * @param {object} page
 * @param {string} url
 * @param {{ attempt?: number, timeout?: number }} [options]
 * @returns {Promise<object>} Result of collectPageData(page).
 */
async function loadPageWithStrategy(page, url, { attempt = 1, timeout = 45000 } = {}) {
  const waitUntil = attempt >= 3 ? 'networkidle2' : 'domcontentloaded';
  await page.goto(url, { waitUntil, timeout });

  switch (attempt) {
    case 1:
      await autoScroll(page);
      await sleep(1000);
      break;
    case 2:
      await autoScroll(page, { distance: 300, delay: 200 });
      await clickExpandables(page);
      await sleep(2000);
      break;
    default:
      await autoScroll(page, { distance: 200, delay: 300 });
      await clickExpandables(page);
      await sleep(3000);
      break;
  }

  return collectPageData(page);
}
112
+
113
/**
 * Plain page load without attempt escalation: navigate, scroll once with the
 * default settings, pause 500ms, then collect the page data.
 *
 * @param {object} page
 * @param {string} url
 * @param {{ waitUntil?: string, timeout?: number }} [options]
 * @returns {Promise<object>} Result of collectPageData(page).
 */
async function loadPage(page, url, { waitUntil = 'domcontentloaded', timeout = 45000 } = {}) {
  const navigation = { waitUntil, timeout };
  await page.goto(url, navigation);
  await autoScroll(page);
  await sleep(500);
  const data = await collectPageData(page);
  return data;
}
122
+
123
/**
 * Snapshot the currently loaded page.
 *
 * @param {object} page
 * @returns {Promise<{ html: string, text: string, title: string, h1: string,
 *   jsonLd: Array, metaTags: object, finalUrl: string }>}
 */
async function collectPageData(page) {
  const snapshot = {};
  snapshot.html = await page.content();
  // innerText of <body>, or '' when the document has no body.
  snapshot.text = await page.evaluate(() => (document.body ? document.body.innerText : ''));
  snapshot.title = await page.title();
  snapshot.h1 = await extractH1(page);
  snapshot.jsonLd = await extractJsonLd(page);
  snapshot.metaTags = await extractMetaTags(page);
  snapshot.finalUrl = page.url();
  return snapshot;
}
133
+
134
/**
 * Scroll the page downward in fixed steps so lazy-loaded content gets a
 * chance to render. Stops once the total scrolled distance reaches the
 * current scroll height, or after a 15s safety limit. Best effort: any
 * evaluation error is swallowed.
 *
 * @param {object} page - Object exposing `evaluate(fn, arg)`.
 * @param {{ distance?: number, delay?: number }} [opts] - Pixels per step
 *   (default 400) and milliseconds between steps (default 150).
 * @returns {Promise<void>}
 */
async function autoScroll(page, opts = {}) {
  const { distance = 400, delay = 150 } = opts || {};
  try {
    // The callback runs inside the page, so it must be self-contained.
    await page.evaluate(({ distance, delay, maxMs }) => new Promise((resolve) => {
      const root = document.body || document.documentElement;
      if (!root) {
        // Nothing to scroll; previously every tick threw and the call
        // stalled until the safety timeout expired.
        resolve();
        return;
      }
      let total = 0;
      let ticker = null;
      let guard = null;
      // Clear both timers so nothing is left pending in the page.
      const finish = () => {
        clearInterval(ticker);
        clearTimeout(guard);
        resolve();
      };
      ticker = setInterval(() => {
        // Re-read the height on each tick: it can grow as content loads.
        const height = root.scrollHeight;
        window.scrollBy(0, distance);
        total += distance;
        if (total >= height) finish();
      }, delay);
      guard = setTimeout(finish, maxMs);
    }), { distance, delay, maxMs: 15000 });
  } catch (_) {
    // Best effort: scrolling failures must not abort the scrape.
  }
}
154
+
155
/**
 * Click every element that looks like an expand / toggle / "show more"
 * control, then pause 500ms. Best effort: errors are swallowed.
 *
 * Each element is clicked at most once even when it matches several
 * selectors; clicking a toggle twice would expand it and collapse it again.
 *
 * @param {object} page - Object exposing `evaluate(fn)`.
 * @returns {Promise<void>}
 */
async function clickExpandables(page) {
  try {
    // The callback runs inside the page, so it must be self-contained.
    await page.evaluate(() => {
      const selectors = [
        '[data-toggle]', '[aria-expanded="false"]',
        '.accordion-button', '.expand-btn', '.show-more',
        '[class*="expand"]', '[class*="toggle"]', '[class*="load-more"]',
        'button[class*="more"]', 'a[class*="more"]',
      ];
      const clicked = new Set();
      for (const sel of selectors) {
        for (const el of Array.from(document.querySelectorAll(sel))) {
          if (clicked.has(el)) continue;
          clicked.add(el);
          try { el.click(); } catch (_) { /* ignore controls that refuse the click */ }
        }
      }
    });
    await sleep(500);
  } catch (_) {
    // Best effort: expansion failures must not abort the scrape.
  }
}
171
+
172
/**
 * Return the trimmed text of the first <h1> whose text is non-empty and
 * shorter than 200 characters, or '' when there is none or evaluation fails.
 *
 * @param {object} page - Object exposing `evaluate(fn)`.
 * @returns {Promise<string>}
 */
async function extractH1(page) {
  try {
    return await page.evaluate(() => {
      const headings = Array.from(document.querySelectorAll('h1'))
        .map((node) => (node.innerText || node.textContent || '').trim());
      const usable = headings.find((label) => label.length > 0 && label.length < 200);
      return usable === undefined ? '' : usable;
    });
  } catch (_) {
    return '';
  }
}
184
+
185
/**
 * Parse every `<script type="application/ld+json">` on the page. Top-level
 * arrays are flattened into the result; scripts that fail to parse are
 * skipped. Returns [] when evaluation fails.
 *
 * @param {object} page - Object exposing `evaluate(fn)`.
 * @returns {Promise<Array>}
 */
async function extractJsonLd(page) {
  try {
    return await page.evaluate(() => {
      const collected = [];
      const tags = Array.from(document.querySelectorAll('script[type="application/ld+json"]'));
      tags.forEach((tag) => {
        let data;
        try {
          data = JSON.parse(tag.textContent);
        } catch (_) {
          return; // malformed JSON-LD: skip this script
        }
        if (Array.isArray(data)) {
          data.forEach((item) => { collected.push(item); });
        } else {
          collected.push(data);
        }
      });
      return collected;
    });
  } catch (_) {
    return [];
  }
}
201
+
202
/**
 * Collect <meta> tags into a key → content map. The key is the first
 * non-empty of the `name`, `property` and `itemprop` attributes; tags with
 * no key or no content are skipped, and later duplicates overwrite earlier
 * ones. Returns {} when evaluation fails.
 *
 * @param {object} page - Object exposing `evaluate(fn)`.
 * @returns {Promise<object>}
 */
async function extractMetaTags(page) {
  try {
    return await page.evaluate(() => {
      const tags = {};
      for (const el of Array.from(document.querySelectorAll('meta'))) {
        const key = el.getAttribute('name')
          || el.getAttribute('property')
          || el.getAttribute('itemprop');
        const content = el.getAttribute('content');
        if (key && content) {
          tags[key] = content;
        }
      }
      return tags;
    });
  } catch (_) {
    return {};
  }
}
215
+
216
/**
 * Save a viewport PNG screenshot under `screenshotDir` and return it as a
 * base64 string. Returns null when no directory is given or on any failure.
 *
 * The file name is the store URL with non-alphanumerics replaced by `_`,
 * truncated to 60 characters, plus `_attempt<N>.png`. Because truncated
 * names can collide, the returned data comes from the buffer handed back by
 * `page.screenshot` rather than from re-reading the file; the file is only
 * read when no buffer is returned.
 *
 * @param {object} page - Object exposing `screenshot(options)`.
 * @param {string} storeUrl
 * @param {string} screenshotDir - Created recursively if missing.
 * @param {number} [attempt=1]
 * @returns {Promise<string|null>}
 */
async function captureScreenshot(page, storeUrl, screenshotDir, attempt = 1) {
  if (!screenshotDir) return null;
  try {
    fs.mkdirSync(screenshotDir, { recursive: true });
    const safe = storeUrl.replace(/[^a-z0-9]/gi, '_').slice(0, 60);
    const file = path.join(screenshotDir, `${safe}_attempt${attempt}.png`);
    const image = await page.screenshot({ path: file, fullPage: false, type: 'png' });
    // Prefer the in-memory bytes; a string result is not raw image data.
    const bytes = image && typeof image !== 'string'
      ? Buffer.from(image)
      : fs.readFileSync(file);
    return bytes.toString('base64');
  } catch (_) {
    // Best effort: a failed screenshot must not abort the scrape.
    return null;
  }
}
226
+
227
/**
 * Resolve after roughly `ms` milliseconds.
 *
 * @param {number} ms
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
228
+
229
+ module.exports = {
230
+ launchBrowser, newPage, loadPage, loadPageWithStrategy,
231
+ attachXhrInterceptor, captureScreenshot,
232
+ autoScroll, clickExpandables, sleep,
233
+ XHR_CAPTURE_PATTERNS,
234
+ };
@@ -0,0 +1,235 @@
1
+ 'use strict';
2
+
3
+ const { URL } = require('url');
4
+
5
+ /**
6
+ * Deterministic field extractors — no LLM, regex/DOM only.
7
+ * Each function returns the field value or '' if not found.
8
+ */
9
+
10
/**
 * Ten-digit phone number with an optional leading "+1"/"1", optional
 * parentheses around the area code, and optional space / dot / dash
 * separators. Area code and exchange must start with 2-9.
 * Capture groups: 1 = area code, 2 = exchange, 3 = last four digits.
 *
 * The pattern is global, so `lastIndex` is stateful: reset it before exec().
 */
const PHONE_RE = /(?:\+?1[\s.\-]?)?\(?([2-9]\d{2})\)?[\s.\-]?([2-9]\d{2})[\s.\-]?(\d{4})\b/g;
11
+
12
/**
 * Pull one phone number out of a page, formatted "(XXX) XXX-XXXX", or ''.
 * Sources are tried in order:
 *   1. a JSON-LD `telephone` value
 *   2. a number within 40 characters (same line) after "phone", "tel",
 *      "telephone" or "call"
 *   3. the first phone-shaped number anywhere in the text
 *
 * @param {string} text - Visible page text.
 * @param {object|Array} jsonLd - Parsed JSON-LD node(s).
 * @returns {string}
 */
function extractPhone(text, jsonLd) {
  const structured = phoneFromJsonLd(jsonLd);
  if (structured) return formatPhone(structured);

  if (!text) return '';

  // Keyword followed by whitespace/colons, then up to 40 chars of that line.
  const keywordRe = /(?:phone|tel(?:ephone)?|call)[\s:]*([^\n]{0,40})/gi;
  for (let hit = keywordRe.exec(text); hit !== null; hit = keywordRe.exec(text)) {
    const nearby = matchPhone(hit[1]);
    if (nearby) return formatPhone(nearby);
  }

  const anywhere = matchPhone(text);
  if (!anywhere) return '';
  return formatPhone(anywhere);
}
36
+
37
/**
 * Find the first phone-shaped number in a string.
 *
 * @param {string} s
 * @returns {string[]|null} [areaCode, exchange, lastFour], or null.
 */
function matchPhone(s) {
  if (!s) return null;
  // PHONE_RE is global; rewind it so earlier calls cannot skew this search.
  PHONE_RE.lastIndex = 0;
  const hit = PHONE_RE.exec(s);
  if (!hit) return null;
  return [hit[1], hit[2], hit[3]];
}
43
+
44
/**
 * Format phone parts as "(XXX) XXX-XXXX".
 *
 * @param {string[]|null} parts - [areaCode, exchange, lastFour].
 * @returns {string} The formatted number, or '' when `parts` is falsy.
 */
function formatPhone(parts) {
  if (!parts) {
    return '';
  }
  const area = parts[0];
  const exchange = parts[1];
  const line = parts[2];
  return `(${area}) ${exchange}-${line}`;
}
48
+
49
/**
 * Search JSON-LD for a phone number.
 *
 * @param {object|Array|null} jsonLd - One node or an array of nodes.
 * @returns {string[]|null} First truthy result from walkPhone, or null.
 */
function phoneFromJsonLd(jsonLd) {
  if (!jsonLd) return null;
  const nodes = Array.isArray(jsonLd) ? jsonLd : [jsonLd];
  for (let i = 0; i < nodes.length; i += 1) {
    const phone = walkPhone(nodes[i]);
    if (phone) {
      return phone;
    }
  }
  return null;
}
58
+
59
/**
 * Depth-first search of a JSON-LD node for a usable `telephone` value.
 *
 * Order at each object: its own `telephone`, then `@graph`, then every other
 * object-valued property. A branch that yields nothing never stops the
 * search: previously an `@graph` without a phone returned null immediately
 * and sibling properties were never inspected.
 *
 * @param {*} node - Object, array or primitive (primitives yield null).
 * @returns {string[]|null} [areaCode, exchange, lastFour], or null.
 */
function walkPhone(node) {
  if (!node || typeof node !== 'object') return null;

  if (Array.isArray(node)) {
    for (const item of node) {
      const hit = walkPhone(item);
      if (hit) return hit;
    }
    return null;
  }

  if (node.telephone) {
    const phone = matchPhone(String(node.telephone));
    if (phone) return phone;
  }

  // Prefer @graph, but fall through to the other properties when it is empty.
  if (node['@graph']) {
    const hit = walkPhone(node['@graph']);
    if (hit) return hit;
  }

  for (const key of Object.keys(node)) {
    if (key === '@graph') continue; // already searched above
    const value = node[key];
    if (value && typeof value === 'object') {
      const hit = walkPhone(value);
      if (hit) return hit;
    }
  }
  return null;
}
76
+
77
/**
 * Reduce a social URL to a canonical "<platform>:<id>" key so the same page
 * is recognized across path variants. Returns null for unparseable URLs,
 * unsupported hosts, and URLs with no identifying segment.
 *   facebook.com/pg/Foo/about/      → fb:foo
 *   facebook.com/Foo                → fb:foo
 *   facebook.com/profile.php?id=42  → fb:42
 *   instagram.com/Foo/?hl=en        → ig:foo
 *   x.com/Foo                       → tw:foo
 *   youtube.com/channel/UCabc       → yt:ucabc
 *
 * Hosts are matched as the exact domain or one of its subdomains, so
 * look-alikes that merely contain the text (wix.com, fedex.com for "x.com")
 * are not mistaken for a social platform.
 *
 * @param {string} url
 * @returns {string|null}
 */
function socialKey(url) {
  if (!url) return null;
  let u;
  try { u = new URL(url); } catch { return null; }

  const host = u.hostname.replace(/^www\.|^m\./, '').toLowerCase();
  const onDomain = (domain) => host === domain || host.endsWith(`.${domain}`);

  let platform = null;
  if (onDomain('facebook.com')) platform = 'fb';
  else if (onDomain('instagram.com')) platform = 'ig';
  else if (onDomain('twitter.com') || onDomain('x.com')) platform = 'tw';
  else if (onDomain('youtube.com')) platform = 'yt';
  else if (onDomain('tiktok.com')) platform = 'tt';
  else if (/(?:^|\.)pinterest\.[a-z.]+$/.test(host)) platform = 'pi'; // any pinterest TLD
  if (!platform) return null;

  const segs = u.pathname.split('/').filter(Boolean);
  if (segs.length === 0) return null;

  // Strip platform-specific prefix segments to get to the page identity.
  const prefixes = new Set(['pg', 'pages', 'profile.php', 'people']);
  let i = 0;
  while (i < segs.length && prefixes.has(segs[i].toLowerCase())) i++;
  let id = segs[i] || '';

  if (platform === 'fb' && segs[0].toLowerCase() === 'profile.php') {
    // Numeric profiles carry their identity in the query string, not the path.
    id = u.searchParams.get('id') || '';
  }
  if (platform === 'yt') {
    // YouTube paths look like /@handle, /c/x, /channel/UC..., /user/x
    if (segs[0] === 'channel' || segs[0] === 'c' || segs[0] === 'user') id = segs[1] || '';
    else if (segs[0].startsWith('@')) id = segs[0].slice(1);
  }
  if (platform === 'tt' && id.startsWith('@')) id = id.slice(1);

  if (!id) return null;
  return `${platform}:${id.toLowerCase()}`;
}
113
+
114
/**
 * Strip query string and fragment from a social URL, keeping the `id`
 * parameter of Facebook-style `profile.php` links (without it the URL no
 * longer identifies any page).
 *
 * @param {string} href
 * @returns {string}
 */
function canonicalSocialHref(href) {
  const bare = href.split(/[?#]/)[0];
  if (!/\/profile\.php$/i.test(bare)) return bare;
  const idMatch = /[?&]id=([^&#]+)/.exec(href);
  return idMatch ? `${bare}?id=${idMatch[1]}` : bare;
}

/**
 * Extract social media URLs from an anchor list: the first matching link per
 * platform wins, with query string and fragment removed. Share/intent/post
 * links are ignored, and links whose canonical socialKey equals one of the
 * mall's own socials are skipped.
 *
 * @param {Array<{href: string}>} links - Null/empty entries are skipped.
 * @param {object} [mallSocials] - Mall-wide social URLs to exclude (values).
 * @returns {{ facebook: string, instagram: string, twitter: string,
 *   youtube: string, tiktok: string, pinterest: string }} '' when not found.
 */
function extractSocials(links, mallSocials = {}) {
  const out = {
    facebook: '', instagram: '', twitter: '',
    youtube: '', tiktok: '', pinterest: '',
  };
  if (!links) return out;

  const patterns = {
    facebook: /^https?:\/\/(?:www\.|m\.)?facebook\.com\/(?!sharer|share|tr\b)/i,
    instagram: /^https?:\/\/(?:www\.)?instagram\.com\/(?!p\/|reel\/)/i,
    twitter: /^https?:\/\/(?:www\.)?(?:twitter|x)\.com\/(?!intent|share)/i,
    youtube: /^https?:\/\/(?:www\.)?youtube\.com\/(?:@|c\/|channel\/|user\/)/i,
    tiktok: /^https?:\/\/(?:www\.)?tiktok\.com\/@/i,
    pinterest: /^https?:\/\/(?:www\.)?pinterest\.[a-z.]+\//i,
  };

  const mallKeys = new Set();
  for (const v of Object.values(mallSocials || {})) {
    const k = socialKey(v);
    if (k) mallKeys.add(k);
  }

  for (const l of links) {
    if (!l || !l.href) continue;
    const linkKey = socialKey(l.href);
    if (linkKey && mallKeys.has(linkKey)) continue;
    for (const [k, re] of Object.entries(patterns)) {
      if (!out[k] && re.test(l.href)) out[k] = canonicalSocialHref(l.href);
    }
  }
  return out;
}
150
+
151
/**
 * Find the store's own external website. Requires a meaningful name-to-domain
 * affinity or an explicit "Visit website / Official site" anchor text —
 * otherwise returns ''. The result is reduced to the site root
 * ("https://host/").
 *
 * Links are ignored when they point at the mall origin, at a social /
 * Google / maps host, at a non-http(s) scheme, or at a mall ecosystem domain.
 *
 * @param {Array<{href, text}>} links - Null entries are skipped.
 * @param {string} mallOrigin
 * @param {string} storeName - used to score domain match
 * @param {Set<string>|string[]} [mallEcosystem] - sister/owner domains to exclude
 * @returns {string}
 */
function extractWebsite(links, mallOrigin, storeName, mallEcosystem = []) {
  if (!links) return '';
  // Each alternative carries its own dot/anchor. With a single shared
  // trailing `\.` the x.com, google. and maps. alternatives required two
  // consecutive dots and therefore could never match a real hostname.
  const socialHosts = /(?:facebook|instagram|twitter|youtube|tiktok|pinterest|linkedin|snapchat|google)\.|(?:^|\.)x\.com$|(?:^|\.)maps\./i;
  const ecoList = Array.isArray(mallEcosystem) ? mallEcosystem : Array.from(mallEcosystem || []);
  const ecoSet = new Set(ecoList.map(d => String(d).toLowerCase().replace(/^www\./, '')));

  const externals = [];
  for (const l of links) {
    if (!l || !l.href) continue;
    try {
      const u = new URL(l.href);
      if (u.origin === mallOrigin) continue;
      if (socialHosts.test(u.hostname)) continue;
      if (!/^https?:$/.test(u.protocol)) continue;
      const host = u.hostname.replace(/^www\./, '');
      if (ecoSet.has(host)) continue;
      externals.push({ href: l.href, host, text: l.text || '' });
    } catch (_) {
      // Unparseable href: not a candidate.
    }
  }
  if (externals.length === 0) return '';

  // Build two clean forms of the name — URLs sometimes spell out "&" as "and"
  // (bathandbodyworks.com) and sometimes drop it entirely (aw.ca).
  const rawLower = String(storeName || '').toLowerCase();
  const cleanNameAnd = rawLower.replace(/&/g, 'and').replace(/[^a-z0-9]/g, '');
  const cleanNameDrop = rawLower.replace(/[^a-z0-9]/g, '');
  const score = (e) => {
    let s = 0;
    const host = e.host.toLowerCase();
    // Trim so padded anchor text still satisfies the ^-anchored phrases.
    const t = String(e.text || '').trim().toLowerCase();
    const hostFlat = host.replace(/\./g, '');
    const matches = (n) => n && n.length >= 2 && hostFlat.includes(n);

    // Name affinity: full name, else its first 5 chars, else its first 3.
    if (matches(cleanNameAnd) || matches(cleanNameDrop)) s += 30;
    else if (cleanNameAnd.length >= 5 && hostFlat.includes(cleanNameAnd.slice(0, 5))) s += 25;
    else if (cleanNameAnd.length >= 3 && hostFlat.includes(cleanNameAnd.slice(0, 3))) s += 15;

    if (/^website|^visit|store website|official|view (the )?site/i.test(t)) s += 40;
    // CDN / asset / store-locator hosts are unlikely to be the store's site.
    if (/\.googleusercontent|\.cloudfront|cdn\.|locator/.test(host)) s -= 20;
    return s;
  };

  // Highest score wins; ties keep the earliest link.
  let best = externals[0];
  let bestScore = score(best);
  for (const e of externals.slice(1)) {
    const s = score(e);
    if (s > bestScore) { best = e; bestScore = s; }
  }

  // Strictness gate: only accept if score is meaningful.
  if (bestScore < 25) return '';
  try {
    const u = new URL(best.href);
    return `${u.protocol}//${u.host}/`;
  } catch { return best.href; }
}
217
+
218
/**
 * Detect store status flags from free text (case-insensitive, whole-phrase
 * matches).
 *
 * @param {string} text
 * @returns {{ is_coming_soon_store: boolean, is_new_store: boolean,
 *   is_relocated_store: boolean, is_temporarily_closed: boolean }}
 */
function detectStatusFlags(text) {
  const haystack = String(text || '').toLowerCase();
  const rules = [
    ['is_coming_soon_store', /\bcoming soon\b/],
    ['is_new_store', /\b(now open|new store|newly opened|just opened)\b/],
    ['is_relocated_store', /\b(relocated|new location|moved to)\b/],
    ['is_temporarily_closed', /\b(temporarily closed|temporarily shut|closed for renovation)\b/],
  ];
  const flags = {};
  for (const [flag, pattern] of rules) {
    flags[flag] = pattern.test(haystack);
  }
  return flags;
}
230
+
231
+ module.exports = {
232
+ extractPhone, formatPhone, matchPhone,
233
+ extractSocials, socialKey, extractWebsite,
234
+ detectStatusFlags,
235
+ };