mallmaverick-store-scraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,239 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Store-fields LLM extractor.
5
+ *
6
+ * Responsibilities:
7
+ * - Extract the LLM-only fields: description, categories, location_type,
8
+ * restaurant fields, amenities, parking_information, etc.
9
+ * - DOES NOT extract hours (hoursPipeline already did that).
10
+ * - DOES NOT extract phone/socials/website (deterministic.js did those).
11
+ * - Receives directoryLogoUrl and pageLogoCandidates and returns the picked logo.
12
+ *
13
+ * Returns a partial store object to merge into the final record + a per-field
14
+ * confidence summary.
15
+ */
16
+
17
/**
 * Per-model token prices used to estimate spend.
 *
 * Each entry maps a model name to `{ input, output }`. StoreExtractor's usage
 * tracking divides token counts by 1000 before multiplying by these values, so
 * they are prices per 1,000 prompt / completion tokens. Models that are not
 * listed here add nothing to the running cost estimate.
 */
const MODEL_PRICING = Object.fromEntries(
  [
    // [model, input price, output price]
    ['gpt-4o', 0.005, 0.015],
    ['gpt-4o-mini', 0.00015, 0.0006],
    ['gpt-4.1', 0.008, 0.024],
    ['gpt-4.1-mini', 0.0004, 0.0016],
    ['gpt-4.1-nano', 0.0001, 0.0004],
    ['gpt-4-turbo', 0.01, 0.03],
    ['gpt-3.5-turbo', 0.0005, 0.0015],
    ['gpt-5', 0.015, 0.060],
    ['gpt-5-mini', 0.003, 0.012],
    ['gpt-5.4-mini', 0.00075, 0.0045],
  ].map(([model, input, output]) => [model, { input, output }]),
);
29
+
30
// SYSTEM_PROMPT: text sent as the `system` message of every extraction request
// made by StoreExtractor.extract. It tells the model to answer with JSON only,
// lists the fields that are out of scope, and gives per-field rules.
// The literal is runtime data: changing any character changes what is sent to
// the model, so edit it deliberately.
// NOTE(review): StoreExtractor._sanitize also accepts `store_recommendations`,
// which this prompt never describes — confirm whether that is intentional.
+ const SYSTEM_PROMPT = `You are a data extractor for retail store directory pages.
31
+ Return ONLY valid JSON. Be precise. Never invent values not present in the input.
32
+ For missing strings use "" — for missing booleans use false.
33
+
34
+ You are NOT extracting: name, store_hours, phone, social media URLs, website,
35
+ logo_image_url, brand_image_url, store_front_image_url. Those are handled by
36
+ other systems. Focus on the remaining fields.
37
+
38
+ FIELD RULES:
39
+
40
+ description:
41
+ - The store's marketing/about description from the page. Multi-sentence prose.
42
+ - Preserve sentences as written but strip HTML tags.
43
+ - If only a short tagline is available, use it.
44
+
45
+ brief_description:
46
+ - 1-2 sentence summary (≤200 chars). Derive from description if needed.
47
+
48
+ plain_text_description:
49
+ - description with all HTML/markdown stripped, no formatting characters.
50
+
51
+ categories:
52
+ - Semicolon-separated retail classification labels.
53
+ - Use ONLY classification labels — "Fashion", "Food & Beverage", "Beauty & Personal Care",
54
+ "Services", "Home", "Accessories", "Electronics", "Health & Wellness", "Restaurant", etc.
55
+ - 1-4 categories typical. Don't put product names here.
56
+
57
+ location_type:
58
+ - e.g. "Restaurant", "Kiosk", "Cart", "In-line store", "Anchor", "Outparcel", "Service".
59
+
60
+ location_information:
61
+ - Free text about where the store is in the mall ("Upper level, near Centre Court", etc.)
62
+
63
+ unit:
64
+ - Unit number if present (e.g. "Unit 215").
65
+
66
+ parking_information:
67
+ - Parking notes specific to this store.
68
+
69
+ restaurant fields:
70
+ - is_restaurant: true if food/beverage establishment.
71
+ - restaurant_cuisines: cuisine type ("Mexican", "Sushi", "Coffee", etc.)
72
+ - menu_url: link to menu if visible.
73
+ - restaurant_info: extra restaurant details (takeout, delivery, alcohol, etc).
74
+
75
+ status flags (set ONLY if the page explicitly says so):
76
+ - is_new_store, is_coming_soon_store, is_relocated_store, is_temporarily_closed,
77
+ exterior_retailer.
78
+
79
+ tags:
80
+ - ONLY explicit keyword tags/hashtags. NOT categories, NOT product types.
81
+ - If unsure, leave "".
82
+
83
+ products: semicolon-separated product types the store sells.
84
+ service: services offered (e.g. "Alterations;Personal Shopping").
85
+ brand_description: paragraph describing the brand.
86
+ amenities: semicolon-separated (e.g. "Wi-Fi;Wheelchair accessible").
87
+ payment_methods: semicolon-separated (e.g. "Visa;Mastercard;Cash").
88
+ return_policy: return policy text.
89
+
90
+ confidence: float 0-1 reflecting fill rate of the requested fields.`;
91
+
92
/**
 * Assemble the plain-text user prompt for one store page.
 *
 * The prompt starts with five labelled header lines (URL, slug, H1, title and
 * the already-extracted hours), followed by optional sections and finally the
 * page text. Every section is truncated to a fixed character budget:
 * JSON-LD 3000, meta tags 1200, each intercepted payload 2000 (first three
 * payloads only), page text 10000.
 *
 * @param {object} page
 * @param {string} page.url - Page URL (interpolated as-is).
 * @param {string} [page.urlSlug]
 * @param {string} [page.h1]
 * @param {string} [page.title]
 * @param {string} [page.hoursCanonical] - Hours already produced upstream.
 * @param {Array} [page.jsonLd] - Included only when non-empty.
 * @param {object} [page.metaTags] - Included only when it has at least one key.
 * @param {Array} [page.interceptedJson] - Captured XHR payloads; strings are
 *   used verbatim, anything else is JSON-serialised.
 * @param {string} [page.textContent]
 * @returns {string} Newline-joined prompt text.
 */
function buildUserPrompt({
  url, urlSlug, h1, title, hoursCanonical,
  jsonLd, metaTags, interceptedJson, textContent,
}) {
  // String(obj) would yield "[object Object]", which tells the model nothing,
  // so non-string payloads are serialised. Falls back to String() for values
  // JSON cannot represent (undefined, cycles, BigInt).
  const stringifyPayload = (payload) => {
    if (typeof payload === 'string') return payload;
    try {
      const json = JSON.stringify(payload);
      return typeof json === 'string' ? json : String(payload);
    } catch (err) {
      return String(payload);
    }
  };

  const parts = [
    `URL: ${url}`,
    `URL_SLUG: ${urlSlug || '(none)'}`,
    `H1_TEXT: ${h1 || '(none)'}`,
    `PAGE_TITLE: ${title || '(none)'}`,
    `HOURS_ALREADY_EXTRACTED: ${hoursCanonical || '(none)'}`,
  ];

  if (jsonLd && jsonLd.length) {
    parts.push('\n--- JSON-LD ---');
    parts.push(JSON.stringify(jsonLd, null, 2).slice(0, 3000));
  }
  if (metaTags && Object.keys(metaTags).length) {
    parts.push('\n--- Meta Tags ---');
    parts.push(JSON.stringify(metaTags, null, 2).slice(0, 1200));
  }
  if (interceptedJson && interceptedJson.length) {
    parts.push('\n--- Intercepted XHR JSON (truncated) ---');
    for (const payload of interceptedJson.slice(0, 3)) {
      parts.push(stringifyPayload(payload).slice(0, 2000));
    }
  }
  parts.push('\n--- Page Text ---');
  parts.push((textContent || '').slice(0, 10000));

  return parts.join('\n');
}
120
+
121
/**
 * Extracts the LLM-only store fields from a scraped page through a chat
 * completions client, and keeps running token / cost totals.
 *
 * The `client` only needs to expose `chat.completions.create(params)` returning
 * a promise of `{ choices: [{ message: { content } }], usage? }`.
 */
class StoreExtractor {
  /**
   * @param {object} options
   * @param {object} options.client - Chat completions client (see class doc).
   * @param {string} [options.model='gpt-5.4-mini'] - Model name sent with each request.
   * @param {boolean} [options.useVision=false] - Attach the page screenshot when available.
   * @param {{warn: Function}} [options.logger] - Optional logger; only `warn` is used.
   */
  constructor({ client, model = 'gpt-5.4-mini', useVision = false, logger }) {
    this.client = client;
    this.model = model;
    this.useVision = useVision;
    this.logger = logger;
    this.totalTokensInput = 0;
    this.totalTokensOutput = 0;
    this.totalCost = 0;
    this.extractionCount = 0;
  }

  /**
   * Run one extraction.
   *
   * Never rejects because of a failed request or an unusable response: any
   * error from the client call, from JSON parsing, or from a response that is
   * not a JSON object is logged (when a logger is set) and reported as
   * `{ fields: {}, confidence: 0 }`.
   *
   * @param {object} pageData - Page data forwarded to buildUserPrompt; the
   *   optional `screenshotBase64` property is used for vision requests.
   * @param {string} hoursCanonical - Hours already extracted upstream.
   * @returns {Promise<{fields: object, confidence: number}>}
   */
  async extract(pageData, hoursCanonical) {
    const userText = buildUserPrompt({ ...pageData, hoursCanonical });

    const isGpt5Family = /^gpt-5(\.|-|$)/i.test(this.model);
    const params = {
      model: this.model,
      messages: [
        { role: 'system', content: SYSTEM_PROMPT },
        this._buildUserMessage(userText, pageData.screenshotBase64),
      ],
      response_format: { type: 'json_object' },
    };
    // gpt-5* requests use `max_completion_tokens` and send no temperature;
    // every other model gets `max_tokens` and temperature 0.
    // NOTE(review): presumably mirrors API parameter support — confirm against the API reference.
    if (isGpt5Family) {
      params.max_completion_tokens = 1800;
    } else {
      params.max_tokens = 1800;
      params.temperature = 0.0;
    }

    let raw;
    try {
      const resp = await this.client.chat.completions.create(params);
      this._trackUsage(resp);
      raw = JSON.parse(resp.choices[0].message.content);
      // JSON.parse(null) yields null and valid JSON may be a primitive or an
      // array; none of those can be read as a field map, so treat them as a
      // failed extraction instead of letting _sanitize throw.
      if (raw === null || typeof raw !== 'object' || Array.isArray(raw)) {
        throw new TypeError('response is not a JSON object');
      }
    } catch (err) {
      if (this.logger) this.logger.warn(` ⚠ Store LLM extract failed: ${err.message}`);
      return { fields: {}, confidence: 0 };
    }

    const fields = this._sanitize(raw, pageData);
    // Prefer the model's own confidence (clamped to [0, 1]); otherwise estimate it.
    const confidence = typeof raw.confidence === 'number'
      ? Math.min(1, Math.max(0, raw.confidence))
      : estimateConfidence(fields);

    this.extractionCount++;
    return { fields, confidence };
  }

  /**
   * Build the user chat message: plain text, or text plus a low-detail PNG
   * data-URL image when vision is enabled and a screenshot was supplied.
   *
   * @param {string} text - Prompt text.
   * @param {string} [screenshotBase64] - Base64 PNG data without the data-URL prefix.
   * @returns {object} Chat message with role `user`.
   */
  _buildUserMessage(text, screenshotBase64) {
    if (!this.useVision || !screenshotBase64) {
      return { role: 'user', content: `Extract the JSON now:\n\n${text}` };
    }
    return {
      role: 'user',
      content: [
        { type: 'text', text: `Extract the JSON now (screenshot for visual context):\n\n${text}` },
        { type: 'image_url', image_url: { url: `data:image/png;base64,${screenshotBase64}`, detail: 'low' } },
      ],
    };
  }

  /**
   * Interpret a model-supplied flag. Real booleans pass through; strings are
   * true only for "true" (any case, surrounding whitespace ignored) or "1", so
   * a quoted "false" is not mistaken for true; anything else uses truthiness.
   *
   * @param {*} value
   * @returns {boolean}
   */
  static _toBoolean(value) {
    if (typeof value === 'boolean') return value;
    if (typeof value === 'string') {
      const normalized = value.trim().toLowerCase();
      return normalized === 'true' || normalized === '1';
    }
    return Boolean(value);
  }

  /**
   * Copy only the known fields out of the parsed model response, normalising
   * strings with sanitizeString and flags with _toBoolean. Keys absent from
   * `raw` are left out of the result; unknown keys are dropped.
   *
   * @param {object} raw - Parsed JSON object from the model.
   * @param {object} [_pageData] - Unused.
   * @returns {object} Sanitised partial store record.
   */
  _sanitize(raw, _pageData) {
    const out = {};
    const stringFields = [
      'description', 'brief_description', 'plain_text_description',
      'categories', 'location_type', 'location_information', 'unit',
      'parking_information', 'restaurant_cuisines', 'menu_url', 'restaurant_info',
      'tags', 'products', 'service', 'brand_description', 'amenities',
      'payment_methods', 'return_policy', 'store_recommendations',
    ];
    const boolFields = [
      'is_restaurant', 'is_new_store', 'is_coming_soon_store',
      'is_relocated_store', 'is_temporarily_closed', 'exterior_retailer',
    ];

    for (const f of stringFields) {
      if (f in raw) out[f] = sanitizeString(raw[f]);
    }
    for (const f of boolFields) {
      if (f in raw) out[f] = StoreExtractor._toBoolean(raw[f]);
    }

    return out;
  }

  /**
   * Add a response's token usage to the running totals. Cost is only added
   * when the configured model has an entry in MODEL_PRICING.
   *
   * @param {object} response - Client response; ignored when it has no `usage`.
   */
  _trackUsage(response) {
    if (!response.usage) return;
    const inputTokens = response.usage.prompt_tokens || 0;
    const outputTokens = response.usage.completion_tokens || 0;
    this.totalTokensInput += inputTokens;
    this.totalTokensOutput += outputTokens;
    const p = MODEL_PRICING[this.model];
    if (p) {
      this.totalCost += (inputTokens / 1000) * p.input + (outputTokens / 1000) * p.output;
    }
  }

  /**
   * @returns {{model: string, extractions: number, totalInputTokens: number,
   *   totalOutputTokens: number, estimatedCost: string}} Totals so far;
   *   `estimatedCost` is formatted as `$` plus four decimals.
   */
  getUsageSummary() {
    return {
      model: this.model,
      extractions: this.extractionCount,
      totalInputTokens: this.totalTokensInput,
      totalOutputTokens: this.totalTokensOutput,
      estimatedCost: `$${this.totalCost.toFixed(4)}`,
    };
  }
}
227
+
228
/**
 * Flatten a value into a single-line, trimmed string.
 *
 * null/undefined become ''. Otherwise the value is stringified, every run of
 * CR/LF characters is replaced by one space, any run of two or more whitespace
 * characters is collapsed to one space, and the ends are trimmed. A lone
 * non-newline whitespace character (e.g. a single tab) is kept as-is.
 *
 * @param {*} v
 * @returns {string}
 */
function sanitizeString(v) {
  if (v === null || v === undefined) {
    return '';
  }
  const withoutLineBreaks = String(v).replace(/[\r\n]+/g, ' ');
  const collapsed = withoutLineBreaks.replace(/\s{2,}/g, ' ');
  return collapsed.trim();
}
232
+
233
/**
 * Fallback confidence: the fraction of the three key fields (description,
 * categories, location_type) that hold a truthy value with a positive
 * `length`.
 *
 * @param {object} fields - Sanitised field map.
 * @returns {number} 0, 1/3, 2/3 or 1.
 */
function estimateConfidence(fields) {
  const keyFields = ['description', 'categories', 'location_type'];
  let filled = 0;
  for (const name of keyFields) {
    const value = fields[name];
    if (value && value.length > 0) {
      filled += 1;
    }
  }
  return filled / keyFields.length;
}
238
+
239
+ module.exports = { StoreExtractor, MODEL_PRICING };
@@ -0,0 +1,147 @@
1
+ 'use strict';
2
+
3
/**
 * Every field of a store record, in output order. This order is the column
 * order used when stores are written as CSV.
 */
const STORE_FIELDS = [
  // identity and core listing
  'mm_id', 'name', 'website', 'description', 'categories', 'phone',
  'location_type', 'store_recommendations',
  // images
  'logo_image_url', 'brand_image_url', 'store_front_image_url',
  // status flags
  'is_new_store', 'is_coming_soon_store', 'is_relocated_store',
  'exterior_retailer', 'is_temporarily_closed',
  // restaurant
  'is_restaurant', 'restaurant_cuisines', 'menu_url', 'restaurant_info',
  'tags',
  // hours
  'sync_with_centre_hours', 'is_free_form_hours', 'free_form_hours',
  'store_hours', 'hours_source', 'hours_confidence',
  // location and offering details
  'location_information', 'parking_information', 'products', 'service',
  'brand_description', 'amenities', 'payment_methods', 'return_policy', 'unit',
  // social links
  'facebook', 'twitter', 'instagram', 'youtube', 'tiktok', 'pinterest',
];

/** Fields that hold booleans (blank value: false). */
const BOOLEAN_FIELDS = new Set([
  'is_new_store', 'is_coming_soon_store', 'is_relocated_store',
  'exterior_retailer', 'is_temporarily_closed', 'is_restaurant',
  'sync_with_centre_hours', 'is_free_form_hours',
]);

/** Fields that hold numbers (blank value: 0). */
const NUMERIC_FIELDS = new Set(['hours_confidence']);

// Diagnostic fields: kept in the JSON output but left out of the CSV because
// the CMS does not need them.
const CSV_EXCLUDE_FIELDS = new Set(['hours_source', 'hours_confidence']);
63
+
64
/**
 * Create a blank store record.
 *
 * `mm_id` is set from the argument; every other entry of STORE_FIELDS gets its
 * blank value: false for boolean fields, 0 for numeric fields, '' otherwise.
 *
 * @param {*} mmId - Identifier stored under `mm_id`.
 * @returns {object} New record with `mm_id` first, then the remaining fields
 *   in STORE_FIELDS order.
 */
function createStore(mmId) {
  const blankValueFor = (name) => {
    if (BOOLEAN_FIELDS.has(name)) return false;
    if (NUMERIC_FIELDS.has(name)) return 0;
    return '';
  };

  return STORE_FIELDS
    .filter((name) => name !== 'mm_id')
    .reduce((record, name) => {
      record[name] = blankValueFor(name);
      return record;
    }, { mm_id: mmId });
}
74
+
75
/**
 * Build a store record from extracted data.
 *
 * Starts from a blank record for `mmId` and overlays every STORE_FIELDS key
 * present in `aiData` (except `mm_id`, which always comes from the argument),
 * coercing by field kind:
 *  - boolean fields: booleans pass through; strings are true only for
 *    "true" (any case) or "1"; anything else uses truthiness.
 *  - numeric fields: numbers pass through, other values go through
 *    parseFloat; non-finite results become 0.
 *  - all other fields: null/undefined become ''; otherwise the value is
 *    stringified, line breaks become spaces, whitespace runs are collapsed
 *    and the ends trimmed.
 * Keys not listed in STORE_FIELDS are ignored.
 *
 * @param {*} mmId - Identifier for the record.
 * @param {object} [aiData] - Extracted values; non-objects yield a blank record.
 * @returns {object} The merged store record.
 */
function mergeExtracted(mmId, aiData) {
  const store = createStore(mmId);
  if (!aiData || typeof aiData !== 'object') {
    return store;
  }

  const toBoolean = (v) => {
    switch (typeof v) {
      case 'boolean':
        return v;
      case 'string':
        return v.toLowerCase() === 'true' || v === '1';
      default:
        return Boolean(v);
    }
  };
  const toNumber = (v) => {
    const parsed = typeof v === 'number' ? v : parseFloat(v);
    return Number.isFinite(parsed) ? parsed : 0;
  };
  const toText = (v) => {
    if (v == null) return '';
    return String(v).replace(/[\r\n]+/g, ' ').replace(/\s{2,}/g, ' ').trim();
  };

  STORE_FIELDS.forEach((field) => {
    if (field === 'mm_id' || !(field in aiData)) return;
    const incoming = aiData[field];
    if (BOOLEAN_FIELDS.has(field)) {
      store[field] = toBoolean(incoming);
    } else if (NUMERIC_FIELDS.has(field)) {
      store[field] = toNumber(incoming);
    } else {
      store[field] = toText(incoming);
    }
  });

  return store;
}
97
+
98
/**
 * Render one value as a CSV cell.
 *
 * null/undefined become an empty string; everything else is stringified.
 * Embedded double quotes are doubled whenever the cell is quoted.
 *
 * Quoting modes:
 *  - alwaysQuote=true: always wrap the cell in double quotes, so parsers that
 *    cope poorly with bare URLs or mixed content still see clear cell
 *    boundaries.
 *  - alwaysQuote=false: RFC-4180 minimal quoting — quote only when the cell
 *    contains a comma, a double quote, CR or LF.
 *
 * @param {*} val - Cell value.
 * @param {object} [options]
 * @param {boolean} [options.alwaysQuote=false]
 * @returns {string} The encoded cell.
 */
function csvCell(val, { alwaysQuote = false } = {}) {
  const text = val === null || val === undefined ? '' : String(val);
  const needsQuoting = alwaysQuote || /[",\r\n]/.test(text);
  if (!needsQuoting) {
    return text;
  }
  const escaped = text.split('"').join('""');
  return `"${escaped}"`;
}
118
+
119
/**
 * Serialise store records as CSV.
 *
 * Columns are STORE_FIELDS minus CSV_EXCLUDE_FIELDS, in STORE_FIELDS order.
 * The first line is the header; every line, including the last, ends with
 * `lineEnding`. With the defaults the output starts with a UTF-8 byte order
 * mark, uses CRLF line endings and quotes every string cell, which is the
 * most forgiving combination for Excel, Google Sheets and simple in-house
 * parsers. Boolean and numeric fields are only quoted when RFC-4180 requires
 * it, so destination systems can type-detect them.
 *
 * @param {Array<object>} stores - Records to write; null/undefined/empty
 *   produces a header-only document. Missing or null fields become empty cells.
 * @param {object} [options]
 * @param {string} [options.lineEnding='\r\n'] - Line terminator.
 * @param {boolean} [options.bom=true] - Prefix the output with U+FEFF.
 * @param {boolean} [options.alwaysQuoteStrings=true] - Quote header and string
 *   cells unconditionally.
 * @returns {string} The CSV document.
 */
function storesToCSV(stores, { lineEnding = '\r\n', bom = true, alwaysQuoteStrings = true } = {}) {
  const csvFields = STORE_FIELDS.filter((f) => !CSV_EXCLUDE_FIELDS.has(f));

  const formatCell = (field, val) => {
    const isTyped = BOOLEAN_FIELDS.has(field) || NUMERIC_FIELDS.has(field);
    return csvCell(val, { alwaysQuote: isTyped ? false : alwaysQuoteStrings });
  };

  const headerLine = csvFields
    .map((f) => csvCell(f, { alwaysQuote: alwaysQuoteStrings }))
    .join(',');

  const rows = (stores || []).map((store) =>
    csvFields.map((f) => formatCell(f, store[f] == null ? '' : store[f])).join(','),
  );

  // Written as an escape so the byte order mark is visible in source and
  // cannot be lost by editors or tooling that strip invisible characters.
  const prefix = bom ? '\uFEFF' : '';
  return prefix + [headerLine, ...rows].join(lineEnding) + lineEnding;
}
146
+
147
+ module.exports = { STORE_FIELDS, BOOLEAN_FIELDS, NUMERIC_FIELDS, CSV_EXCLUDE_FIELDS, createStore, mergeExtracted, storesToCSV };