yotpo-reviews-scraper 0.0.2 → 1.0.1

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/index.js +768 -124
  2. package/package.json +7 -3
package/index.js CHANGED
@@ -1,150 +1,794 @@
1
+ const fs = require('fs');
2
+ const path = require('path');
3
+
1
4
  const puppeteer = require('puppeteer');
2
5
 
3
6
  async function handleModalsDynamically(page) {
4
- // Remove existing modals based on common patterns
5
- await page.evaluate(() => {
6
- const modals = document.querySelectorAll('[role="dialog"], [aria-modal="true"], div[class*="modal"], div[class*="popup"], div[class*="overlay"]');
7
- modals.forEach(modal => modal.remove());
8
- });
9
-
10
- // Use MutationObserver to watch for new modals
11
- await page.evaluate(() => {
12
- const observer = new MutationObserver(mutations => {
13
- mutations.forEach(mutation => {
14
- if (mutation.addedNodes.length) {
15
- mutation.addedNodes.forEach(node => {
16
- if (node.matches && (node.matches('[role="dialog"], [aria-modal="true"], div[class*="modal"], div[class*="popup"], div[class*="overlay"]'))) {
17
- node.remove();
18
- }
19
- });
20
- }
21
- });
7
+ // Remove existing modals based on common patterns
8
+ await page.evaluate(() => {
9
+ const modals = document.querySelectorAll('[role="dialog"], [aria-modal="true"], div[class*="modal"], div[class*="popup"], div[class*="overlay"]');
10
+ modals.forEach((modal) => modal.remove());
11
+ });
12
+
13
+ // Use MutationObserver to watch for new modals
14
+ await page.evaluate(() => {
15
+ // eslint-disable-next-line no-undef
16
+ const observer = new MutationObserver((mutations) => {
17
+ mutations.forEach((mutation) => {
18
+ if (mutation.addedNodes.length) {
19
+ mutation.addedNodes.forEach((node) => {
20
+ if (node.matches && (node.matches('[role="dialog"], [aria-modal="true"], div[class*="modal"], div[class*="popup"], div[class*="overlay"]'))) {
21
+ node.remove();
22
+ }
23
+ });
24
+ }
25
+ });
26
+ });
27
+
28
+ observer.observe(document.body, {childList: true, subtree: true});
22
29
  });
23
-
24
- observer.observe(document.body, { childList: true, subtree: true });
25
- });
26
30
 
27
- // Fallback method: Pressing the Escape key
28
- await page.keyboard.press('Escape').catch(e => console.log('Error pressing Escape:', e.message));
31
+ // Fallback method: Pressing the Escape key
32
+ await page.keyboard.press('Escape').catch((e) => console.log('Error pressing Escape:', e.message));
29
33
 
30
- // Fallback method: Clicking outside the modal
31
- await page.mouse.click(0, 0).catch(e => console.log('Error clicking outside modal:', e.message));
34
+ // Fallback method: Clicking outside the modal
35
+ await page.mouse.click(0, 0).catch((e) => console.log('Error clicking outside modal:', e.message));
32
36
  }
33
37
 
34
38
  async function waitForContentChange(page, selector, previousContent) {
35
- await page.waitForFunction(
36
- (selector, previousContent) => {
37
- const currentContent = document.querySelector(selector).textContent.trim();
38
- return currentContent !== previousContent;
39
- },
40
- {},
41
- selector,
42
- previousContent
43
- );
39
+ await page.waitForFunction(
40
+ (sel, prevContent) => {
41
+ const currentContent = document.querySelector(sel).textContent.trim();
42
+ return currentContent !== prevContent;
43
+ },
44
+ {},
45
+ selector,
46
+ previousContent
47
+ );
44
48
  }
45
49
 
50
+ // Helper function for delays (since page.waitForTimeout is deprecated)
51
+ function delay(ms) {
52
+ return new Promise((resolve) => {
53
+ setTimeout(resolve, ms);
54
+ });
55
+ }
56
+
57
+ async function yotpoScraper(url, options = {}) {
58
+ let browser;
59
+ // Automatically enable dev mode if headless is false
60
+ // Default to headless: true (production mode) if not specified
61
+ const isHeadless = options.headless !== false;
62
+ // Dev mode is enabled if: explicitly set, OR headless is false, OR NODE_ENV is development
63
+ const devMode = options.dev !== undefined ? options.dev : (!isHeadless || process.env.NODE_ENV === 'development');
64
+ const devOutputDir = path.join(process.cwd(), 'dev-output');
46
65
 
47
- async function yotpoScraper(url) {
48
- let browser;
49
- try {
50
- const selectors = {
51
- yotpo: 'div.yotpo.yotpo-main-widget',
52
- reviews: 'div.yotpo-reviews',
53
- review: 'div.yotpo-review',
54
- name: 'span.yotpo-user-name',
55
- rating: 'div.yotpo-review-stars span.sr-only',
56
- title: 'div.yotpo-main div.content-title',
57
- desc: 'div.content-review',
58
- date: 'span.yotpo-review-date',
59
- pager: 'div.yotpo-pager[data-total]',
60
- next: 'div.yotpo-pager a[rel^=next]',
66
+ // Helper function for dev logging
67
+ const devLog = (...args) => {
68
+ if (devMode) {
69
+ console.log('[DEV]', ...args);
70
+ }
61
71
  };
62
72
 
63
- browser = await puppeteer.launch({
64
- // headless: 'new',
65
- headless: false,
66
- });
73
+ // Helper function to save HTML
74
+ const saveHTML = async (page, filename, description) => {
75
+ if (!devMode) return;
67
76
 
68
- const page = await browser.newPage();
77
+ try {
78
+ if (!fs.existsSync(devOutputDir)) {
79
+ fs.mkdirSync(devOutputDir, {recursive: true});
80
+ }
69
81
 
70
- await page.setViewport({
71
- width: 1280,
72
- height: 1024,
73
- });
82
+ const html = await page.content();
83
+ const filePath = path.join(devOutputDir, filename);
84
+ fs.writeFileSync(filePath, html, 'utf8');
85
+ devLog(`Saved HTML: ${filename} - ${description}`);
86
+ } catch (error) {
87
+ devLog(`Failed to save HTML ${filename}:`, error.message);
88
+ }
89
+ };
74
90
 
75
- // Block Yotpo analytics requests
76
- await page.setRequestInterception(true);
77
- page.on('request', request => {
78
- if (request.url().includes('https://p.yotpo.com/i?e=se&se_ca=reviews&se_ac=shown&se_psk')) {
79
- request.abort();
80
- } else {
81
- request.continue();
82
- }
83
- });
91
+ try {
92
+ // Base selectors - will be updated dynamically based on what's found
93
+ const selectors = {
94
+ yotpo: 'div.yotpo.yotpo-main-widget',
95
+ reviews: 'div.yotpo-reviews',
96
+ review: 'div.yotpo-review',
97
+ name: 'span.yotpo-reviewer-name, span.yotpo-user-name',
98
+ rating: 'div.yotpo-review-rating-title span.sr-only, div.yotpo-review-stars span.sr-only, [aria-label*="star"]',
99
+ title: 'p.yotpo-review-title, div.yotpo-review-rating-title',
100
+ desc: 'div.yotpo-review-content, div.content-review',
101
+ date: 'div.yotpo-date-format, span.yotpo-review-date, div.yotpo-review-date',
102
+ pager: 'div.yotpo-pager[data-total]',
103
+ next: 'nav.yotpo-reviews-pagination-container a[aria-label*="next" i], nav.yotpo-reviews-pagination-container a[aria-label="Navigate to next page"], div.yotpo-pager a[rel^=next]',
104
+ previous: 'nav.yotpo-reviews-pagination-container a[aria-label*="previous" i], nav.yotpo-reviews-pagination-container a[aria-label="Navigate to previous page"]',
105
+ expandButton: 'button.yotpo-sr-bottom-line-summary',
106
+ };
84
107
 
85
- // Log messages from the browser's console.
86
- // page.on('console', message => console.log(message.text()));
87
-
88
- await page.goto(url, { waitUntil: 'networkidle2' });
89
-
90
- await page.waitForSelector(selectors.reviews);
91
-
92
- await handleModalsDynamically(page);
93
-
94
- // const html = await page.evaluate(selector => document.querySelector(selector.reviews).innerHTML, selectors);
95
-
96
- const reviewsTotal = await page.evaluate(selector => document.querySelector(selector.pager).getAttribute('data-total'), selectors);
97
- const reviewsPerPage = await page.evaluate(selector => document.querySelector(selector.pager).getAttribute('data-per-page'), selectors);
98
- const reviewsPages = Math.ceil(reviewsTotal / reviewsPerPage);
99
- console.log('Total reviews:', reviewsTotal);
100
- console.log('Reviews per page:', reviewsPerPage);
101
- console.log('Pages:', reviewsPages);
102
-
103
- let reviewsArr = [];
104
- // while() { 'div.yotpo-pager a:not[.yotpo-disabled]' }
105
- for (let p = 1; p < reviewsPages + 1; p += 1) {
106
- console.log('Getting page:', p);
107
- const d = await page.evaluate((selector) => {
108
- const reviews = document.querySelectorAll(selector.review);
109
- const data = [];
110
- for (let r = 0; r < reviews.length; r += 1) {
111
- // const reviewNumber = data.length + 1;
112
- data.push({
113
- name: document.querySelectorAll(selector.name)[r].textContent.trim(),
114
- rating: document.querySelectorAll(selector.rating)[r].textContent.trim(),
115
- title: document.querySelectorAll(selector.title)[r].textContent.trim(),
116
- desc: document.querySelectorAll(selector.desc)[r].textContent.trim(),
117
- date: document.querySelectorAll(selector.date)[r].textContent.trim(),
118
- });
119
- }
120
- return data;
121
- }, selectors);
108
+ // Alternative selector patterns to try
109
+ const selectorVariants = {
110
+ yotpo: [
111
+ 'div.yotpo.yotpo-main-widget',
112
+ 'div#yotpo-reviews-main-widget',
113
+ 'div.yotpo-reviews-main-widget',
114
+ ],
115
+ reviews: [
116
+ 'div.yotpo-reviews',
117
+ 'div#yotpo-reviews-main-widget',
118
+ 'div.yotpo-reviews-main-widget',
119
+ 'div.yotpo-main-layout',
120
+ ],
121
+ review: [
122
+ 'div.yotpo-review',
123
+ '[class*="yotpo-review"]',
124
+ ],
125
+ };
126
+
127
+ devLog('=== Starting Yotpo Scraper ===');
128
+ devLog('URL:', url);
129
+ devLog('Dev mode:', devMode);
130
+ devLog('Selectors:', JSON.stringify(selectors, null, 2));
122
131
 
123
- console.log('Got', d.length, 'reviews from page', p);
132
+ browser = await puppeteer.launch({
133
+ headless: isHeadless,
134
+ });
135
+
136
+ devLog('Browser launched');
124
137
 
125
- // add reviews to array
126
- reviewsArr = [...reviewsArr, ...d];
138
+ const page = await browser.newPage();
127
139
 
128
- // if not last page in pagination, click to next page
129
- if (reviewsPages !== p) {
130
- const previousContent = await page.$eval(selectors.review, el => el.textContent.trim());
131
- await page.click(selectors.next);
132
- await page.waitForResponse(response => {
133
- return response.url().includes('https://staticw2.yotpo.com/batch/app_key') && response.status() === 200;
140
+ await page.setViewport({
141
+ width: 1280,
142
+ height: 1024,
134
143
  });
135
- await waitForContentChange(page, selectors.review, previousContent);
136
- }
137
- }
138
144
 
139
- return reviewsArr;
140
- } catch (error) {
141
- console.log(`Error: ${error}`);
142
- return [];
143
- } finally {
144
- if (browser) {
145
- await browser.close();
145
+ devLog('Viewport set to 1280x1024');
146
+
147
+ // Block Yotpo analytics requests
148
+ await page.setRequestInterception(true);
149
+ page.on('request', (request) => {
150
+ if (request.url().includes('https://p.yotpo.com/i?e=se&se_ca=reviews&se_ac=shown&se_psk')) {
151
+ request.abort();
152
+ devLog(`Blocked analytics request: ${request.url()}`);
153
+ } else {
154
+ request.continue();
155
+ }
156
+ });
157
+
158
+ // Log messages from the browser's console in dev mode
159
+ if (devMode) {
160
+ page.on('console', (message) => {
161
+ const text = message.text();
162
+ // Filter out common noise messages that don't affect functionality
163
+ const noisePatterns = [
164
+ /Content Security Policy/i,
165
+ /Refused to evaluate.*unsafe-eval/i,
166
+ /script-src 'none'/i,
167
+ /net::ERR_FAILED/i,
168
+ /CORS policy/i,
169
+ /Access to XMLHttpRequest/i,
170
+ ];
171
+
172
+ const isNoise = noisePatterns.some((pattern) => pattern.test(text));
173
+
174
+ if (!isNoise) {
175
+ devLog(`[Browser Console ${message.type()}]:`, text);
176
+ }
177
+ });
178
+
179
+ page.on('pageerror', (error) => {
180
+ devLog(`[Page Error]:`, error.message);
181
+ });
182
+ }
183
+
184
+ devLog('Navigating to:', url);
185
+ await page.goto(url, {waitUntil: 'networkidle2', timeout: 30000});
186
+ devLog('Page loaded');
187
+
188
+ const pageTitle = await page.title();
189
+ const pageURL = page.url();
190
+ devLog('Page title:', pageTitle);
191
+ devLog('Final URL:', pageURL);
192
+
193
+ await saveHTML(page, '01-initial-load.html', 'After initial page load');
194
+
195
+ // Check if selectors exist before waiting
196
+ devLog('Checking for selectors...');
197
+ const yotpoExists = await page.$(selectors.yotpo);
198
+ const reviewsExists = await page.$(selectors.reviews);
199
+
200
+ devLog(`Selector '${selectors.yotpo}' exists:`, !!yotpoExists);
201
+ devLog(`Selector '${selectors.reviews}' exists:`, !!reviewsExists);
202
+
203
+ if (!yotpoExists) {
204
+ devLog(`WARNING: Main Yotpo widget not found with selector: ${selectors.yotpo}`);
205
+ // Try to find any Yotpo-related elements
206
+ const yotpoElements = await page.evaluate(() => {
207
+ const elements = Array.from(document.querySelectorAll('[class*="yotpo"], [id*="yotpo"]'));
208
+ return elements.slice(0, 10).map((el) => ({
209
+ tag: el.tagName,
210
+ id: el.id,
211
+ classes: el.className,
212
+ text: el.textContent.substring(0, 100),
213
+ }));
214
+ });
215
+ devLog('Found Yotpo-related elements:', JSON.stringify(yotpoElements, null, 2));
216
+ }
217
+
218
+ // Try to find the reviews widget using multiple strategies
219
+ let reviewsWidget = null;
220
+ const reviewsVariants = selectorVariants.reviews;
221
+
222
+ for (const variant of reviewsVariants) {
223
+ const element = await page.$(variant);
224
+ if (element) {
225
+ devLog(`Found reviews widget with selector: ${variant}`);
226
+ reviewsWidget = variant;
227
+ selectors.reviews = variant;
228
+ if (variant.includes('yotpo-reviews-main-widget')) {
229
+ selectors.yotpo = variant;
230
+ }
231
+ break;
232
+ }
233
+ }
234
+
235
+ if (!reviewsWidget) {
236
+ devLog(`WARNING: Reviews container not found with any selector variant`);
237
+ devLog('Waiting 3 seconds for dynamic content to load...');
238
+ await delay(3000);
239
+
240
+ await saveHTML(page, '02-after-wait.html', 'After waiting 3 seconds for dynamic content');
241
+
242
+ // Check again after wait
243
+ for (const variant of reviewsVariants) {
244
+ const element = await page.$(variant);
245
+ if (element) {
246
+ devLog(`Found reviews widget after wait with selector: ${variant}`);
247
+ reviewsWidget = variant;
248
+ selectors.reviews = variant;
249
+ if (variant.includes('yotpo-reviews-main-widget')) {
250
+ selectors.yotpo = variant;
251
+ }
252
+ break;
253
+ }
254
+ }
255
+
256
+ if (!reviewsWidget) {
257
+ // Check if there's an expand button that needs to be clicked
258
+ const expandButton = await page.$(selectors.expandButton);
259
+ if (expandButton) {
260
+ devLog('Found expand button, clicking to load reviews...');
261
+ try {
262
+ await expandButton.click();
263
+ await delay(2000);
264
+ await saveHTML(page, '02b-after-expand-click.html', 'After clicking expand button');
265
+
266
+ // Check again after clicking
267
+ for (const variant of reviewsVariants) {
268
+ const element = await page.$(variant);
269
+ if (element) {
270
+ devLog(`Found reviews widget after expand with selector: ${variant}`);
271
+ reviewsWidget = variant;
272
+ selectors.reviews = variant;
273
+ if (variant.includes('yotpo-reviews-main-widget')) {
274
+ selectors.yotpo = variant;
275
+ }
276
+ break;
277
+ }
278
+ }
279
+ } catch (error) {
280
+ devLog('Error clicking expand button:', error.message);
281
+ }
282
+ }
283
+
284
+ if (!reviewsWidget) {
285
+ // Get all divs with classes containing 'review' or 'yotpo'
286
+ const reviewLikeElements = await page.evaluate(() => {
287
+ const elements = Array.from(document.querySelectorAll('div[class*="review"], div[class*="yotpo"]'));
288
+ return elements.slice(0, 20).map((el) => ({
289
+ tag: el.tagName,
290
+ id: el.id,
291
+ classes: el.className,
292
+ visible: el.offsetParent !== null,
293
+ }));
294
+ });
295
+ devLog('Found review-like elements:', JSON.stringify(reviewLikeElements, null, 2));
296
+ }
297
+ }
298
+ }
299
+
300
+ devLog(`Waiting for selector: ${selectors.reviews}`);
301
+ try {
302
+ await page.waitForSelector(selectors.reviews, {timeout: 15000});
303
+ devLog('Reviews selector found!');
304
+
305
+ // Wait a bit more for reviews to actually render inside the widget
306
+ devLog('Waiting for reviews to render...');
307
+ await delay(2000);
308
+
309
+ // Check if there are actual review elements
310
+ const reviewCount = await page.evaluate((selector) => {
311
+ const reviews = document.querySelectorAll(selector);
312
+ return reviews.length;
313
+ }, selectors.review);
314
+ devLog(`Found ${reviewCount} review elements in the widget`);
315
+ } catch (error) {
316
+ devLog('ERROR: Failed to find reviews selector:', error.message);
317
+ await saveHTML(page, '03-selector-not-found.html', 'Full page HTML when selector not found');
318
+
319
+ // Try to get page content analysis
320
+ const pageAnalysis = await page.evaluate(() => ({
321
+ bodyHTML: document.body.innerHTML.substring(0, 5000),
322
+ scripts: Array.from(document.querySelectorAll('script[src*="yotpo"]')).map((s) => s.src),
323
+ divsWithYotpo: Array.from(document.querySelectorAll('div[class*="yotpo"]')).map((d) => d.className).slice(0, 10),
324
+ }));
325
+ devLog('Page analysis:', JSON.stringify(pageAnalysis, null, 2));
326
+
327
+ throw error;
328
+ }
329
+
330
+ await saveHTML(page, '04-reviews-found.html', 'After reviews selector found');
331
+
332
+ devLog('Handling modals...');
333
+ await handleModalsDynamically(page);
334
+ devLog('Modals handled');
335
+
336
+ // Check for pager element
337
+ const pagerExists = await page.$(selectors.pager);
338
+ devLog(`Pager selector '${selectors.pager}' exists:`, !!pagerExists);
339
+
340
+ if (!pagerExists) {
341
+ devLog('WARNING: Pager element not found, trying to find alternative...');
342
+ const pagerAlternatives = await page.evaluate(() => {
343
+ const elements = Array.from(document.querySelectorAll('[class*="pager"], [class*="pagination"], [data-total]'));
344
+ return elements.slice(0, 5).map((el) => ({
345
+ tag: el.tagName,
346
+ classes: el.className,
347
+ dataAttributes: Array.from(el.attributes).filter((attr) => attr.name.startsWith('data-')).map((attr) => `${attr.name}="${attr.value}"`),
348
+ text: el.textContent.substring(0, 100),
349
+ }));
350
+ });
351
+ devLog('Found pager-like elements:', JSON.stringify(pagerAlternatives, null, 2));
352
+ }
353
+
354
+ devLog('Extracting pagination info...');
355
+ let reviewsTotal = await page.evaluate((selector) => {
356
+ const pager = document.querySelector(selector.pager);
357
+ return pager ? pager.getAttribute('data-total') : null;
358
+ }, selectors);
359
+
360
+ let reviewsPerPage = await page.evaluate((selector) => {
361
+ const pager = document.querySelector(selector.pager);
362
+ return pager ? pager.getAttribute('data-per-page') : null;
363
+ }, selectors);
364
+
365
+ // Try alternative methods to get pagination info
366
+ if (!reviewsTotal || !reviewsPerPage) {
367
+ devLog('Trying alternative methods to detect pagination...');
368
+ const paginationInfo = await page.evaluate((selector) => {
369
+ // Try to find pagination container
370
+ const paginationContainer = document.querySelector('nav.yotpo-reviews-pagination-container, nav[class*="pagination"]');
371
+ if (paginationContainer) {
372
+ // Try to extract page numbers from pagination
373
+ const pageLinks = Array.from(paginationContainer.querySelectorAll('a, button'));
374
+ const pageNumbers = pageLinks
375
+ .map((link) => {
376
+ const text = link.textContent.trim();
377
+ const num = parseInt(text, 10);
378
+ return Number.isNaN(num) ? null : num;
379
+ })
380
+ .filter((num) => num !== null);
381
+
382
+ // Try to find total from star ratings widget
383
+ const starWidget = document.querySelector('#yotpo-reviews-star-ratings-widget');
384
+ let totalFromWidget = null;
385
+ if (starWidget) {
386
+ const widgetText = starWidget.textContent || '';
387
+ const match = widgetText.match(/(\d+)\s*(?:reviews?|total)/i);
388
+ if (match) {
389
+ totalFromWidget = parseInt(match[1], 10);
390
+ }
391
+ }
392
+
393
+ // Count visible reviews
394
+ const visibleReviews = document.querySelectorAll(selector.review).length;
395
+
396
+ // Get max page number from pagination
397
+ const maxPage = pageNumbers.length > 0 ? Math.max(...pageNumbers) : null;
398
+
399
+ return {
400
+ pageNumbers,
401
+ maxPage,
402
+ totalFromWidget,
403
+ visibleReviews,
404
+ paginationExists: !!paginationContainer,
405
+ };
406
+ }
407
+ return null;
408
+ }, selectors);
409
+
410
+ devLog('Alternative pagination info:', JSON.stringify(paginationInfo, null, 2));
411
+
412
+ // Use alternative methods if available
413
+ if (paginationInfo) {
414
+ if (!reviewsTotal && paginationInfo.totalFromWidget) {
415
+ reviewsTotal = paginationInfo.totalFromWidget.toString();
416
+ devLog('Got total reviews from widget text:', reviewsTotal);
417
+ }
418
+ if (!reviewsPerPage && paginationInfo.visibleReviews) {
419
+ reviewsPerPage = paginationInfo.visibleReviews.toString();
420
+ devLog('Got reviews per page from visible count:', reviewsPerPage);
421
+ }
422
+ // If we have max page number, calculate from that
423
+ if (!reviewsTotal && paginationInfo.maxPage && paginationInfo.visibleReviews) {
424
+ reviewsTotal = (paginationInfo.maxPage * paginationInfo.visibleReviews).toString();
425
+ devLog('Estimated total reviews from max page:', reviewsTotal);
426
+ }
427
+ }
428
+
429
+ if (!reviewsTotal || !reviewsPerPage) {
430
+ devLog('ERROR: Could not get pagination info. Saving HTML for analysis...');
431
+ await saveHTML(page, '05-pagination-error.html', 'Pagination info missing');
432
+
433
+ // Try to count reviews directly
434
+ const directReviewCount = await page.evaluate(
435
+ (selector) => document.querySelectorAll(selector.review).length,
436
+ selectors
437
+ );
438
+ devLog('Direct review count from DOM:', directReviewCount);
439
+ }
440
+ }
441
+
442
+ // Calculate pages - if we don't have total, try to detect from pagination
443
+ let reviewsPages = 1;
444
+ if (reviewsTotal && reviewsPerPage) {
445
+ reviewsPages = Math.ceil(parseInt(reviewsTotal, 10) / parseInt(reviewsPerPage, 10));
446
+ } else {
447
+ // Try to detect number of pages from pagination elements
448
+ const pageCount = await page.evaluate(() => {
449
+ const paginationLinks = Array.from(
450
+ document.querySelectorAll('nav[class*="pagination"] a, nav[class*="pagination"] button')
451
+ );
452
+ const pageNumbers = paginationLinks
453
+ .map((link) => {
454
+ const text = link.textContent.trim();
455
+ const num = parseInt(text, 10);
456
+ return Number.isNaN(num) ? null : num;
457
+ })
458
+ .filter((num) => num !== null && num > 0);
459
+ return pageNumbers.length > 0 ? Math.max(...pageNumbers) : 1;
460
+ });
461
+ reviewsPages = pageCount || 1;
462
+ devLog('Detected number of pages from pagination:', reviewsPages);
463
+ }
464
+
465
+ devLog('Final pagination info:');
466
+ devLog(' - Total reviews:', reviewsTotal || 'unknown');
467
+ devLog(' - Reviews per page:', reviewsPerPage || 'unknown');
468
+ devLog(' - Total pages:', reviewsPages);
469
+
470
+ devLog('Final pagination info:');
471
+ devLog(' - Total reviews:', reviewsTotal || 'unknown');
472
+ devLog(' - Reviews per page:', reviewsPerPage || 'unknown');
473
+ devLog(' - Total pages:', reviewsPages);
474
+ console.log('Total reviews:', reviewsTotal || 'unknown');
475
+ console.log('Reviews per page:', reviewsPerPage || 'unknown');
476
+ console.log('Pages:', reviewsPages);
477
+
478
+ let reviewsArr = [];
479
+ devLog('Starting to scrape reviews...');
480
+
481
+ for (let p = 1; p < reviewsPages + 1; p += 1) {
482
+ devLog(`--- Processing page ${p} of ${reviewsPages} ---`);
483
+ console.log('Getting page:', p);
484
+
485
+ // Check how many reviews are visible before scraping
486
+ const visibleReviews = await page.evaluate(
487
+ (selector) => document.querySelectorAll(selector.review).length,
488
+ selectors
489
+ );
490
+ devLog(`Visible reviews on page ${p}:`, visibleReviews);
491
+
492
+ // In dev mode, inspect the structure of the first review
493
+ if (devMode && visibleReviews > 0) {
494
+ const firstReviewStructure = await page.evaluate((selector) => {
495
+ const firstReview = document.querySelector(selector.review);
496
+ if (!firstReview) return null;
497
+
498
+ return {
499
+ tag: firstReview.tagName,
500
+ id: firstReview.id,
501
+ classes: firstReview.className,
502
+ html: firstReview.innerHTML.substring(0, 1000),
503
+ allSelectors: {
504
+ name: Array.from(firstReview.querySelectorAll('[class*="user"], [class*="name"]')).map((el) => {
505
+ const className = String(el.className || '');
506
+ return {
507
+ selector: el.tagName + (className ? `.${className.split(' ').join('.')}` : ''),
508
+ text: el.textContent.trim().substring(0, 50),
509
+ };
510
+ }),
511
+ rating: Array.from(firstReview.querySelectorAll('[class*="star"], [class*="rating"]')).map((el) => {
512
+ const className = String(el.className || '');
513
+ return {
514
+ selector: el.tagName + (className ? `.${className.split(' ').join('.')}` : ''),
515
+ text: el.textContent.trim().substring(0, 50),
516
+ srOnly: Array.from(el.querySelectorAll('[class*="sr-only"], [aria-label]')).map((sr) => sr.textContent || sr.getAttribute('aria-label')),
517
+ };
518
+ }),
519
+ title: Array.from(firstReview.querySelectorAll('[class*="title"], [class*="head"]')).map((el) => {
520
+ const className = String(el.className || '');
521
+ return {
522
+ selector: el.tagName + (className ? `.${className.split(' ').join('.')}` : ''),
523
+ text: el.textContent.trim().substring(0, 50),
524
+ };
525
+ }),
526
+ desc: Array.from(firstReview.querySelectorAll('[class*="content"], [class*="review"], [class*="desc"]')).map((el) => {
527
+ const className = String(el.className || '');
528
+ return {
529
+ selector: el.tagName + (className ? `.${className.split(' ').join('.')}` : ''),
530
+ text: el.textContent.trim().substring(0, 100),
531
+ };
532
+ }),
533
+ date: Array.from(firstReview.querySelectorAll('[class*="date"], [class*="time"]')).map((el) => {
534
+ const className = String(el.className || '');
535
+ return {
536
+ selector: el.tagName + (className ? `.${className.split(' ').join('.')}` : ''),
537
+ text: el.textContent.trim().substring(0, 50),
538
+ };
539
+ }),
540
+ },
541
+ };
542
+ }, selectors);
543
+ devLog('First review structure:', JSON.stringify(firstReviewStructure, null, 2));
544
+ }
545
+
546
+ const d = await page.evaluate((selector) => {
547
+ const reviews = document.querySelectorAll(selector.review);
548
+ const data = [];
549
+ for (let r = 0; r < reviews.length; r += 1) {
550
+ try {
551
+ const reviewEl = reviews[r];
552
+
553
+ // Try to find elements within each review element
554
+ // Try multiple selector patterns for each field
555
+ const nameEl = reviewEl.querySelector('span.yotpo-reviewer-name')
556
+ || reviewEl.querySelector('span.yotpo-user-name')
557
+ || reviewEl.querySelector('[class*="reviewer-name"]')
558
+ || reviewEl.querySelector('[class*="user-name"]');
559
+
560
+ // For rating, check for sr-only text or aria-label
561
+ let ratingText = 'N/A';
562
+ const ratingContainer = reviewEl.querySelector('div.yotpo-review-rating-title');
563
+ if (ratingContainer) {
564
+ const srOnly = ratingContainer.querySelector('span.sr-only');
565
+ if (srOnly) {
566
+ ratingText = srOnly.textContent.trim();
567
+ }
568
+ }
569
+ if (ratingText === 'N/A') {
570
+ const ariaRating = reviewEl.querySelector('[aria-label*="star"], [aria-label*="rating"]');
571
+ if (ariaRating) {
572
+ ratingText = ariaRating.getAttribute('aria-label');
573
+ }
574
+ }
575
+
576
+ // Title - prefer p.yotpo-review-title over div.yotpo-review-rating-title
577
+ const titleEl = reviewEl.querySelector('p.yotpo-review-title')
578
+ || reviewEl.querySelector('div.yotpo-review-rating-title')
579
+ || reviewEl.querySelector('[class*="review-title"]');
580
+
581
+ const descEl = reviewEl.querySelector('div.yotpo-review-content')
582
+ || reviewEl.querySelector('div.content-review')
583
+ || reviewEl.querySelector('[class*="review-content"]');
584
+
585
+ // Date - try to get just the date part, not the label
586
+ const dateFormatEl = reviewEl.querySelector('div.yotpo-date-format');
587
+ let dateText = 'N/A';
588
+ if (dateFormatEl) {
589
+ dateText = dateFormatEl.textContent.trim();
590
+ } else {
591
+ const dateEl = reviewEl.querySelector('div.yotpo-review-date')
592
+ || reviewEl.querySelector('span.yotpo-review-date');
593
+ if (dateEl) {
594
+ // Try to extract just the date part (remove "Published date" prefix)
595
+ const fullText = dateEl.textContent.trim();
596
+ const dateMatch = fullText.match(/(\d{2}\/\d{2}\/\d{2})/);
597
+ dateText = dateMatch ? dateMatch[1] : fullText;
598
+ }
599
+ }
600
+
601
+ data.push({
602
+ name: nameEl ? nameEl.textContent.trim() : 'N/A',
603
+ rating: ratingText,
604
+ title: titleEl ? titleEl.textContent.trim() : 'N/A',
605
+ desc: descEl ? descEl.textContent.trim() : 'N/A',
606
+ date: dateText,
607
+ });
608
+ } catch (error) {
609
+ console.error(`Error extracting review ${r}:`, error);
610
+ }
611
+ }
612
+ return data;
613
+ }, selectors);
614
+
615
+ devLog(`Extracted ${d.length} reviews from page ${p}`);
616
+ if (devMode && d.length > 0) {
617
+ devLog('Sample review:', JSON.stringify(d[0], null, 2));
618
+ }
619
+ console.log('Got', d.length, 'reviews from page', p);
620
+
621
+ // add reviews to array
622
+ reviewsArr = [...reviewsArr, ...d];
623
+
624
+ // if not last page in pagination, navigate to next page
625
+ if (reviewsPages !== p) {
626
+ devLog(`Navigating to page ${p + 1}...`);
627
+ let navigated = false;
628
+
629
+ // Strategy 1: Try next button (check if it's enabled)
630
+ const nextButton = await page.$(selectors.next);
631
+ let isNextButtonEnabled = false;
632
+
633
+ if (nextButton) {
634
+ isNextButtonEnabled = await page.evaluate((sel) => {
635
+ const btn = document.querySelector(sel);
636
+ if (!btn) return false;
637
+ // Check multiple ways button could be disabled
638
+ const ariaDisabled = btn.getAttribute('aria-disabled');
639
+ const hasDisabledClass = btn.classList.contains('disabled');
640
+ const isDisabledAttr = btn.disabled;
641
+
642
+ return ariaDisabled !== 'true' && !hasDisabledClass && !isDisabledAttr;
643
+ }, selectors.next);
644
+ }
645
+
646
+ devLog(`Next button exists:`, !!nextButton);
647
+ devLog(`Next button enabled:`, isNextButtonEnabled);
648
+
649
+ if (nextButton && isNextButtonEnabled) {
650
+ try {
651
+ const previousContent = await page.$eval(selectors.review, (el) => el.textContent.trim()).catch(() => '');
652
+ devLog('Clicking next button...');
653
+ await page.click(selectors.next);
654
+
655
+ devLog('Waiting for response...');
656
+ try {
657
+ await page.waitForResponse((response) => {
658
+ try {
659
+ const u = new URL(response.url());
660
+ return (
661
+ u.protocol === 'https:'
662
+ && u.hostname === 'staticw2.yotpo.com'
663
+ && u.pathname.startsWith('/batch/app_key')
664
+ && response.status() === 200
665
+ );
666
+ } catch (e) {
667
+ return false;
668
+ }
669
+ }, {timeout: 10000});
670
+ devLog('Response received');
671
+ } catch (error) {
672
+ devLog('WARNING: Did not receive expected response:', error.message);
673
+ }
674
+
675
+ devLog('Waiting for content change...');
676
+ await waitForContentChange(page, selectors.review, previousContent);
677
+ devLog('Content changed, ready for next page');
678
+ navigated = true;
679
+ } catch (error) {
680
+ devLog('Error clicking next button:', error.message);
681
+ }
682
+ }
683
+
684
+ // Strategy 2: Try clicking specific page number
685
+ if (!navigated) {
686
+ devLog('Trying to click page number directly...');
687
+ try {
688
+ const pageClicked = await page.evaluate((targetPage) => {
689
+ const paginationLinks = Array.from(
690
+ document.querySelectorAll('nav[class*="pagination"] a, nav[class*="pagination"] button')
691
+ );
692
+ for (const link of paginationLinks) {
693
+ const text = link.textContent.trim();
694
+ const num = parseInt(text, 10);
695
+ if (num === targetPage && !link.disabled && !link.classList.contains('disabled')) {
696
+ link.click();
697
+ return true;
698
+ }
699
+ }
700
+ return false;
701
+ }, p + 1);
702
+
703
+ if (pageClicked) {
704
+ devLog(`Clicked page ${p + 1} number`);
705
+ await delay(2000);
706
+ const previousContent = await page.$eval(selectors.review, (el) => el.textContent.trim()).catch(() => '');
707
+ await waitForContentChange(page, selectors.review, previousContent);
708
+ navigated = true;
709
+ }
710
+ } catch (error) {
711
+ devLog('Error clicking page number:', error.message);
712
+ }
713
+ }
714
+
715
+ // Strategy 3: Try finding and clicking any non-disabled next/arrow button
716
+ if (!navigated) {
717
+ devLog('Trying to find alternative next/arrow button...');
718
+ try {
719
+ const alternativeNext = await page.evaluate(() => {
720
+ const buttons = Array.from(
721
+ document.querySelectorAll(
722
+ 'a[aria-label*="next" i], button[aria-label*="next" i], '
723
+ + 'a[class*="next"], button[class*="next"], '
724
+ + 'a[rel*="next"], [data-direction="next"]'
725
+ )
726
+ );
727
+ for (const btn of buttons) {
728
+ if (!btn.disabled && !btn.classList.contains('disabled')) {
729
+ btn.click();
730
+ return true;
731
+ }
732
+ }
733
+ return false;
734
+ });
735
+
736
+ if (alternativeNext) {
737
+ devLog('Clicked alternative next button');
738
+ await delay(2000);
739
+ navigated = true;
740
+ }
741
+ } catch (error) {
742
+ devLog('Error with alternative next button:', error.message);
743
+ }
744
+ }
745
+
746
+ if (!navigated) {
747
+ devLog('ERROR: Could not navigate to next page!');
748
+ await saveHTML(page, `06-page-${p}-navigation-failed.html`, `Page ${p} - Navigation failed`);
749
+ // Try to continue anyway - maybe we're already on the last page
750
+ const currentReviews = await page.evaluate(
751
+ (selector) => document.querySelectorAll(selector.review).length,
752
+ selectors
753
+ );
754
+ if (currentReviews === 0) {
755
+ devLog('No reviews found, stopping pagination');
756
+ break;
757
+ }
758
+ } else {
759
+ await saveHTML(page, `07-page-${p + 1}-loaded.html`, `Page ${p + 1} loaded`);
760
+ }
761
+ }
762
+ }
763
+
764
+ devLog(`=== Scraping complete: ${reviewsArr.length} total reviews ===`);
765
+
766
+ return reviewsArr;
767
+ } catch (error) {
768
+ console.log(`Error: ${error}`);
769
+ devLog('Full error details:', error);
770
+ devLog('Error stack:', error.stack);
771
+
772
+ // Save HTML on error if in dev mode
773
+ if (devMode && browser) {
774
+ try {
775
+ const pages = await browser.pages();
776
+ if (pages.length > 0) {
777
+ await saveHTML(pages[0], 'error-state.html', 'Page state when error occurred');
778
+ }
779
+ } catch (saveError) {
780
+ devLog('Could not save error HTML:', saveError.message);
781
+ }
782
+ }
783
+
784
+ return [];
785
+ } finally {
786
+ if (browser) {
787
+ devLog('Closing browser...');
788
+ await browser.close();
789
+ devLog('Browser closed');
790
+ }
146
791
  }
147
- }
148
792
  }
149
793
 
150
- exports.yotpoScraper = yotpoScraper;
794
+ exports.yotpoScraper = yotpoScraper;
package/package.json CHANGED
@@ -1,11 +1,13 @@
1
1
  {
2
2
  "name": "yotpo-reviews-scraper",
3
- "version": "0.0.2",
3
+ "version": "1.0.1",
4
4
  "description": "Scrape Yotpo reviews from websites using Puppeteer",
5
5
  "main": "index.js",
6
6
  "scripts": {
7
7
  "start": "node index.js",
8
- "example": "node example.js"
8
+ "example": "node example.js",
9
+ "lint": "eslint index.js example.js",
10
+ "lint:fix": "eslint index.js example.js --fix"
9
11
  },
10
12
  "keywords": [
11
13
  "yotpo",
@@ -38,6 +40,8 @@
38
40
  "puppeteer": "^24.25.0"
39
41
  },
40
42
  "devDependencies": {
41
- "eslint": "^8.22.0"
43
+ "eslint": "^8.22.0",
44
+ "eslint-config-airbnb-base": "^15.0.0",
45
+ "eslint-plugin-import": "^2.32.0"
42
46
  }
43
47
  }