amzpy 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
amzpy/__init__.py CHANGED
@@ -3,6 +3,7 @@ AmzPy - Amazon Product Scraper
3
3
  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
4
4
 
5
5
  A lightweight Python library for scraping product information from Amazon.
6
+ Now using curl_cffi for better anti-bot protection.
6
7
 
7
8
  Basic usage:
8
9
  >>> from amzpy import AmazonScraper
@@ -16,6 +17,6 @@ Basic usage:
16
17
 
17
18
  from .scraper import AmazonScraper
18
19
 
19
- __version__ = "0.1.0"
20
+ __version__ = "0.2.0"
20
21
  __author__ = "Anil Sardiwal"
21
22
  __license__ = "MIT"
amzpy/parser.py CHANGED
@@ -1,59 +1,580 @@
1
+ """
2
+ Amazon HTML Parsing Module
3
+ ~~~~~~~~~~~~~~~~~~~~~~~~~
4
+
5
+ This module contains parsing functions for Amazon pages:
6
+ - Product detail pages (individual products)
7
+ - Search results pages (listings of products)
8
+
9
+ It uses BeautifulSoup to extract structured data from Amazon's HTML.
10
+ """
11
+
12
+
13
+ import re
14
+ import json
15
+ from urllib.parse import urljoin, urlparse
1
16
  from bs4 import BeautifulSoup
2
- from typing import Dict, Optional
3
- from amzpy.engine import RequestEngine
17
+ from typing import Dict, Optional, TYPE_CHECKING, Any, List, Tuple
18
+
19
+ # Using string annotation to avoid circular imports
20
+ if TYPE_CHECKING:
21
+ from amzpy.session import AmzSession
22
+
23
+ from amzpy.utils import extract_brand_name, format_canonical_url
24
+
4
25
 
5
- def parse_product_page(html_content: str, url: str = None, engine: RequestEngine = None, max_retries: int = 0) -> Optional[Dict]:
26
def parse_product_page(html_content: str, url: str = None, country_code: str = None) -> Optional[Dict]:
    """
    Parse Amazon product page HTML and extract structured product data.

    The returned dictionary always has the keys ``title``, ``price``,
    ``img_url``, ``currency``, ``brand``, ``url``, ``asin`` and ``rating``.
    A field that cannot be located in the page is set to None.

    Args:
        html_content (str): Raw HTML content of the product page
        url (str, optional): Product URL; the ASIN is read from its
            ``/dp/<ASIN>`` segment and it is the basis of the canonical URL
        country_code (str, optional): Country code for URL formatting

    Returns:
        Dict: Extracted product information
        None: If the HTML is empty, looks like a CAPTCHA/block page, or an
            unexpected error occurs while parsing
    """
    if not html_content:
        print("Error: Received empty HTML content")
        return None

    # Check for CAPTCHA / block page on the raw text first, so a blocked
    # response is rejected without paying for a full HTML parse.
    if "captcha" in html_content.lower() or "api-services-support@amazon.com" in html_content:
        print("Possible CAPTCHA or block page detected in HTML content")
        return None

    # Use lxml parser for better performance
    soup = BeautifulSoup(html_content, 'lxml')

    try:
        # Extract product title
        title_element = soup.select_one('#productTitle')
        title = title_element.text.strip() if title_element else None

        # Extract price information
        # We check multiple price selectors since Amazon's HTML structure varies
        price = None
        price_whole = soup.select_one('.a-price-whole')
        price_fraction = soup.select_one('.a-price-fraction')
        if price_whole:
            price = _combine_product_price_parts(
                price_whole.text,
                price_fraction.text if price_fraction else None,
            )

        # Alternative price selector for different Amazon layouts
        if price is None:
            price_element = soup.select_one('span.a-offscreen')
            if price_element:
                # Extract numeric value from price text (e.g., "$29.99" -> 29.99)
                price_match = re.search(r'[\d,]+\.?\d*', price_element.text.strip())
                if price_match:
                    price = float(price_match.group().replace(',', ''))

        # Extract currency symbol
        currency_element = soup.select_one('.a-price-symbol')
        currency = currency_element.text.strip() if currency_element else None

        # Extract currency from the leading non-digit part of the offscreen
        # price text if the dedicated symbol element is missing
        if not currency and price is not None:
            price_element = soup.select_one('span.a-offscreen')
            if price_element:
                currency_match = re.search(r'^[^\d]+', price_element.text.strip())
                if currency_match:
                    currency = currency_match.group().strip()

        # Extract brand name
        brand_name = None
        brand_element = soup.select_one('#bylineInfo')
        if brand_element:
            brand_name = extract_brand_name(brand_element.text.strip())

        # Try the product details bullets if the byline gave nothing
        if not brand_name:
            for bullet in soup.select('#detailBullets_feature_div li'):
                if 'brand' in bullet.text.lower():
                    brand_value = bullet.select_one('.a-text-bold + span')
                    if brand_value:
                        brand_name = brand_value.text.strip()
                        break

        # Extract main product image
        img_element = soup.select_one('#landingImage') or soup.select_one('#imgBlkFront')
        img_url = img_element.get('src') if img_element else None

        # Fall back to the high-resolution attributes when src is missing
        if img_element and not img_url:
            data_old_hires = img_element.get('data-old-hires')
            data_a_dynamic_image = img_element.get('data-a-dynamic-image')

            if data_old_hires:
                img_url = data_old_hires
            elif data_a_dynamic_image:
                img_url = _largest_dynamic_image(data_a_dynamic_image)

        # Extract ASIN (Amazon Standard Identification Number)
        asin = None
        if url:
            asin_match = re.search(r'/dp/([A-Z0-9]{10})', url)
            if asin_match:
                asin = asin_match.group(1)

        # Extract ratings if available
        rating = None
        rating_element = soup.select_one('#acrPopover') or soup.select_one('span.a-icon-alt')
        if rating_element:
            rating_text = rating_element.get('title', '') or rating_element.text
            rating_match = re.search(r'([\d\.]+)\s+out\s+of\s+5', rating_text)
            if rating_match:
                rating = float(rating_match.group(1))

        # Format canonical URL if ASIN is available
        canonical_url = format_canonical_url(url, asin, country_code) if asin else url

        # Build the final product data dictionary
        return {
            "title": title,
            "price": price,
            "img_url": img_url,
            "currency": currency,
            "brand": brand_name,
            "url": canonical_url,
            "asin": asin,
            "rating": rating,
        }

    except Exception as e:
        print(f"Error parsing product page: {e}")
        return None


def _combine_product_price_parts(whole_text: str, fraction_text: Optional[str]) -> Optional[float]:
    """Join the whole and fractional price texts into a float, or None if unparseable."""
    # The whole-part text may end with the decimal separator itself
    # (e.g. "1,299."); strip it so joining does not produce "1299..00".
    whole = whole_text.strip().replace(',', '').rstrip('.')
    fraction = fraction_text.strip().strip('.') if fraction_text else ''
    candidate = f"{whole}.{fraction}" if fraction else whole
    try:
        return float(candidate)
    except ValueError:
        # Let the caller try its alternative price selector instead of
        # aborting the whole product parse.
        return None


def _largest_dynamic_image(raw_json: str) -> Optional[str]:
    """Return the URL with the largest pixel area from a data-a-dynamic-image JSON string."""
    try:
        image_dict = json.loads(raw_json)
    except (TypeError, ValueError):
        return None
    if not isinstance(image_dict, dict) or not image_dict:
        return None

    def _area(image_url: str) -> float:
        # Values are presumably [width, height] pairs - TODO confirm.
        # Malformed entries score 0, so the first key wins as a fallback.
        try:
            width, height = image_dict[image_url]
            return float(width) * float(height)
        except (TypeError, ValueError):
            return 0.0

    return max(image_dict, key=_area)
175
+
176
+
177
def parse_search_page(html_content: str, base_url: str = None, country_code: str = None) -> List[Dict]:
    """
    Parse Amazon search results page HTML and extract product listings.

    Each returned dictionary always contains ``asin``, ``title`` and ``prime``.
    The following keys are added only when the data is found in the listing:
    ``url``, ``brand``, ``price``, ``currency``, ``original_price``,
    ``discount_percent``, ``img_url``, ``rating``, ``reviews_count``,
    ``color_variants``, ``badge``, ``delivery_info`` and ``deal``.

    Listings flagged with the ``AdHolder`` class, listings without an ASIN
    and listings without a title are skipped.

    Args:
        html_content (str): Raw HTML content of the search results page
        base_url (str, optional): Base URL for resolving relative URLs
        country_code (str, optional): Country code for URL formatting

    Returns:
        List[Dict]: List of extracted product data dictionaries
        Empty list: If parsing fails or HTML indicates a CAPTCHA/block
    """
    if not html_content:
        print("Error: Received empty HTML content for search page")
        return []

    # Check for CAPTCHA / block page on the raw text first, so a blocked
    # response is rejected without paying for a full HTML parse.
    if "captcha" in html_content.lower() or "api-services-support@amazon.com" in html_content:
        print("CAPTCHA or block page detected in search results")
        return []

    # Use lxml parser for better performance on large search pages
    soup = BeautifulSoup(html_content, 'lxml')
    results = []

    try:
        # Amazon has multiple result layouts; try the most specific
        # container selector first and fall back to more generic ones.
        product_containers = (
            soup.select('div[data-component-type="s-search-result"]')
            or soup.select('.s-result-item[data-asin]')
            or soup.select('.s-result-item')
        )
        print(f"Found {len(product_containers)} potential product containers")

        for container in product_containers:
            try:
                product_data = _parse_search_result(container, base_url, country_code)
            except Exception as e:
                # One malformed listing must not discard the rest of the page
                print(f"Error parsing individual search result: {e}")
                continue
            if product_data is not None:
                results.append(product_data)

        return results

    except Exception as e:
        print(f"Error parsing search page: {e}")
        return []


# Selector lists are ordered from most to least common layout.
_SEARCH_TITLE_SELECTORS = (
    'h2 a.a-link-normal',
    '.a-text-normal[href]',
    'h2.a-size-base-plus a',
    'a.s-line-clamp-2',
    '.a-text-normal[data-hover]',
    '.a-size-base-plus[aria-label]',
)
_SEARCH_BRAND_SELECTORS = (
    '.a-row .a-size-base-plus.a-color-base',
    '.a-size-base-plus:not([aria-label])',
    'h2 .a-size-base-plus',
    '.s-line-clamp-1 span',
)
_SEARCH_IMAGE_SELECTORS = (
    'img.s-image',
    '.s-image img',
    '.a-section img[srcset]',
    '.s-product-image-container img',
)
_SEARCH_RATING_SELECTORS = (
    'i.a-icon-star-small',
    '.a-icon-star',
    'span.a-icon-alt',
    'i.a-star-mini-4',
    '[aria-label*="out of 5 stars"]',
)
_SEARCH_REVIEWS_SELECTORS = (
    'span[aria-label*="reviews"]',
    '.a-size-base.s-underline-text',
    'a:-soup-contains("ratings")',
    'a:-soup-contains("reviews")',
    '.a-link-normal .a-size-base',
)
_SEARCH_PRIME_SELECTORS = (
    'i.a-icon-prime',
    '.a-icon-prime',
    'span:-soup-contains("Prime")',
    '.aok-relative.s-icon-text-medium',
    '[aria-label="Prime"]',
)


def _parse_search_result(container: Any, base_url: Optional[str], country_code: Optional[str]) -> Optional[Dict]:
    """Build the product dictionary for one result container, or None if it should be skipped."""
    # Skip sponsored listings
    if 'AdHolder' in container.get('class', []):
        return None

    # Containers without an ASIN are dividers, banners, etc.
    asin = container.get('data-asin') or container.get('asin')
    if not asin:
        return None

    product_data = {"asin": asin}

    _fill_search_title_and_url(container, product_data, base_url, country_code)

    # Brand: first selector that yields non-empty text wins
    for selector in _SEARCH_BRAND_SELECTORS:
        brand_elem = container.select_one(selector)
        if brand_elem and brand_elem.text.strip():
            product_data['brand'] = brand_elem.text.strip()
            break

    _fill_search_pricing(container, product_data)

    img_url = _search_image_url(container)
    if img_url:
        product_data['img_url'] = img_url

    rating = _search_rating(container)
    if rating is not None:
        product_data['rating'] = rating

    reviews_count = _search_reviews_count(container)
    if reviews_count is not None:
        product_data['reviews_count'] = reviews_count

    # Check for Prime eligibility
    product_data['prime'] = any(
        container.select_one(selector) is not None for selector in _SEARCH_PRIME_SELECTORS
    )

    color_variants = _search_color_variants(container, base_url, country_code)
    if color_variants:
        product_data['color_variants'] = color_variants

    # Extract "Amazon's Choice" or "Best Seller" badges
    badge_element = container.select_one('.a-badge-text') or container.select_one('[aria-label*="Choice"]')
    if badge_element:
        badge_text = badge_element.text.strip() or badge_element.get('aria-label')
        if badge_text:
            product_data['badge'] = badge_text

    # Extract delivery information
    delivery_element = (
        container.select_one('.a-row:-soup-contains("delivery")')
        or container.select_one('[aria-label*="delivery"]')
    )
    if delivery_element:
        product_data['delivery_info'] = delivery_element.text.strip()

    # Extract "Deal" information
    deal_element = (
        container.select_one('span:-soup-contains("Deal")')
        or container.select_one('.a-badge:-soup-contains("Deal")')
    )
    if deal_element:
        product_data['deal'] = True

    # Only keep listings for which the key information was found
    if not product_data.get('title'):
        return None
    return product_data


def _first_search_match(container: Any, selectors: Any) -> Any:
    """Return the first element matched by any selector, tried in order, or None."""
    for selector in selectors:
        element = container.select_one(selector)
        if element is not None:
            return element
    return None


def _search_number(text: str, label: str) -> Optional[float]:
    """Pull the first price-like number out of text; warn using label if it cannot be converted."""
    match = re.search(r'[\d,]+\.?\d*', text)
    if not match:
        return None
    number_str = match.group().replace(',', '')
    # The pattern can match a lone separator; treat that as "no number"
    if not number_str or number_str == ".":
        return None
    try:
        return float(number_str)
    except ValueError:
        print(f"Warning: Could not convert {label}: '{number_str}'")
        return None


def _fill_search_title_and_url(container: Any, product_data: Dict, base_url: Optional[str],
                               country_code: Optional[str]) -> None:
    """Set 'title' and 'url' on product_data from the listing's title link, if present."""
    title_link = _first_search_match(container, _SEARCH_TITLE_SELECTORS)
    if title_link is None:
        return

    # Title preference: aria-label, then the inner span, then the link text
    aria_label = title_link.get('aria-label')
    title_span = title_link.select_one('span')
    if aria_label:
        product_data['title'] = aria_label
    elif title_span:
        product_data['title'] = title_span.text.strip()
    else:
        product_data['title'] = title_link.text.strip()

    href = title_link.get('href')
    if href:
        # Handle relative URLs
        if href.startswith('/') and base_url:
            href = urljoin(base_url, href)
        product_data['url'] = format_canonical_url(href, product_data['asin'], country_code)


def _fill_search_pricing(container: Any, product_data: Dict) -> None:
    """Set price, currency, original_price and discount_percent on product_data when found."""
    # First, look for the a-price structure (most common)
    price_element = container.select_one('.a-price .a-offscreen')
    if price_element:
        price_text = price_element.text.strip()
        currency_match = re.search(r'^[^\d]+', price_text)
        if currency_match:
            product_data['currency'] = currency_match.group().strip()
        price = _search_number(price_text, "price string")
        if price is not None:
            product_data['price'] = price

    # If price not found, assemble it from the whole/fraction parts
    if 'price' not in product_data:
        price_whole = container.select_one('.a-price-whole')
        price_fraction = container.select_one('.a-price-fraction')
        if price_whole:
            # The whole part may end with the decimal separator ("29.");
            # strip it so joining does not produce "29..99".
            whole_text = price_whole.text.strip().replace(',', '').rstrip('.')
            fraction_text = price_fraction.text.strip().strip('.') if price_fraction else ''
            if whole_text:
                candidate = f"{whole_text}.{fraction_text}" if fraction_text else whole_text
                try:
                    product_data['price'] = float(candidate)
                except ValueError:
                    print(f"Warning: Could not convert price parts: '{whole_text}' and '{fraction_text}'")

    # Extract currency symbol if not already found
    if 'currency' not in product_data:
        symbol_element = container.select_one('.a-price-symbol')
        if symbol_element:
            product_data['currency'] = symbol_element.text.strip()

    # Extract original price and calculate discount (if available)
    original_price_elem = container.select_one('.a-price.a-text-price .a-offscreen')
    if original_price_elem:
        original_price = _search_number(original_price_elem.text.strip(), "original price string")
        if original_price is not None:
            product_data['original_price'] = original_price
            current_price = product_data.get('price')
            # Guard original_price > 0: a zero list price would raise
            # ZeroDivisionError and drop the whole listing.
            if current_price is not None and current_price > 0 and original_price > 0:
                product_data['discount_percent'] = round(100 - (current_price / original_price * 100))

    # Extract discount percentage directly if it could not be calculated
    if 'discount_percent' not in product_data:
        discount_element = container.select_one('span:-soup-contains("% off")')
        if discount_element:
            discount_match = re.search(r'(\d+)%', discount_element.text)
            if discount_match:
                product_data['discount_percent'] = int(discount_match.group(1))


def _search_image_url(container: Any) -> Optional[str]:
    """Return the listing image URL, preferring the last srcset candidate over src."""
    img_element = _first_search_match(container, _SEARCH_IMAGE_SELECTORS)
    if img_element is None:
        return None

    srcset = img_element.get('srcset')
    if srcset:
        # The last srcset entry is usually the highest resolution
        highest_res = srcset.split(',')[-1].strip().split(' ')[0]
        if highest_res:
            return highest_res

    # Fallback to src attribute
    return img_element.get('src') or None


def _search_rating(container: Any) -> Optional[float]:
    """Return the star rating of a listing as a float, or None if not found."""
    rating_element = _first_search_match(container, _SEARCH_RATING_SELECTORS)
    if rating_element is None:
        return None

    aria_label = rating_element.get('aria-label')
    alt_text = rating_element.get('alt')
    if aria_label and 'out of 5' in aria_label:
        rating_text = aria_label
    elif alt_text and 'out of 5' in alt_text:
        rating_text = alt_text
    else:
        rating_text = rating_element.text.strip()
        # If no text, try parent
        if not rating_text and rating_element.parent:
            rating_text = rating_element.parent.text.strip()

    # Prefer the number in an explicit "N out of 5" phrase; only fall back
    # to the first number in the text when that phrase is absent.
    rating_match = (
        re.search(r'([\d\.]+)\s+out\s+of\s+5', rating_text)
        or re.search(r'([\d\.]+)', rating_text)
    )
    if not rating_match:
        return None
    rating_str = rating_match.group(1)
    if not rating_str or rating_str == ".":
        return None
    try:
        return float(rating_str)
    except ValueError:
        print(f"Warning: Could not convert rating string: '{rating_str}'")
        return None


def _search_reviews_count(container: Any) -> Optional[int]:
    """Return the number of reviews/ratings of a listing, expanding K/M suffixes."""
    reviews_element = _first_search_match(container, _SEARCH_REVIEWS_SELECTORS)
    if reviews_element is None:
        return None

    # Try aria-label first, otherwise use text content
    reviews_text = reviews_element.get('aria-label') or reviews_element.text.strip()

    # Capture the suffix attached to the number itself. Testing for 'K'/'M'
    # anywhere in the text would multiply counts whenever the surrounding
    # words happen to contain those letters.
    reviews_match = re.search(r'([\d,\.]+)(?:([KkM])(?![A-Za-z]))?', reviews_text)
    if not reviews_match:
        return None
    count_text = reviews_match.group(1).replace(',', '')
    if not count_text or count_text == ".":
        return None
    try:
        count = float(count_text)
    except ValueError:
        print(f"Warning: Could not convert reviews count: '{count_text}'")
        return None

    suffix = reviews_match.group(2)
    if suffix in ('K', 'k'):
        count *= 1000
    elif suffix == 'M':
        count *= 1000000
    return int(count)


def _search_color_variants(container: Any, base_url: Optional[str], country_code: Optional[str]) -> List[Dict]:
    """Return name/url/asin dictionaries for each named color swatch of a listing."""
    color_variants = []
    for swatch in container.select('.s-color-swatch-outer-circle'):
        color_link = swatch.select_one('a')
        if not color_link:
            continue
        color_name = color_link.get('aria-label', '')
        if not color_name:
            continue
        color_url = color_link.get('href', '')

        # Try to extract ASIN from URL
        asin_match = re.search(r'/dp/([A-Z0-9]{10})', color_url) if color_url else None
        color_asin = asin_match.group(1) if asin_match else None

        if color_url.startswith('/') and base_url:
            color_url = urljoin(base_url, color_url)

        # Format the canonical URL for color variant
        if color_asin:
            color_url = format_canonical_url(color_url, color_asin, country_code)

        color_variants.append({
            'name': color_name,
            'url': color_url,
            'asin': color_asin,
        })
    return color_variants
547
+
548
+
549
def parse_pagination_url(html_content: str, base_url: str = None) -> Optional[str]:
    """
    Extract the URL for the next page from search results pagination.

    Args:
        html_content (str): Raw HTML content of the search results page
        base_url (str, optional): Base URL for resolving relative URLs; when
            omitted the href is returned exactly as found in the page

    Returns:
        Optional[str]: URL of the next page, or None if there isn't one
    """
    if not html_content:
        return None

    soup = BeautifulSoup(html_content, 'lxml')

    # Try multiple selectors for the pagination "Next" button, most specific
    # first. ":-soup-contains" is used instead of the deprecated ":contains"
    # alias, matching the selectors used elsewhere in this module.
    next_link = (
        soup.select_one('a.s-pagination-next:not(.s-pagination-disabled)') or
        soup.select_one('li.a-last:not(.a-disabled) a') or
        soup.select_one('a:has(span:-soup-contains("Next"))') or
        soup.select_one('a[aria-label="Go to next page"]')
    )
    if not next_link:
        return None

    next_url = next_link.get('href')
    if not next_url:
        return None

    # urljoin leaves absolute hrefs untouched and resolves every relative
    # form (including ones that do not start with "/", e.g. "?page=2").
    return urljoin(base_url, next_url) if base_url else next_url