amzpy 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- amzpy/__init__.py +2 -1
- amzpy/parser.py +552 -31
- amzpy/scraper.py +205 -34
- amzpy/session.py +226 -0
- amzpy/utils.py +43 -1
- amzpy-0.2.1.dist-info/METADATA +221 -0
- amzpy-0.2.1.dist-info/RECORD +11 -0
- {amzpy-0.1.2.dist-info → amzpy-0.2.1.dist-info}/WHEEL +1 -1
- amzpy/engine.py +0 -36
- amzpy-0.1.2.dist-info/METADATA +0 -93
- amzpy-0.1.2.dist-info/RECORD +0 -11
- {amzpy-0.1.2.dist-info → amzpy-0.2.1.dist-info}/top_level.txt +0 -0
amzpy/__init__.py
CHANGED
@@ -3,6 +3,7 @@ AmzPy - Amazon Product Scraper
|
|
3
3
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
4
4
|
|
5
5
|
A lightweight Python library for scraping product information from Amazon.
|
6
|
+
Now using curl_cffi for better anti-bot protection.
|
6
7
|
|
7
8
|
Basic usage:
|
8
9
|
>>> from amzpy import AmazonScraper
|
@@ -16,6 +17,6 @@ Basic usage:
|
|
16
17
|
|
17
18
|
from .scraper import AmazonScraper
|
18
19
|
|
19
|
-
__version__ = "0.
|
20
|
+
__version__ = "0.2.0"
|
20
21
|
__author__ = "Anil Sardiwal"
|
21
22
|
__license__ = "MIT"
|
amzpy/parser.py
CHANGED
@@ -1,59 +1,580 @@
|
|
1
|
+
"""
|
2
|
+
Amazon HTML Parsing Module
|
3
|
+
~~~~~~~~~~~~~~~~~~~~~~~~~
|
4
|
+
|
5
|
+
This module contains parsing functions for Amazon pages:
|
6
|
+
- Product detail pages (individual products)
|
7
|
+
- Search results pages (listings of products)
|
8
|
+
|
9
|
+
It uses BeautifulSoup to extract structured data from Amazon's HTML.
|
10
|
+
"""
|
11
|
+
|
12
|
+
|
13
|
+
import re
|
14
|
+
import json
|
15
|
+
from urllib.parse import urljoin, urlparse
|
1
16
|
from bs4 import BeautifulSoup
|
2
|
-
from typing import Dict, Optional
|
3
|
-
|
17
|
+
from typing import Dict, Optional, TYPE_CHECKING, Any, List, Tuple
|
18
|
+
|
19
|
+
# Using string annotation to avoid circular imports
|
20
|
+
if TYPE_CHECKING:
|
21
|
+
from amzpy.session import AmzSession
|
22
|
+
|
23
|
+
from amzpy.utils import extract_brand_name, format_canonical_url
|
24
|
+
|
4
25
|
|
5
|
-
def parse_product_page(html_content: str, url: str = None,
|
26
|
+
def parse_product_page(html_content: str, url: Optional[str] = None, country_code: Optional[str] = None) -> Optional[Dict]:
    """
    Parse Amazon product page HTML and extract structured product data.

    Extracted fields: title, price, currency, brand, main image URL, ASIN,
    rating and the (canonical, when an ASIN is known) product URL.

    Args:
        html_content (str): Raw HTML content of the product page
        url (str, optional): Product URL. The ASIN is read from its
            ``/dp/<ASIN>`` path segment, and it is returned unchanged as
            ``"url"`` when no ASIN can be found.
        country_code (str, optional): Country code forwarded to
            ``format_canonical_url``

    Returns:
        Dict: Keys ``title``, ``price``, ``img_url``, ``currency``, ``brand``,
            ``url``, ``asin`` and ``rating``. A field that could not be
            located on the page is ``None``.
        None: If the HTML is empty, looks like a CAPTCHA/block page, or an
            unexpected error occurs while parsing
    """
    if not html_content:
        print("Error: Received empty HTML content")
        return None

    # Plain substring test first: a block page never needs a DOM, so do not
    # pay for parsing it.
    if "captcha" in html_content.lower() or "api-services-support@amazon.com" in html_content:
        print("Possible CAPTCHA or block page detected in HTML content")
        return None

    # Use lxml parser for better performance
    soup = BeautifulSoup(html_content, 'lxml')

    try:
        # Extract product title
        title_element = soup.select_one('#productTitle')
        title = title_element.text.strip() if title_element else None

        # The first off-screen price span serves as the fallback source for
        # both the amount and the currency symbol.
        offscreen_element = soup.select_one('span.a-offscreen')
        offscreen_text = offscreen_element.text.strip() if offscreen_element else ''

        # Extract price: split whole/fraction markup first, then the
        # off-screen text (e.g., "$29.99" -> 29.99).
        price = _product_price_from_parts(
            soup.select_one('.a-price-whole'),
            soup.select_one('.a-price-fraction'),
        )
        if price is None and offscreen_text:
            price_match = re.search(r'[\d,]+\.?\d*', offscreen_text)
            if price_match:
                price = _product_number(price_match.group().replace(',', ''))

        # Extract currency symbol
        currency_element = soup.select_one('.a-price-symbol')
        currency = currency_element.text.strip() if currency_element else None

        # Fall back to whatever precedes the first digit of the off-screen price
        if not currency and price is not None and offscreen_text:
            currency_match = re.search(r'^[^\d]+', offscreen_text)
            if currency_match:
                currency = currency_match.group().strip()

        # Extract brand name
        brand_name = None
        brand_element = soup.select_one('#bylineInfo')
        if brand_element:
            brand_name = extract_brand_name(brand_element.text.strip())

        # Try the product details bullets if the byline gave nothing
        if not brand_name:
            for bullet in soup.select('#detailBullets_feature_div li'):
                if 'brand' not in bullet.text.lower():
                    continue
                brand_value = bullet.select_one('.a-text-bold + span')
                if brand_value:
                    brand_name = brand_value.text.strip()
                    break

        # Extract main product image
        img_element = soup.select_one('#landingImage') or soup.select_one('#imgBlkFront')
        img_url = img_element.get('src') if img_element else None

        # Without a usable src, fall back to the hi-res data attributes
        if img_element and not img_url:
            img_url = img_element.get('data-old-hires') or _product_largest_image(
                img_element.get('data-a-dynamic-image')
            )

        # Extract ASIN (Amazon Standard Identification Number)
        asin = None
        if url:
            asin_match = re.search(r'/dp/([A-Z0-9]{10})', url)
            if asin_match:
                asin = asin_match.group(1)

        # Extract ratings if available
        rating = None
        rating_element = soup.select_one('#acrPopover') or soup.select_one('span.a-icon-alt')
        if rating_element:
            rating_text = rating_element.get('title', '') or rating_element.text
            # Require a well-formed number so float() cannot raise here
            rating_match = re.search(r'(\d+(?:\.\d+)?)\s+out\s+of\s+5', rating_text)
            if rating_match:
                rating = float(rating_match.group(1))

        # Format canonical URL if ASIN is available
        canonical_url = format_canonical_url(url, asin, country_code) if asin else url

        # Build the final product data dictionary
        return {
            "title": title,
            "price": price,
            "img_url": img_url,
            "currency": currency,
            "brand": brand_name,
            "url": canonical_url,
            "asin": asin,
            "rating": rating,
        }

    except Exception as e:
        # Boundary handler: callers rely on None instead of an exception
        print(f"Error parsing product page: {e}")
        return None


def _product_number(text: Optional[str]) -> Optional[float]:
    """Convert *text* to a float, returning None when it is not numeric."""
    try:
        return float(text)
    except (TypeError, ValueError):
        return None


def _product_price_from_parts(whole_element: Any, fraction_element: Any) -> Optional[float]:
    """Combine the whole and fraction price elements into a float, or None."""
    if not whole_element:
        return None

    # The whole-part text may end with the decimal separator (e.g. "29."),
    # which would otherwise produce an unparsable "29..99".
    whole_text = whole_element.text.strip().replace(',', '').rstrip('.')
    fraction_text = fraction_element.text.strip() if fraction_element else ''

    if fraction_text.isdecimal():
        return _product_number(f"{whole_text}.{fraction_text}")
    return _product_number(whole_text)


def _product_largest_image(dynamic_image_json: Optional[str]) -> Optional[str]:
    """Return the URL with the largest width*height from a JSON url->[w, h] map."""
    if not dynamic_image_json:
        return None
    try:
        candidates = json.loads(dynamic_image_json)
    except (TypeError, ValueError):
        return None
    if not isinstance(candidates, dict) or not candidates:
        return None

    def area(item: Tuple[str, Any]) -> float:
        dims = item[1]
        if (
            isinstance(dims, (list, tuple))
            and len(dims) >= 2
            and all(isinstance(d, (int, float)) for d in dims[:2])
        ):
            return dims[0] * dims[1]
        return 0

    # max() keeps the first entry on ties, so a map without usable sizes
    # still yields its first URL.
    return max(candidates.items(), key=area)[0]
|
175
|
+
|
176
|
+
|
177
|
+
def parse_search_page(html_content: str, base_url: Optional[str] = None, country_code: Optional[str] = None) -> List[Dict]:
    """
    Parse Amazon search results page HTML and extract product listings.

    Each returned dictionary always has ``asin`` and ``title``; the other keys
    are only present when the corresponding markup was found:
    ``url``, ``brand``, ``price``, ``currency``, ``original_price``,
    ``discount_percent``, ``img_url``, ``rating``, ``reviews_count``,
    ``color_variants``, ``badge``, ``delivery_info`` and ``deal``.
    ``prime`` is always present as a boolean.

    Args:
        html_content (str): Raw HTML content of the search results page
        base_url (str, optional): Base URL for resolving relative URLs
        country_code (str, optional): Country code forwarded to
            ``format_canonical_url``

    Returns:
        List[Dict]: List of extracted product data dictionaries
        Empty list: If the HTML is empty, looks like a CAPTCHA/block page, or
            parsing fails
    """
    if not html_content:
        print("Error: Received empty HTML content for search page")
        return []

    # Plain substring test first: a block page never needs a DOM, so do not
    # pay for parsing it.
    if "captcha" in html_content.lower() or "api-services-support@amazon.com" in html_content:
        print("CAPTCHA or block page detected in search results")
        return []

    # Use lxml parser for better performance on large search pages
    soup = BeautifulSoup(html_content, 'lxml')

    results = []

    try:
        # Amazon has multiple result layouts; go from most to least specific
        product_containers = (
            soup.select('div[data-component-type="s-search-result"]')
            or soup.select('.s-result-item[data-asin]')
            or soup.select('.s-result-item')
        )

        print(f"Found {len(product_containers)} potential product containers")

        for container in product_containers:
            try:
                product_data = _search_parse_container(container, base_url, country_code)
            except Exception as e:
                # One malformed listing must not discard the rest of the page
                print(f"Error parsing individual search result: {e}")
                continue

            if product_data:
                results.append(product_data)

        return results

    except Exception as e:
        print(f"Error parsing search page: {e}")
        return []


# Selector lists are tried in order; the first usable hit wins.
_SEARCH_TITLE_SELECTORS = (
    'h2 a.a-link-normal',             # Common layout
    '.a-text-normal[href]',           # Alternative layout
    'h2.a-size-base-plus a',          # Layout from example
    'a.s-line-clamp-2',               # Another layout from example
    '.a-text-normal[data-hover]',     # Alternative layout
    '.a-size-base-plus[aria-label]',  # Layout with aria-label
)
_SEARCH_BRAND_SELECTORS = (
    '.a-row .a-size-base-plus.a-color-base',  # Common location
    '.a-size-base-plus:not([aria-label])',    # Alternative location
    'h2 .a-size-base-plus',                   # Format from example
    '.s-line-clamp-1 span',                   # Another common format
)
_SEARCH_IMAGE_SELECTORS = (
    'img.s-image',                    # Common layout
    '.s-image img',                   # Alternative layout
    '.a-section img[srcset]',         # Layout from example
    '.s-product-image-container img', # Another layout
)
_SEARCH_RATING_SELECTORS = (
    'i.a-icon-star-small',            # Common layout
    '.a-icon-star',                   # Alternative layout
    'span.a-icon-alt',                # Text inside span
    'i.a-star-mini-4',                # Format from example
    '[aria-label*="out of 5 stars"]', # Aria-label format
)
_SEARCH_REVIEWS_SELECTORS = (
    'span[aria-label*="reviews"]',    # Common layout
    '.a-size-base.s-underline-text',  # Format from example
    'a:-soup-contains("ratings")',    # Alternative text-based
    'a:-soup-contains("reviews")',    # Another alternative
    '.a-link-normal .a-size-base',    # Generic link to reviews
)
_SEARCH_PRIME_SELECTORS = (
    'i.a-icon-prime',                   # Common layout
    '.a-icon-prime',                    # Alternative layout
    'span:-soup-contains("Prime")',     # Text-based detection
    '.aok-relative.s-icon-text-medium', # Format from example
    '[aria-label="Prime"]',             # Aria-label based
)


def _search_float(number_text: Optional[str], what: str) -> Optional[float]:
    """Convert *number_text* to float; warn and return None when it is not numeric."""
    if not number_text or number_text == ".":
        return None
    try:
        return float(number_text)
    except ValueError:
        print(f"Warning: Could not convert {what}: '{number_text}'")
        return None


def _search_parse_container(container: Any, base_url: Optional[str], country_code: Optional[str]) -> Optional[Dict]:
    """Build the product dictionary for one result container, or None to skip it."""
    # Skip sponsored listings
    if 'AdHolder' in container.get('class', []):
        return None

    # Containers without an ASIN are dividers, banners, etc.
    asin = container.get('data-asin') or container.get('asin')
    if not asin:
        return None

    product_data = {"asin": asin}

    _search_fill_title_and_url(container, product_data, asin, base_url, country_code)
    _search_fill_brand(container, product_data)
    _search_fill_price(container, product_data)
    _search_fill_discount(container, product_data)
    _search_fill_image(container, product_data)
    _search_fill_rating(container, product_data)
    _search_fill_reviews(container, product_data)

    # Check for Prime eligibility
    product_data['prime'] = any(container.select_one(selector) for selector in _SEARCH_PRIME_SELECTORS)

    _search_fill_color_variants(container, product_data, base_url, country_code)
    _search_fill_badges(container, product_data)

    # A listing is only useful with a title (the ASIN is guaranteed above)
    if product_data.get('title'):
        return product_data
    return None


def _search_fill_title_and_url(container: Any, product_data: Dict, asin: str,
                               base_url: Optional[str], country_code: Optional[str]) -> None:
    """Set ``title`` and ``url`` from the first matching title link."""
    title_link = None
    for selector in _SEARCH_TITLE_SELECTORS:
        title_link = container.select_one(selector)
        if title_link:
            break

    if not title_link:
        return

    # Title: aria-label, then inner span, then the link's own text
    title_span = title_link.select_one('span')
    if title_link.get('aria-label'):
        product_data['title'] = title_link.get('aria-label')
    elif title_span:
        product_data['title'] = title_span.text.strip()
    else:
        product_data['title'] = title_link.text.strip()

    href = title_link.get('href')
    if href:
        # Handle relative URLs
        if href.startswith('/') and base_url:
            product_url = urljoin(base_url, href)
        else:
            product_url = href
        product_data['url'] = format_canonical_url(product_url, asin, country_code)


def _search_fill_brand(container: Any, product_data: Dict) -> None:
    """Set ``brand`` from the first selector that yields non-empty text."""
    for selector in _SEARCH_BRAND_SELECTORS:
        brand_elem = container.select_one(selector)
        if brand_elem and brand_elem.text.strip():
            product_data['brand'] = brand_elem.text.strip()
            break


def _search_fill_price(container: Any, product_data: Dict) -> None:
    """Set ``price`` and ``currency`` from the a-price markup."""
    # First, look for the a-price structure (most common)
    price_element = container.select_one('.a-price .a-offscreen')
    if price_element:
        price_text = price_element.text.strip()
        currency_match = re.search(r'^[^\d]+', price_text)
        price_match = re.search(r'[\d,]+\.?\d*', price_text)

        if currency_match:
            product_data['currency'] = currency_match.group().strip()

        if price_match:
            price = _search_float(price_match.group().replace(',', ''), "price string")
            if price is not None:
                product_data['price'] = price

    # If price not found, try the split whole/fraction elements
    if 'price' not in product_data:
        price_whole = container.select_one('.a-price-whole')
        if price_whole:
            # The whole part may end with the decimal separator ("29."),
            # which would otherwise build an unparsable "29..99".
            whole_text = price_whole.text.strip().replace(',', '').rstrip('.')
            price_fraction = container.select_one('.a-price-fraction')
            fraction_text = price_fraction.text.strip() if price_fraction else ''
            if whole_text:
                if fraction_text.isdecimal():
                    combined = f"{whole_text}.{fraction_text}"
                else:
                    # No usable fraction: keep the whole part instead of
                    # dropping the price altogether.
                    combined = whole_text
                try:
                    product_data['price'] = float(combined)
                except ValueError:
                    print(f"Warning: Could not convert price parts: '{whole_text}' and '{fraction_text}'")

    # Extract currency symbol if not already found
    if 'currency' not in product_data:
        symbol_element = container.select_one('.a-price-symbol')
        if symbol_element:
            product_data['currency'] = symbol_element.text.strip()


def _search_fill_discount(container: Any, product_data: Dict) -> None:
    """Set ``original_price`` and ``discount_percent`` when available."""
    original_price_elem = container.select_one('.a-price.a-text-price .a-offscreen')
    if original_price_elem:
        price_match = re.search(r'[\d,]+\.?\d*', original_price_elem.text.strip())
        if price_match:
            original_price = _search_float(price_match.group().replace(',', ''), "original price string")
            if original_price is not None:
                product_data['original_price'] = original_price

                # Both prices must be positive; a zero original price would
                # otherwise raise ZeroDivisionError and drop the whole item.
                if original_price > 0 and product_data.get('price', 0) > 0:
                    discount = round(100 - (product_data['price'] / original_price * 100))
                    product_data['discount_percent'] = discount

    # Extract discount percentage directly if available
    discount_text = container.select_one('span:-soup-contains("% off")')
    if discount_text and 'discount_percent' not in product_data:
        discount_match = re.search(r'(\d+)%', discount_text.text)
        if discount_match:
            product_data['discount_percent'] = int(discount_match.group(1))


def _search_fill_image(container: Any, product_data: Dict) -> None:
    """Set ``img_url``, preferring the last srcset candidate over ``src``."""
    for selector in _SEARCH_IMAGE_SELECTORS:
        img_element = container.select_one(selector)
        if not img_element:
            continue

        srcset = img_element.get('srcset')
        if srcset:
            # Last entry is usually the highest resolution; ignore empty
            # entries left by a trailing comma.
            candidates = [part.strip() for part in srcset.split(',') if part.strip()]
            if candidates:
                highest_res = candidates[-1].split(' ')[0]
                if highest_res:
                    product_data['img_url'] = highest_res

        # Fallback to src attribute
        if 'img_url' not in product_data and img_element.get('src'):
            product_data['img_url'] = img_element.get('src')

        if 'img_url' in product_data:
            break


def _search_fill_rating(container: Any, product_data: Dict) -> None:
    """Set ``rating`` from the first rating element found."""
    for selector in _SEARCH_RATING_SELECTORS:
        rating_element = container.select_one(selector)
        if not rating_element:
            continue

        aria_label = rating_element.get('aria-label')
        alt_text = rating_element.get('alt')
        if aria_label and 'out of 5' in aria_label:
            rating_text = aria_label
        elif alt_text and 'out of 5' in alt_text:
            rating_text = alt_text
        else:
            rating_text = rating_element.text.strip()
            # If no text, try parent
            if not rating_text and rating_element.parent:
                rating_text = rating_element.parent.text.strip()

        rating_match = re.search(r'([\d\.]+)(?:\s+out\s+of\s+5)?', rating_text)
        if rating_match:
            rating = _search_float(rating_match.group(1), "rating string")
            if rating is not None:
                product_data['rating'] = rating
        break


def _search_fill_reviews(container: Any, product_data: Dict) -> None:
    """Set ``reviews_count``, expanding a K/M suffix attached to the number."""
    for selector in _SEARCH_REVIEWS_SELECTORS:
        reviews_element = container.select_one(selector)
        if not reviews_element:
            continue

        # Try aria-label first, otherwise use text content
        reviews_text = reviews_element.get('aria-label') or reviews_element.text.strip()

        # The suffix must directly follow the digits; searching the whole
        # text would let the "k" in e.g. "week" multiply the count.
        reviews_match = re.search(r'([\d,\.]+)([KkM]\b)?', reviews_text)
        if reviews_match:
            count = _search_float(reviews_match.group(1).replace(',', ''), "reviews count")
            if count is not None:
                suffix = reviews_match.group(2)
                if suffix in ('K', 'k'):
                    count *= 1000
                elif suffix == 'M':
                    count *= 1000000
                # round() rather than truncation, so binary float error in
                # the multiplication cannot shave one off the result
                product_data['reviews_count'] = int(round(count))
        break


def _search_fill_color_variants(container: Any, product_data: Dict,
                                base_url: Optional[str], country_code: Optional[str]) -> None:
    """Set ``color_variants`` from the colour swatches, if any have a name."""
    color_variants = []

    for swatch in container.select('.s-color-swatch-outer-circle'):
        color_link = swatch.select_one('a')
        if not color_link:
            continue

        color_name = color_link.get('aria-label', '')
        color_url = color_link.get('href', '')
        color_asin = None

        # Try to extract ASIN from URL
        if color_url:
            asin_match = re.search(r'/dp/([A-Z0-9]{10})', color_url)
            if asin_match:
                color_asin = asin_match.group(1)

        if not color_name:
            continue

        if color_url.startswith('/') and base_url:
            color_url = urljoin(base_url, color_url)

        # Format the canonical URL for color variant
        if color_asin:
            canonical_color_url = format_canonical_url(color_url, color_asin, country_code)
        else:
            canonical_color_url = color_url

        color_variants.append({
            'name': color_name,
            'url': canonical_color_url,
            'asin': color_asin,
        })

    if color_variants:
        product_data['color_variants'] = color_variants


def _search_fill_badges(container: Any, product_data: Dict) -> None:
    """Set ``badge``, ``delivery_info`` and ``deal`` when their markup exists."""
    # "Amazon's Choice" or "Best Seller" badges
    badge_text = None
    badge_element = container.select_one('.a-badge-text') or container.select_one('[aria-label*="Choice"]')
    if badge_element:
        badge_text = badge_element.text.strip()
        if not badge_text and badge_element.get('aria-label'):
            badge_text = badge_element.get('aria-label')

    if badge_text:
        product_data['badge'] = badge_text

    # Delivery information
    delivery_element = (
        container.select_one('.a-row:-soup-contains("delivery")')
        or container.select_one('[aria-label*="delivery"]')
    )
    if delivery_element:
        product_data['delivery_info'] = delivery_element.text.strip()

    # "Deal" information
    deal_element = (
        container.select_one('span:-soup-contains("Deal")')
        or container.select_one('.a-badge:-soup-contains("Deal")')
    )
    if deal_element:
        product_data['deal'] = True
|
547
|
+
|
548
|
+
|
549
|
+
def parse_pagination_url(html_content: str, base_url: Optional[str] = None) -> Optional[str]:
    """
    Extract the URL for the next page from search results pagination.

    Args:
        html_content (str): Raw HTML content of the search results page
        base_url (str, optional): Base URL for resolving relative URLs

    Returns:
        Optional[str]: URL of the next page (resolved against ``base_url``
            when one is given), or None if there isn't one
    """
    if not html_content:
        return None

    soup = BeautifulSoup(html_content, 'lxml')

    # Try multiple selectors for the pagination "Next" button, most specific
    # first. ":-soup-contains" is used instead of the deprecated ":contains"
    # alias, matching the other selectors in this module.
    next_selectors = (
        'a.s-pagination-next:not(.s-pagination-disabled)',
        'li.a-last:not(.a-disabled) a',
        'a:has(span:-soup-contains("Next"))',
        'a[aria-label="Go to next page"]',
    )

    next_link = None
    for selector in next_selectors:
        next_link = soup.select_one(selector)
        if next_link:
            break

    if not next_link:
        return None

    next_url = next_link.get('href')
    if not next_url:
        return None

    # A fragment-only or script href is a placeholder, not a page to fetch;
    # resolving "#" against base_url would just point back at this page.
    if next_url.startswith('#') or next_url.lower().startswith('javascript:'):
        return None

    # urljoin resolves every relative form and leaves absolute URLs as-is
    return urljoin(base_url, next_url) if base_url else next_url
|