amzpy 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- amzpy/__init__.py +2 -1
- amzpy/parser.py +552 -31
- amzpy/scraper.py +205 -34
- amzpy/session.py +222 -0
- amzpy/utils.py +43 -1
- amzpy-0.2.0.dist-info/METADATA +221 -0
- amzpy-0.2.0.dist-info/RECORD +11 -0
- {amzpy-0.1.1.dist-info → amzpy-0.2.0.dist-info}/WHEEL +1 -1
- amzpy/engine.py +0 -36
- amzpy-0.1.1.dist-info/METADATA +0 -93
- amzpy-0.1.1.dist-info/RECORD +0 -11
- {amzpy-0.1.1.dist-info → amzpy-0.2.0.dist-info}/top_level.txt +0 -0
amzpy/scraper.py
CHANGED
@@ -1,52 +1,223 @@
 """
+Amazon Product Scraper Module
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This is the main module for the Amazon Product API using curl_cffi.
+It orchestrates the scraping workflow including:
+- Managing sessions through AmzSession
+- Fetching product details
+- Searching for products
+- Handling configuration
+
+The AmazonScraper class provides a simple interface for users
+while handling the complexity of Amazon's anti-bot measures underneath.
 """
 
-from typing import Dict, Optional
+from typing import Dict, Optional, List, Union, Any
+import re
+
+from amzpy.session import AmzSession, DEFAULT_CONFIG
+from amzpy.parser import parse_product_page, parse_search_page, parse_pagination_url
 from amzpy.utils import parse_amazon_url
 
+
 class AmazonScraper:
-    """
+    """
+    Main scraper class for Amazon product data using curl_cffi.
+
+    This class provides a high-level interface to:
+    - Fetch detailed information for individual products
+    - Search for products and extract listings
+    - Configure scraping behavior
 
-        self.base_url = f"https://www.amazon.{country_code}/"
-        self.engine = RequestEngine()
+    It handles browser impersonation, CAPTCHA avoidance, and parsing
+    through the session and parser modules.
 
+    Attributes:
+        country_code (str): Amazon domain country code (e.g. "com", "in")
+        session (AmzSession): Session manager for handling requests
+        user_config (dict): User configuration parameters
+    """
+
+    def __init__(self, country_code: str = "com", impersonate: str = None, proxies: Optional[Dict] = None):
+        """
+        Initialize the Amazon scraper with the specified configuration.
+
+        Args:
+            country_code (str): Amazon domain country code (e.g. "com", "in")
+            impersonate (str, optional): Browser to impersonate (e.g. "chrome119")
+            proxies (Dict, optional): Proxy configuration for requests
+        """
+        self.country_code = country_code
+        self.user_config = DEFAULT_CONFIG.copy()
+        self.session = AmzSession(
+            country_code=country_code,
+            impersonate=impersonate,
+            proxies=proxies
+        )
+
+        print(f"AmazonScraper initialized for amazon.{country_code}")
+
+    def config(self, config_str: str = None, **kwargs) -> Dict:
         """
+        Configure scraper parameters using either a string or keyword arguments.
+
+        Examples:
+            # Using string configuration
+            scraper.config('MAX_RETRIES = 5, REQUEST_TIMEOUT = 30')
+
+            # Using keyword arguments
+            scraper.config(MAX_RETRIES=5, REQUEST_TIMEOUT=30)
 
         Args:
+            config_str (str, optional): Configuration string in format 'PARAM1 = value1, PARAM2 = value2'
+            **kwargs: Configuration parameters as keyword arguments
 
         Returns:
-            Dict:
-            None: If URL is invalid or error occurs
+            Dict: Current configuration after updates
         """
+        # Process string configuration if provided
+        if config_str:
+            # Parse the configuration string
+            try:
+                parts = config_str.split(',')
+                for part in parts:
+                    key, value = part.split('=', 1)
+                    key = key.strip()
+                    value = eval(value.strip())  # Safely evaluate the value
+                    self.user_config[key] = value
+            except Exception as e:
+                print(f"Error parsing configuration string: {e}")
+                print("Format should be: 'PARAM1 = value1, PARAM2 = value2'")
+
+        # Process keyword arguments if provided
+        if kwargs:
+            self.user_config.update(kwargs)
+
+        # Update the session configuration
+        self.session.update_config(**self.user_config)
+
+        return self.user_config
+
+    def get_product_details(self, url: str) -> Optional[Dict]:
+        """
+        Fetch and parse details for a product using its Amazon URL.
+
+        This method:
+        1. Parses the product URL to extract the ASIN
+        2. Constructs a canonical product URL
+        3. Fetches the product page HTML
+        4. Parses the HTML to extract structured data
+
+        Args:
+            url (str): Amazon product URL (any format with a valid ASIN)
 
+        Returns:
+            Dict: Extracted product details (title, price, etc.)
+            None: If URL is invalid or scraping fails
+        """
+        # Parse the URL to extract base_url and product_id (ASIN)
+        parsed_info = parse_amazon_url(url)
+        if not parsed_info:
+            print(f"Invalid Amazon product URL: {url}")
+            return None
+
+        base_url, product_id = parsed_info
+        product_url = f"{base_url}dp/{product_id}"  # Construct canonical URL
+        print(f"Fetching product data for ASIN: {product_id}")
+
+        # Fetch the product page using the session
+        response = self.session.get(product_url)
+        if not response or not response.text:
+            print(f"Failed to fetch product page for: {product_url}")
+            return None
+
+        # Parse the product page HTML, passing country code for URL formatting
+        product_data = parse_product_page(
+            html_content=response.text,
+            url=product_url,
+            country_code=self.country_code
+        )
 
-        html_content = self.engine.get(product_url)
-        if not html_content:
+        if not product_data:
+            print(f"Failed to extract product data from: {product_url}")
             return None
 
+        print(f"Successfully extracted data for: {product_data.get('title', 'Unknown Product')[:50]}...")
+        return product_data
+
+    def search_products(self, query: str = None, search_url: str = None, max_pages: int = 1) -> List[Dict]:
+        """
+        Search for products on Amazon and extract product listings.
+
+        This method supports two search approaches:
+        1. Using a search query (e.g., "wireless headphones")
+        2. Using a pre-constructed search URL (e.g., category pages, filtered searches)
+
+        It will automatically paginate through results up to max_pages.
+
+        Args:
+            query (str, optional): Search query text (ignored if search_url is provided)
+            search_url (str, optional): Pre-constructed search URL (takes precedence over query)
+            max_pages (int): Maximum number of pages to scrape (default: 1)
+
+        Returns:
+            List[Dict]: List of product data dictionaries from search results
+            Empty list: If search fails or no products are found
+        """
+        # Validate that we have either a query or a search URL
+        if not query and not search_url:
+            print("Error: Either a search query or search URL must be provided")
+            return []
+
+        # Construct search URL if only query was provided
+        if not search_url and query:
+            search_url = f"https://www.amazon.{self.country_code}/s?k={query.replace(' ', '+')}"
+
+        print(f"Starting product search: {search_url}")
+
+        all_products = []  # Collect products from all pages
+        current_url = search_url
+        current_page = 1
+
+        # Paginate through search results
+        while current_url and current_page <= max_pages:
+            print(f"\nScraping search page {current_page}/{max_pages}: {current_url}")
+
+            # Fetch the search page
+            response = self.session.get(current_url)
+            if not response or not response.text:
+                print(f"Failed to fetch search page: {current_url}")
+                break
+
+            # Parse products from the current page, passing country code for URL formatting
+            base_url = f"https://www.amazon.{self.country_code}"
+            products = parse_search_page(
+                response.text,
+                base_url,
+                country_code=self.country_code
+            )
+
+            # Check if we got valid results
+            if not products:
+                print(f"No products found on page {current_page} (or page was blocked)")
+                break
+
+            print(f"Found {len(products)} products on page {current_page}")
+            all_products.extend(products)
+
+            # Stop if we've reached the requested number of pages
+            if current_page >= max_pages:
+                break
+
+            # Get URL for the next page
+            next_url = parse_pagination_url(response.text, base_url)
+            if not next_url:
+                print("No next page found. End of results.")
+                break
+
+            current_url = next_url
+            current_page += 1
+
+        print(f"\nSearch completed. Total products found: {len(all_products)}")
+        return all_products
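For reference, a minimal usage sketch of the 0.2.0 scraper API defined above. It assumes AmazonScraper is imported from amzpy.scraper (the updated amzpy/__init__.py is not shown in this diff); the ASIN and query below are placeholders.

from amzpy.scraper import AmazonScraper

scraper = AmazonScraper(country_code="com")        # targets amazon.com
scraper.config(MAX_RETRIES=5, REQUEST_TIMEOUT=30)  # forwarded to the session

# Fetch one product; any URL containing a valid ASIN is accepted
product = scraper.get_product_details("https://www.amazon.com/dp/B0EXAMPLE1")  # placeholder ASIN
if product:
    print(product.get("title"), product.get("price"))

# Search with pagination (scrapes up to 2 result pages)
results = scraper.search_products(query="wireless headphones", max_pages=2)
print(f"Collected {len(results)} products")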
amzpy/session.py
ADDED
@@ -0,0 +1,222 @@
+"""
+Amazon Session Manager Module
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This module provides a robust session management system for Amazon scraping.
+It handles:
+- Browser impersonation with curl_cffi
+- Request retries with intelligent backoff
+- CAPTCHA/block detection and avoidance
+- User-agent rotation with fake_useragent
+- Proxy support
+"""
+
+import random
+import time
+from typing import Dict, Optional, Tuple, Any, Union
+
+import curl_cffi.requests
+from curl_cffi.requests.errors import RequestsError
+from fake_useragent import UserAgent
+
+# Default configuration (can be overridden by user)
+DEFAULT_CONFIG = {
+    'MAX_RETRIES': 3,
+    'REQUEST_TIMEOUT': 25,
+    'DELAY_BETWEEN_REQUESTS': (2, 5),
+    'DEFAULT_IMPERSONATE': 'chrome119'  # part of curl_cffi's impersonation
+}
+
+# Default header template
+DEFAULT_HEADERS = {
+    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+    'Accept-Language': 'en-US,en;q=0.9',
+    'Upgrade-Insecure-Requests': '1',
+    'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
+    'Sec-Ch-Ua-Mobile': '?0',
+    'Sec-Ch-Ua-Platform': '"Windows"',
+    'Sec-Fetch-Dest': 'document',
+    'Sec-Fetch-Mode': 'navigate',
+    'Sec-Fetch-Site': 'none',
+    'Sec-Fetch-User': '?1',
+}
+
+
+class AmzSession:
+    """
+    Enhanced session manager using curl_cffi for Amazon requests.
+
+    This class implements sophisticated request handling including:
+    - Browser fingerprint spoofing (via curl_cffi impersonation)
+    - Randomized user agents (via fake_useragent)
+    - CAPTCHA/anti-bot detection and avoidance
+    - Intelligent retry logic with exponential backoff
+    - Proxy support for IP rotation
+
+    Attributes:
+        country_code (str): Amazon domain country code (e.g., "com", "in", "co.uk")
+        base_url (str): Constructed base URL for the Amazon domain
+        session (curl_cffi.requests.Session): The curl_cffi session instance
+        config (dict): Configuration parameters for request behavior
+        ua_generator (UserAgent): User agent generator for browser fingerprinting
+    """
+
+    def __init__(self, country_code: str = "com",
+                 impersonate: str = None,
+                 proxies: Optional[Dict] = None,
+                 config: Optional[Dict] = None):
+        """
+        Initialize the Amazon session manager.
+
+        Args:
+            country_code (str): Amazon domain country code (e.g. "com", "in")
+            impersonate (str, optional): Browser to impersonate (e.g. "chrome119")
+            proxies (Dict, optional): Proxy configuration for requests
+            config (Dict, optional): Override default configuration parameters
+        """
+        # Initialize country and base URL
+        self.country_code = country_code
+        self.base_url = f"https://www.amazon.{self.country_code}/"
+
+        # Set up configuration (with user overrides if provided)
+        self.config = DEFAULT_CONFIG.copy()
+        if config:
+            self.config.update(config)
+
+        # Initialize fake_useragent with common browser and OS combinations
+        self.ua_generator = UserAgent(browsers=['Chrome', 'Edge', 'Safari'],
+                                      os=['Windows', 'MacOS', 'Linux'])
+
+        # Create curl_cffi session
+        self.session = curl_cffi.requests.Session()
+
+        # Set up headers with randomized user agent
+        headers = DEFAULT_HEADERS.copy()
+        headers['User-Agent'] = self.ua_generator.random
+        self.session.headers = headers
+
+        # Set browser impersonation if provided, otherwise use default
+        self.session.impersonate = impersonate or self.config['DEFAULT_IMPERSONATE']
+
+        # Configure proxies if provided
+        if proxies:
+            self.session.proxies = proxies
+
+        # Print session initialization info
+        print(f"AmzSession initialized for amazon.{country_code}")
+        print(f"Impersonating: {self.session.impersonate}")
+        print(f"User-Agent: {headers['User-Agent'][:50]}...")
+        if proxies:
+            print(f"Using proxies: {proxies}")
+
+    def get(self, url: str, headers: Optional[Dict] = None) -> Optional[curl_cffi.requests.Response]:
+        """
+        Perform a GET request using the curl_cffi session with smart retries.
+
+        This method intelligently handles:
+        - URL normalization (relative -> absolute)
+        - Header merging
+        - Random delays between requests
+        - CAPTCHA and anti-bot detection
+        - Automatic retries with exponential backoff
+        - Error handling for network issues
+
+        Args:
+            url (str): URL to fetch (absolute or relative to base_url)
+            headers (Dict, optional): Additional headers to merge with defaults
+
+        Returns:
+            Optional[curl_cffi.requests.Response]: Response object or None if all retries failed
+        """
+        # Normalize URL (handle both absolute and relative URLs)
+        if not url.startswith("http"):
+            if url.startswith("/"):
+                url = f"{self.base_url.rstrip('/')}{url}"
+            else:
+                url = f"{self.base_url}{url}"
+
+        # Merge headers with fresh random user agent for each request
+        merged_headers = self.session.headers.copy()
+        merged_headers['User-Agent'] = self.ua_generator.random
+        if headers:
+            merged_headers.update(headers)
+
+        # Extract configuration for use in the retry loop
+        max_retries = self.config['MAX_RETRIES']
+        timeout = self.config['REQUEST_TIMEOUT']
+        delay_range = self.config['DELAY_BETWEEN_REQUESTS']
+
+        # Retry loop with exponential backoff
+        for attempt in range(max_retries + 1):
+            try:
+                # Calculate delay with some randomization (increases with each attempt)
+                delay_factor = 1 + (attempt * 0.5)  # Exponential backoff factor
+                min_delay, max_delay = delay_range
+                delay = random.uniform(min_delay * delay_factor, max_delay * delay_factor)
+
+                # Log attempt information
+                print(f"Request attempt {attempt+1}/{max_retries+1}: GET {url} (delay: {delay:.2f}s)")
+                time.sleep(delay)
+
+                # Make the actual request using curl_cffi
+                response = self.session.get(
+                    url,
+                    headers=merged_headers,
+                    timeout=timeout,
+                    allow_redirects=True
+                )
+
+                # Handle HTTP error codes
+                if response.status_code != 200:
+                    print(f"Non-200 status code: {response.status_code}")
+
+                    # Handle server errors specifically (5xx)
+                    if 500 <= response.status_code < 600 and attempt < max_retries:
+                        print(f"Server error {response.status_code}, retrying...")
+                        continue
+
+                    # For other status codes, continue but warn
+                    print(f"Warning: Received HTTP {response.status_code} for {url}")
+
+                # Check for CAPTCHA/blocking patterns in the content
+                if "captcha" in response.text.lower() or "api-services-support@amazon.com" in response.text:
+                    print("CAPTCHA or anti-bot measure detected in response")
+
+                    if attempt < max_retries:
+                        # Apply a longer delay before the next retry for anti-bot
+                        captcha_delay = delay * 3
+                        print(f"Detected anti-bot measure. Waiting {captcha_delay:.2f}s before retry")
+                        time.sleep(captcha_delay)
+                        continue
+
+                    print("Failed to bypass anti-bot measures after all retries")
+
+                # If everything is good, return the response
+                print(f"Request successful: {url} (Status: {response.status_code})")
+                return response
+
+            except RequestsError as e:
+                print(f"Network error on attempt {attempt+1}: {e}")
+                if attempt == max_retries:
+                    print(f"Max retries reached. Network error: {e}")
+                    return None
+                time.sleep(delay * 2)  # Longer delay after network error
+
+            except Exception as e:
+                print(f"Unexpected error on attempt {attempt+1}: {e}")
+                if attempt == max_retries:
+                    print(f"Max retries reached. Error: {e}")
+                    return None
+                time.sleep(delay * 2)
+
+        return None
+
+    def update_config(self, **kwargs):
+        """
+        Update session configuration parameters.
+
+        Args:
+            **kwargs: Configuration key-value pairs to update
+        """
+        self.config.update(kwargs)
+        print(f"Updated session configuration: {kwargs}")
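AmzSession can also be driven directly, without going through AmazonScraper; a minimal sketch based on the class above, with a placeholder proxy URL and ASIN:

from amzpy.session import AmzSession

session = AmzSession(
    country_code="in",
    impersonate="chrome119",
    proxies={
        "http": "http://user:pass@proxy.example:8080",   # placeholder proxy
        "https": "http://user:pass@proxy.example:8080",
    },
)
session.update_config(MAX_RETRIES=5, DELAY_BETWEEN_REQUESTS=(1, 3))

# Relative URLs are resolved against https://www.amazon.in/
response = session.get("/dp/B0EXAMPLE1")  # placeholder ASIN
if response is not None:
    print(response.status_code, len(response.text))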
amzpy/utils.py
CHANGED
@@ -35,4 +35,46 @@ def parse_amazon_url(url: str) -> Optional[Tuple[str, str]]:
     # Extract product ID
     product_id = match.group(2)
 
-    return base_url, product_id
+    return base_url, product_id
+
+def format_canonical_url(url: str, asin: str, country_code: str = None) -> str:
+    """
+    Format a canonical Amazon product URL in the form amazon.{country}/dp/{asin}
+
+    Args:
+        url (str): Original Amazon URL
+        asin (str): ASIN of the product
+        country_code (str, optional): Country code (e.g., "com", "in")
+
+    Returns:
+        str: Canonical URL
+    """
+    if not asin:
+        return url  # Return original if no ASIN available
+
+    # If country_code is not provided, try to extract it from the original URL
+    if not country_code:
+        try:
+            parsed_url = urlparse(url)
+            domain_parts = parsed_url.netloc.split('.')
+            # Extract country code from domain (e.g., www.amazon.com -> com)
+            if len(domain_parts) >= 3 and 'amazon' in domain_parts:
+                amazon_index = domain_parts.index('amazon')
+                if amazon_index + 1 < len(domain_parts):
+                    country_code = domain_parts[amazon_index + 1]
+        except Exception:
+            country_code = "com"  # Default to .com if extraction fails
+
+    # Default to .com if still no country code
+    if not country_code:
+        country_code = "com"
+
+    # Create canonical URL
+    return f"https://www.amazon.{country_code}/dp/{asin}"
+
+# Function to extract brand name from text
+def extract_brand_name(text):
+    match = re.search(r'visit the (.+?) store', text, re.IGNORECASE)
+    if match:
+        return match.group(1).strip()
+    return None