amzpy 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
amzpy/scraper.py CHANGED
@@ -1,52 +1,223 @@
  """
- The main scraper module for the Amazon Product API.
+ Amazon Product Scraper Module
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ This is the main module for the Amazon Product API using curl_cffi.
+ It orchestrates the scraping workflow including:
+ - Managing sessions through AmzSession
+ - Fetching product details
+ - Searching for products
+ - Handling configuration
+
+ The AmazonScraper class provides a simple interface for users
+ while handling the complexity of Amazon's anti-bot measures underneath.
  """

- from typing import Dict, Optional
- from amzpy.engine import RequestEngine
- from amzpy.parser import parse_product_page
+ from typing import Dict, Optional, List, Union, Any
+ import re
+
+ from amzpy.session import AmzSession, DEFAULT_CONFIG
+ from amzpy.parser import parse_product_page, parse_search_page, parse_pagination_url
  from amzpy.utils import parse_amazon_url

+
  class AmazonScraper:
-     """Main scraper class for Amazon product data"""
+     """
+     Main scraper class for Amazon product data using curl_cffi.
+
+     This class provides a high-level interface to:
+     - Fetch detailed information for individual products
+     - Search for products and extract listings
+     - Configure scraping behavior

-     def __init__(self, country_code: str = "com"):
-         """Initialize the Amazon scraper"""
-         self.base_url = f"https://www.amazon.{country_code}/"
-         self.engine = RequestEngine()
+     It handles browser impersonation, CAPTCHA avoidance, and parsing
+     through the session and parser modules.

-     def get_product_details(self, url: str, max_retries: int = 3) -> Optional[Dict]:
+     Attributes:
+         country_code (str): Amazon domain country code (e.g. "com", "in")
+         session (AmzSession): Session manager for handling requests
+         user_config (dict): User configuration parameters
+     """
+
+     def __init__(self, country_code: str = "com", impersonate: str = None, proxies: Optional[Dict] = None):
+         """
+         Initialize the Amazon scraper with the specified configuration.
+
+         Args:
+             country_code (str): Amazon domain country code (e.g. "com", "in")
+             impersonate (str, optional): Browser to impersonate (e.g. "chrome119")
+             proxies (Dict, optional): Proxy configuration for requests
+         """
+         self.country_code = country_code
+         self.user_config = DEFAULT_CONFIG.copy()
+         self.session = AmzSession(
+             country_code=country_code,
+             impersonate=impersonate,
+             proxies=proxies
+         )
+
+         print(f"AmazonScraper initialized for amazon.{country_code}")
+
+     def config(self, config_str: str = None, **kwargs) -> Dict:
          """
-         Fetch details for a product using its Amazon URL
+         Configure scraper parameters using either a string or keyword arguments.
+
+         Examples:
+             # Using string configuration
+             scraper.config('MAX_RETRIES = 5, REQUEST_TIMEOUT = 30')
+
+             # Using keyword arguments
+             scraper.config(MAX_RETRIES=5, REQUEST_TIMEOUT=30)

          Args:
-             url (str): Amazon product URL
-             max_retries (int): Maximum retry attempts if anti-bot measures detected
+             config_str (str, optional): Configuration string in format 'PARAM1 = value1, PARAM2 = value2'
+             **kwargs: Configuration parameters as keyword arguments

          Returns:
-             Dict: Product details including title, price, img_url, and currency
-             None: If URL is invalid or error occurs
+             Dict: Current configuration after updates
          """
-         parsed = parse_amazon_url(url)
-         if not parsed:
-             return None
+         # Process string configuration if provided
+         if config_str:
+             # Parse the configuration string
+             try:
+                 parts = config_str.split(',')
+                 for part in parts:
+                     key, value = part.split('=', 1)
+                     key = key.strip()
+                     value = eval(value.strip())  # Evaluate the value (note: eval is not safe on untrusted input)
+                     self.user_config[key] = value
+             except Exception as e:
+                 print(f"Error parsing configuration string: {e}")
+                 print("Format should be: 'PARAM1 = value1, PARAM2 = value2'")
+
+         # Process keyword arguments if provided
+         if kwargs:
+             self.user_config.update(kwargs)
+
+         # Update the session configuration
+         self.session.update_config(**self.user_config)
+
+         return self.user_config
+
+     def get_product_details(self, url: str) -> Optional[Dict]:
+         """
+         Fetch and parse details for a product using its Amazon URL.
+
+         This method:
+         1. Parses the product URL to extract the ASIN
+         2. Constructs a canonical product URL
+         3. Fetches the product page HTML
+         4. Parses the HTML to extract structured data
+
+         Args:
+             url (str): Amazon product URL (any format with a valid ASIN)

-         base_url, product_id = parsed
+         Returns:
+             Dict: Extracted product details (title, price, etc.)
+             None: If URL is invalid or scraping fails
+         """
+         # Parse the URL to extract base_url and product_id (ASIN)
+         parsed_info = parse_amazon_url(url)
+         if not parsed_info:
+             print(f"Invalid Amazon product URL: {url}")
+             return None
+
+         base_url, product_id = parsed_info
+         product_url = f"{base_url}dp/{product_id}"  # Construct canonical URL
+         print(f"Fetching product data for ASIN: {product_id}")
+
+         # Fetch the product page using the session
+         response = self.session.get(product_url)
+         if not response or not response.text:
+             print(f"Failed to fetch product page for: {product_url}")
+             return None
+
+         # Parse the product page HTML, passing country code for URL formatting
+         product_data = parse_product_page(
+             html_content=response.text,
+             url=product_url,
+             country_code=self.country_code
+         )

-         # Construct product URL and get HTML
-         product_url = f"{base_url}dp/{product_id}"
-         html_content = self.engine.get(product_url)
-         if not html_content:
+         if not product_data:
+             print(f"Failed to extract product data from: {product_url}")
              return None

-         # Parse the product page and return the data
-         return parse_product_page(html_content, product_url, self.engine, max_retries)
-
- def main():
-     scraper = AmazonScraper()
-     url = "https://www.amazon.in/dp/B0D4J2QDVY"
-     details = scraper.get_product_details(url, max_retries=5)
-     print("Product details:", details)
-
- if __name__ == "__main__":
-     main()
+         print(f"Successfully extracted data for: {product_data.get('title', 'Unknown Product')[:50]}...")
+         return product_data
+
+     def search_products(self, query: str = None, search_url: str = None, max_pages: int = 1) -> List[Dict]:
+         """
+         Search for products on Amazon and extract product listings.
+
+         This method supports two search approaches:
+         1. Using a search query (e.g., "wireless headphones")
+         2. Using a pre-constructed search URL (e.g., category pages, filtered searches)
+
+         It will automatically paginate through results up to max_pages.
+
+         Args:
+             query (str, optional): Search query text (ignored if search_url is provided)
+             search_url (str, optional): Pre-constructed search URL (takes precedence over query)
+             max_pages (int): Maximum number of pages to scrape (default: 1)
+
+         Returns:
+             List[Dict]: List of product data dictionaries from search results
+             Empty list: If search fails or no products are found
+         """
+         # Validate that we have either a query or a search URL
+         if not query and not search_url:
+             print("Error: Either a search query or search URL must be provided")
+             return []
+
+         # Construct search URL if only query was provided
+         if not search_url and query:
+             search_url = f"https://www.amazon.{self.country_code}/s?k={query.replace(' ', '+')}"
+
+         print(f"Starting product search: {search_url}")
+
+         all_products = []  # Collect products from all pages
+         current_url = search_url
+         current_page = 1
+
+         # Paginate through search results
+         while current_url and current_page <= max_pages:
+             print(f"\nScraping search page {current_page}/{max_pages}: {current_url}")
+
+             # Fetch the search page
+             response = self.session.get(current_url)
+             if not response or not response.text:
+                 print(f"Failed to fetch search page: {current_url}")
+                 break
+
+             # Parse products from the current page, passing country code for URL formatting
+             base_url = f"https://www.amazon.{self.country_code}"
+             products = parse_search_page(
+                 response.text,
+                 base_url,
+                 country_code=self.country_code
+             )
+
+             # Check if we got valid results
+             if not products:
+                 print(f"No products found on page {current_page} (or page was blocked)")
+                 break
+
+             print(f"Found {len(products)} products on page {current_page}")
+             all_products.extend(products)
+
+             # Stop if we've reached the requested number of pages
+             if current_page >= max_pages:
+                 break
+
+             # Get URL for the next page
+             next_url = parse_pagination_url(response.text, base_url)
+             if not next_url:
+                 print("No next page found. End of results.")
+                 break
+
+             current_url = next_url
+             current_page += 1
+
+         print(f"\nSearch completed. Total products found: {len(all_products)}")
+         return all_products
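Taken together, these methods replace the module-level `main()` demo that 0.1.2 shipped (removed above). A minimal usage sketch; the ASIN is the one from the removed demo, and the query and config values come straight from the docstring examples:

    from amzpy.scraper import AmazonScraper

    scraper = AmazonScraper(country_code="in")
    scraper.config(MAX_RETRIES=5, REQUEST_TIMEOUT=30)

    # Product lookup: any URL containing a valid ASIN is accepted
    details = scraper.get_product_details("https://www.amazon.in/dp/B0D4J2QDVY")
    print(details)

    # Search with pagination; returns a list of product dicts (empty list on failure)
    products = scraper.search_products(query="wireless headphones", max_pages=2)
    print(f"Collected {len(products)} listings")

Note the breaking change hiding in this hunk: get_product_details() no longer accepts max_retries. Retries are now governed by the session's MAX_RETRIES setting, so a 0.1.2-style call like get_product_details(url, max_retries=5) raises TypeError on 0.2.0.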
amzpy/session.py ADDED
@@ -0,0 +1,222 @@
+ """
+ Amazon Session Manager Module
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ This module provides a robust session management system for Amazon scraping.
+ It handles:
+ - Browser impersonation with curl_cffi
+ - Request retries with intelligent backoff
+ - CAPTCHA/block detection and avoidance
+ - User-agent rotation with fake_useragent
+ - Proxy support
+ """
+
+ import random
+ import time
+ from typing import Dict, Optional, Tuple, Any, Union
+
+ import curl_cffi.requests
+ from curl_cffi.requests.errors import RequestsError
+ from fake_useragent import UserAgent
+
+ # Default configuration (can be overridden by user)
+ DEFAULT_CONFIG = {
+     'MAX_RETRIES': 3,
+     'REQUEST_TIMEOUT': 25,
+     'DELAY_BETWEEN_REQUESTS': (2, 5),
+     'DEFAULT_IMPERSONATE': 'chrome119'  # part of curl_cffi's impersonation
+ }
+
+ # Default header template
+ DEFAULT_HEADERS = {
+     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+     'Accept-Language': 'en-US,en;q=0.9',
+     'Upgrade-Insecure-Requests': '1',
+     'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
+     'Sec-Ch-Ua-Mobile': '?0',
+     'Sec-Ch-Ua-Platform': '"Windows"',
+     'Sec-Fetch-Dest': 'document',
+     'Sec-Fetch-Mode': 'navigate',
+     'Sec-Fetch-Site': 'none',
+     'Sec-Fetch-User': '?1',
+ }
+
+
+ class AmzSession:
+     """
+     Enhanced session manager using curl_cffi for Amazon requests.
+
+     This class implements sophisticated request handling including:
+     - Browser fingerprint spoofing (via curl_cffi impersonation)
+     - Randomized user agents (via fake_useragent)
+     - CAPTCHA/anti-bot detection and avoidance
+     - Intelligent retry logic with exponential backoff
+     - Proxy support for IP rotation
+
+     Attributes:
+         country_code (str): Amazon domain country code (e.g., "com", "in", "co.uk")
+         base_url (str): Constructed base URL for the Amazon domain
+         session (curl_cffi.requests.Session): The curl_cffi session instance
+         config (dict): Configuration parameters for request behavior
+         ua_generator (UserAgent): User agent generator for browser fingerprinting
+     """
+
+     def __init__(self, country_code: str = "com",
+                  impersonate: str = None,
+                  proxies: Optional[Dict] = None,
+                  config: Optional[Dict] = None):
+         """
+         Initialize the Amazon session manager.
+
+         Args:
+             country_code (str): Amazon domain country code (e.g. "com", "in")
+             impersonate (str, optional): Browser to impersonate (e.g. "chrome119")
+             proxies (Dict, optional): Proxy configuration for requests
+             config (Dict, optional): Override default configuration parameters
+         """
+         # Initialize country and base URL
+         self.country_code = country_code
+         self.base_url = f"https://www.amazon.{self.country_code}/"
+
+         # Set up configuration (with user overrides if provided)
+         self.config = DEFAULT_CONFIG.copy()
+         if config:
+             self.config.update(config)
+
+         # Initialize fake_useragent with common browser and OS combinations
+         self.ua_generator = UserAgent(browsers=['Chrome', 'Edge', 'Safari'],
+                                       os=['Windows', 'MacOS', 'Linux'])
+
+         # Create curl_cffi session
+         self.session = curl_cffi.requests.Session()
+
+         # Set up headers with randomized user agent
+         headers = DEFAULT_HEADERS.copy()
+         headers['User-Agent'] = self.ua_generator.random
+         self.session.headers = headers
+
+         # Set browser impersonation if provided, otherwise use default
+         self.session.impersonate = impersonate or self.config['DEFAULT_IMPERSONATE']
+
+         # Configure proxies if provided
+         if proxies:
+             self.session.proxies = proxies
+
+         # Print session initialization info
+         print(f"AmzSession initialized for amazon.{country_code}")
+         print(f"Impersonating: {self.session.impersonate}")
+         print(f"User-Agent: {headers['User-Agent'][:50]}...")
+         if proxies:
+             print(f"Using proxies: {proxies}")
+
+     def get(self, url: str, headers: Optional[Dict] = None) -> Optional[curl_cffi.requests.Response]:
+         """
+         Perform a GET request using the curl_cffi session with smart retries.
+
+         This method intelligently handles:
+         - URL normalization (relative -> absolute)
+         - Header merging
+         - Random delays between requests
+         - CAPTCHA and anti-bot detection
+         - Automatic retries with exponential backoff
+         - Error handling for network issues
+
+         Args:
+             url (str): URL to fetch (absolute or relative to base_url)
+             headers (Dict, optional): Additional headers to merge with defaults
+
+         Returns:
+             Optional[curl_cffi.requests.Response]: Response object or None if all retries failed
+         """
+         # Normalize URL (handle both absolute and relative URLs)
+         if not url.startswith("http"):
+             if url.startswith("/"):
+                 url = f"{self.base_url.rstrip('/')}{url}"
+             else:
+                 url = f"{self.base_url}{url}"
+
+         # Merge headers with fresh random user agent for each request
+         merged_headers = self.session.headers.copy()
+         merged_headers['User-Agent'] = self.ua_generator.random
+         if headers:
+             merged_headers.update(headers)
+
+         # Extract configuration for use in the retry loop
+         max_retries = self.config['MAX_RETRIES']
+         timeout = self.config['REQUEST_TIMEOUT']
+         delay_range = self.config['DELAY_BETWEEN_REQUESTS']
+
+         # Retry loop with increasing backoff
+         for attempt in range(max_retries + 1):
+             try:
+                 # Calculate delay with some randomization (increases with each attempt)
+                 delay_factor = 1 + (attempt * 0.5)  # Backoff factor (grows linearly per attempt)
+                 min_delay, max_delay = delay_range
+                 delay = random.uniform(min_delay * delay_factor, max_delay * delay_factor)
+
+                 # Log attempt information
+                 print(f"Request attempt {attempt+1}/{max_retries+1}: GET {url} (delay: {delay:.2f}s)")
+                 time.sleep(delay)
+
+                 # Make the actual request using curl_cffi
+                 response = self.session.get(
+                     url,
+                     headers=merged_headers,
+                     timeout=timeout,
+                     allow_redirects=True
+                 )
+
+                 # Handle HTTP error codes
+                 if response.status_code != 200:
+                     print(f"Non-200 status code: {response.status_code}")
+
+                     # Handle server errors specifically (5xx)
+                     if 500 <= response.status_code < 600 and attempt < max_retries:
+                         print(f"Server error {response.status_code}, retrying...")
+                         continue
+
+                     # For other status codes, continue but warn
+                     print(f"Warning: Received HTTP {response.status_code} for {url}")
+
+                 # Check for CAPTCHA/blocking patterns in the content
+                 if "captcha" in response.text.lower() or "api-services-support@amazon.com" in response.text:
+                     print("CAPTCHA or anti-bot measure detected in response")
+
+                     if attempt < max_retries:
+                         # Apply a longer delay before the next retry for anti-bot
+                         captcha_delay = delay * 3
+                         print(f"Detected anti-bot measure. Waiting {captcha_delay:.2f}s before retry")
+                         time.sleep(captcha_delay)
+                         continue
+
+                     print("Failed to bypass anti-bot measures after all retries")
+
+                 # If everything is good, return the response
+                 print(f"Request successful: {url} (Status: {response.status_code})")
+                 return response
+
+             except RequestsError as e:
+                 print(f"Network error on attempt {attempt+1}: {e}")
+                 if attempt == max_retries:
+                     print(f"Max retries reached. Network error: {e}")
+                     return None
+                 time.sleep(delay * 2)  # Longer delay after network error
+
+             except Exception as e:
+                 print(f"Unexpected error on attempt {attempt+1}: {e}")
+                 if attempt == max_retries:
+                     print(f"Max retries reached. Error: {e}")
+                     return None
+                 time.sleep(delay * 2)
+
+         return None
+
+     def update_config(self, **kwargs):
+         """
+         Update session configuration parameters.
+
+         Args:
+             **kwargs: Configuration key-value pairs to update
+         """
+         self.config.update(kwargs)
+         print(f"Updated session configuration: {kwargs}")
amzpy/utils.py CHANGED
@@ -35,4 +35,46 @@ def parse_amazon_url(url: str) -> Optional[Tuple[str, str]]:
      # Extract product ID
      product_id = match.group(2)

-     return base_url, product_id
+     return base_url, product_id
+
+ def format_canonical_url(url: str, asin: str, country_code: str = None) -> str:
+     """
+     Format a canonical Amazon product URL in the form amazon.{country}/dp/{asin}
+
+     Args:
+         url (str): Original Amazon URL
+         asin (str): ASIN of the product
+         country_code (str, optional): Country code (e.g., "com", "in")
+
+     Returns:
+         str: Canonical URL
+     """
+     if not asin:
+         return url  # Return original if no ASIN available
+
+     # If country_code is not provided, try to extract it from the original URL
+     if not country_code:
+         try:
+             parsed_url = urlparse(url)
+             domain_parts = parsed_url.netloc.split('.')
+             # Extract country code from domain (e.g., www.amazon.com -> com)
+             if len(domain_parts) >= 3 and 'amazon' in domain_parts:
+                 amazon_index = domain_parts.index('amazon')
+                 if amazon_index + 1 < len(domain_parts):
+                     country_code = domain_parts[amazon_index + 1]
+         except Exception:
+             country_code = "com"  # Default to .com if extraction fails
+
+     # Default to .com if still no country code
+     if not country_code:
+         country_code = "com"
+
+     # Create canonical URL
+     return f"https://www.amazon.{country_code}/dp/{asin}"
+
+ # Function to extract brand name from text
+ def extract_brand_name(text):
+     match = re.search(r'visit the (.+?) store', text, re.IGNORECASE)
+     if match:
+         return match.group(1).strip()
+     return None
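Both helpers are module-level additions, and since the hunk starts at line 35, the urlparse and re imports they rely on sit outside the visible context. Expected behaviour, traced from the code above (inputs illustrative):

    from amzpy.utils import format_canonical_url, extract_brand_name

    # Country code is inferred from the URL's domain when not passed explicitly...
    format_canonical_url("https://www.amazon.in/x/dp/B0D4J2QDVY?ref=abc", "B0D4J2QDVY")
    # -> 'https://www.amazon.in/dp/B0D4J2QDVY'

    # ...and falls back to "com" when it cannot be extracted
    format_canonical_url("https://example.com/item", "B0D4J2QDVY")
    # -> 'https://www.amazon.com/dp/B0D4J2QDVY'

    # Matches Amazon's "Visit the X Store" byline
    extract_brand_name("Visit the Sony Store")
    # -> 'Sony'

One caveat visible in the extraction logic: only a single domain label after "amazon" is kept, so a multi-part domain such as amazon.co.uk yields "co" and a canonical URL on amazon.co rather than amazon.co.uk.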