amzpy 0.1.1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- amzpy/__init__.py +2 -1
- amzpy/parser.py +552 -31
- amzpy/scraper.py +205 -34
- amzpy/session.py +222 -0
- amzpy/utils.py +43 -1
- amzpy-0.2.0.dist-info/METADATA +221 -0
- amzpy-0.2.0.dist-info/RECORD +11 -0
- {amzpy-0.1.1.dist-info → amzpy-0.2.0.dist-info}/WHEEL +1 -1
- amzpy/engine.py +0 -36
- amzpy-0.1.1.dist-info/METADATA +0 -93
- amzpy-0.1.1.dist-info/RECORD +0 -11
- {amzpy-0.1.1.dist-info → amzpy-0.2.0.dist-info}/top_level.txt +0 -0
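The headline change in 0.2.0 is the transport layer: the old `requests`-based `amzpy/engine.py` is deleted, a new `amzpy/session.py` appears, and METADATA gains a `curl_cffi>=0.5.7` dependency. curl_cffi impersonates a real browser's TLS/HTTP fingerprint rather than merely rotating User-Agent strings, which is what defeats most bot checks. As a rough illustration of that technique (a hedged sketch, not amzpy's actual `session.py`, whose code is not shown in this diff):

```python
# Minimal sketch of curl_cffi-style fetching with browser impersonation.
# The function name and structure are illustrative assumptions, not amzpy's
# API; only requests.get(..., impersonate=...) is curl_cffi's real interface.
from typing import Optional

from curl_cffi import requests


def fetch_html(url: str, impersonate: str = "chrome119") -> Optional[str]:
    """Fetch a page while presenting a genuine Chrome TLS fingerprint."""
    try:
        response = requests.get(url, impersonate=impersonate, timeout=25)
        response.raise_for_status()
        return response.text
    except Exception:
        # Mirror the old engine.py contract (shown below): None on any failure
        return None
```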
amzpy-0.2.0.dist-info/METADATA
ADDED
@@ -0,0 +1,221 @@
```
Metadata-Version: 2.4
Name: amzpy
Version: 0.2.0
Summary: A lightweight Amazon scraper library.
Home-page: https://github.com/theonlyanil/amzpy
Author: Anil Sardiwal
Author-email: theonlyanil@gmail.com
Keywords: amazon,scraper,web-scraping,product-data,e-commerce,curl_cffi,anti-bot
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
Requires-Python: >=3.6
Description-Content-Type: text/markdown
Requires-Dist: curl_cffi>=0.5.7
Requires-Dist: beautifulsoup4>=4.11.0
Requires-Dist: lxml>=4.9.0
Requires-Dist: fake_useragent>=1.1.1
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: keywords
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary
```

# AmzPy - Amazon Product Scraper [](https://pypi.org/project/amzpy/)

<a href="https://www.producthunt.com/posts/amzpy?embed=true&utm_source=badge-featured&utm_medium=badge&utm_souce=badge-amzpy" target="_blank"><img src="https://api.producthunt.com/widgets/embed-image/v1/featured.svg?post_id=812920&theme=neutral&t=1737654254074" alt="AmzPy - A lightweight Amazon product scraper library. | Product Hunt" style="width: 250px; height: 54px;" width="250" height="54" /></a>

AmzPy is a lightweight Python library for scraping product information from Amazon. It provides a simple interface to fetch product details such as title, price, currency, and image URL, while handling anti-bot measures automatically via curl_cffi.

## Features

- Easy-to-use API for scraping Amazon product data
- Supports multiple Amazon domains (.com, .in, .co.uk, etc.)
- Enhanced anti-bot protection using curl_cffi with browser impersonation
- Automatic retries on detection, with intelligent delay management
- Proxy support for distributing requests
- Dynamic configuration options
- Extracts color variants, discounts, delivery information, and more
- Clean, typed Python interface

## Installation

Install using pip:

```bash
pip install amzpy
```

## Basic Usage

### Fetching Product Details

```python
from amzpy import AmazonScraper

# Create scraper with default settings (amazon.com)
scraper = AmazonScraper()

# Fetch product details
url = "https://www.amazon.com/dp/B0D4J2QDVY"
product = scraper.get_product_details(url)

if product:
    print(f"Title: {product['title']}")
    print(f"Price: {product['currency']}{product['price']}")
    print(f"Brand: {product['brand']}")
    print(f"Rating: {product['rating']}")
    print(f"Image URL: {product['img_url']}")
```

### Searching for Products

```python
from amzpy import AmazonScraper

# Create scraper for a specific Amazon domain
scraper = AmazonScraper(country_code="in")

# Search by query - get up to 2 pages of results
products = scraper.search_products(query="wireless earbuds", max_pages=2)

# Display the results
for i, product in enumerate(products[:5], 1):
    print(f"{i}. {product['title']} - {product['currency']}{product['price']}")
```

## Advanced Usage

### Configuration Options

AmzPy offers flexible configuration options that can be set in several ways:

```python
# Method 1: Set at initialization
scraper = AmazonScraper(
    country_code="in",
    impersonate="chrome119",
    proxies={"http": "http://user:pass@proxy.example.com:8080"}
)

# Method 2: Using string-based configuration
scraper.config('MAX_RETRIES = 5, REQUEST_TIMEOUT = 30, DELAY_BETWEEN_REQUESTS = (3, 8)')

# Method 3: Using keyword arguments
scraper.config(MAX_RETRIES=4, DEFAULT_IMPERSONATE="safari15")
```

### Advanced Search Features

The search functionality can extract rich product data, including:

```python
# Search for products with 5 pages of results
products = scraper.search_products(query="men sneakers size 9", max_pages=5)

# Or search using a pre-constructed URL (e.g., filtered searches)
url = "https://www.amazon.in/s?i=shoes&rh=n%3A1983518031&s=popularity-rank"
products = scraper.search_products(search_url=url, max_pages=3)

# Access comprehensive product data
for product in products:
    # Basic information
    print(f"Title: {product.get('title')}")
    print(f"ASIN: {product.get('asin')}")
    print(f"URL: https://www.amazon.{scraper.country_code}/dp/{product.get('asin')}")
    print(f"Brand: {product.get('brand')}")
    print(f"Price: {product.get('currency')}{product.get('price')}")

    # Discount information
    if 'original_price' in product:
        print(f"Original Price: {product.get('currency')}{product.get('original_price')}")
        print(f"Discount: {product.get('discount_percent')}% off")

    # Ratings and reviews
    print(f"Rating: {product.get('rating')} / 5.0 ({product.get('reviews_count')} reviews)")

    # Color variants
    if 'color_variants' in product:
        print(f"Available in {len(product['color_variants'])} colors")
        for variant in product['color_variants']:
            print(f"  - {variant['name']}: https://www.amazon.{scraper.country_code}/dp/{variant['asin']}")

    # Additional information
    print(f"Prime Eligible: {'Yes' if product.get('prime') else 'No'}")
    if 'delivery_info' in product:
        print(f"Delivery: {product.get('delivery_info')}")
    if 'badge' in product:
        print(f"Badge: {product.get('badge')}")
```

### Working with Proxies

To distribute requests and avoid IP blocks, you can use proxies:

```python
# HTTP/HTTPS proxies
proxies = {
    "http": "http://user:pass@proxy.example.com:8080",
    "https": "http://user:pass@proxy.example.com:8080"
}

# SOCKS5 proxies
proxies = {
    "http": "socks5://user:pass@proxy.example.com:1080",
    "https": "socks5://user:pass@proxy.example.com:1080"
}

scraper = AmazonScraper(proxies=proxies)
```

### Browser Impersonation

AmzPy uses curl_cffi's browser impersonation to mimic real browser requests, significantly improving success rates when scraping Amazon:

```python
# Specify a particular browser to impersonate
scraper = AmazonScraper(impersonate="chrome119")   # Chrome 119
scraper = AmazonScraper(impersonate="safari15")    # Safari 15
scraper = AmazonScraper(impersonate="firefox115")  # Firefox 115
```

## Configuration Reference

These configuration parameters can be adjusted:

| Parameter | Default | Description |
|-----------|---------|-------------|
| MAX_RETRIES | 3 | Maximum number of retry attempts for failed requests |
| REQUEST_TIMEOUT | 25 | Request timeout in seconds |
| DELAY_BETWEEN_REQUESTS | (2, 5) | Random delay range between requests (min, max) in seconds |
| DEFAULT_IMPERSONATE | 'chrome119' | Default browser to impersonate |

## Requirements

- Python 3.6+
- curl_cffi (for enhanced anti-bot protection)
- beautifulsoup4
- lxml (for faster HTML parsing)
- fake_useragent

## License

This project is licensed under the MIT License - see the LICENSE file for details.

## Contributing

Contributions are welcome! Please read [CONTRIBUTING.md](CONTRIBUTING.md) for details on how to contribute to this project.
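The Configuration Reference above describes a jittered retry policy. Here is a minimal sketch of what MAX_RETRIES, REQUEST_TIMEOUT, and DELAY_BETWEEN_REQUESTS imply, built from the documented defaults; the loop itself is an assumption, not amzpy's internal code:

```python
# Sketch of the retry/delay policy the configuration table describes.
# Constants mirror the documented defaults; everything else is illustrative.
import random
import time
from typing import Optional

from curl_cffi import requests

MAX_RETRIES = 3                  # retry attempts for failed requests
REQUEST_TIMEOUT = 25             # per-request timeout, seconds
DELAY_BETWEEN_REQUESTS = (2, 5)  # random (min, max) pause, seconds


def get_with_retries(url: str, impersonate: str = "chrome119") -> Optional[str]:
    for _ in range(MAX_RETRIES):
        # Randomized pacing makes request timing look less bot-like
        time.sleep(random.uniform(*DELAY_BETWEEN_REQUESTS))
        try:
            response = requests.get(url, impersonate=impersonate,
                                    timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.text
        except Exception:
            continue  # blocked or timed out; try again after a fresh delay
    return None
```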
amzpy-0.2.0.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
```
amzpy/__init__.py,sha256=iAjHxasVnhFoKHX2SrUs35wPlMJeWjqvb3W7A2_U-0c,587
amzpy/parser.py,sha256=vZ9RcGlizMf5H-YkkEDEcLziCFr39vTFTLE_SAeE_xg,27953
amzpy/scraper.py,sha256=I_bR1WZVwu2IAUM1TyAaEutNiMoBqzhqMpuU0iny6ME,8736
amzpy/session.py,sha256=uv6JjXazu-5POVcwrQ57DaFC-VVOi9bVhq4iuz4inoo,9144
amzpy/utils.py,sha256=wKPRUk2lypkAjiPVRPhH-lpHqExs1AKP4YyEYQG2HMs,2658
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
tests/test_scraper.py,sha256=XVbRrjM-b8ihuPjx5XezEU5o5twJ7xSBwp1ez28OcKs,582
amzpy-0.2.0.dist-info/METADATA,sha256=KzfySY40xi_Mr-Jlbl7IaR3T7z9Wl1D6sU4LAyiK1qo,7637
amzpy-0.2.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
amzpy-0.2.0.dist-info/top_level.txt,sha256=Srr5VxRSsZN9fziW9RLJtXt4o0dXk-b64YMijWr4y4Y,12
amzpy-0.2.0.dist-info/RECORD,,
```
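Each RECORD row has the form `path,sha256=<digest>,size-in-bytes`, where the digest is the urlsafe-base64 encoding of the file's SHA-256 hash with the `=` padding stripped (the wheel / PEP 376 format). To spot-check a digest above against an extracted wheel, a small verifier (the helper name is ours, not part of any package):

```python
# Recompute a wheel RECORD hash entry: urlsafe base64 of the SHA-256
# digest, with trailing '=' padding removed (wheel / PEP 376 format).
import base64
import hashlib
from pathlib import Path


def record_hash(path: str) -> str:
    digest = hashlib.sha256(Path(path).read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")


# Example: for the 0.2.0 wheel, record_hash("amzpy/utils.py") should match
# the "wKPRUk2lypkAjiPVRPhH-lpHqExs1AKP4YyEYQG2HMs" entry listed above.
```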
amzpy/engine.py
DELETED
@@ -1,36 +0,0 @@
```python
import requests
from typing import Optional
from fake_useragent import UserAgent

class RequestEngine:
    """Handles all HTTP requests to Amazon with anti-bot measures"""

    def __init__(self):
        self.session = requests.Session()
        self.ua = UserAgent(browsers=['Edge', 'Chrome'])
        self.headers = {
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
        }

    def get(self, url: str) -> Optional[str]:
        """
        Make a GET request with anti-bot measures

        Args:
            url (str): URL to fetch

        Returns:
            str: HTML content if successful
            None: If request fails
        """
        try:
            # Update User-Agent for each request
            self.headers['User-Agent'] = self.ua.random
            response = self.session.get(url, headers=self.headers)
            response.raise_for_status()
            return response.text
        except Exception:
            return None
```
amzpy-0.1.1.dist-info/METADATA
DELETED
@@ -1,93 +0,0 @@
```
Metadata-Version: 2.2
Name: amzpy
Version: 0.1.1
Summary: A lightweight Amazon scraper library.
Home-page: https://github.com/theonlyanil/amzpy
Author: Anil Sardiwal
Author-email: theonlyanil@gmail.com
Keywords: amazon,scraper,web-scraping,product-data,e-commerce
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
Requires-Python: >=3.6
Description-Content-Type: text/markdown
Requires-Dist: requests
Requires-Dist: beautifulsoup4
Requires-Dist: fake-useragent
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: keywords
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary
```

# AmzPy - Amazon Product Scraper
[](https://pypi.org/project/amzpy/)

AmzPy is a lightweight Python library for scraping product information from Amazon. It provides a simple interface to fetch product details like title, price, currency, and image URL while handling anti-bot measures automatically.

## Features

- Easy-to-use API for scraping Amazon product data
- Supports multiple Amazon domains (.com, .in, .co.uk, etc.)
- Built-in anti-bot protection
- Automatic retries on detection
- Clean and typed Python interface

## Installation

Install using pip:
`pip install amzpy`

## Usage

Here's a basic example of how to use AmzPy:

```python
from amzpy import AmazonScraper

scraper = AmazonScraper()
product_details = scraper.get_product_details("https://www.amazon.com/dp/B0D4J2QDVY")
print(product_details)
```

This will output the product details including title, price, image URL, and currency.

Feel free to explore the codebase for more details and advanced features. Happy scraping!

Output:
```json
{
    "title": "Product Title",
    "price": "299",
    "currency": "$",
    "img_url": "https://..."
}
```

## Requirements

- Python 3.6+
- requests
- beautifulsoup4
- fake-useragent

## License

This project is licensed under the MIT License - see the LICENSE file for details.

## Contributing

Contributions are welcome! Please read [CONTRIBUTING.md](CONTRIBUTING.md) for details on how to contribute to this project.
amzpy-0.1.1.dist-info/RECORD
DELETED
@@ -1,11 +0,0 @@
```
amzpy/__init__.py,sha256=D_HAyk3vKeexDTiuLozKEHMPqpWDX_hecvXdGsRL9LE,535
amzpy/engine.py,sha256=5ZyzdaQnxEZorYZMxav-qJTPdnRHXFyimwDJvicjtbI,1171
amzpy/parser.py,sha256=UxveeNPFu6gvk29LxH3Mp7Lp2USBgGCItJu_aK5C8UI,2354
amzpy/scraper.py,sha256=Jp6l9IruNZMDo1SgZI-pvz2HTBMSKoEoWmybSEkDGSE,1710
amzpy/utils.py,sha256=iyrAnhDB_1Lnn49CVYEowFZ1QZhonl8U23ZltR_WyZ4,1132
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
tests/test_scraper.py,sha256=XVbRrjM-b8ihuPjx5XezEU5o5twJ7xSBwp1ez28OcKs,582
amzpy-0.1.1.dist-info/METADATA,sha256=i7o36TDU2j5_BMw9jUeC64zWhFoIzFGJMwyy3MLmxZ4,2712
amzpy-0.1.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
amzpy-0.1.1.dist-info/top_level.txt,sha256=Srr5VxRSsZN9fziW9RLJtXt4o0dXk-b64YMijWr4y4Y,12
amzpy-0.1.1.dist-info/RECORD,,
```
{amzpy-0.1.1.dist-info → amzpy-0.2.0.dist-info}/top_level.txt
File without changes