amzpy-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- amzpy/__init__.py +21 -0
- amzpy/engine.py +36 -0
- amzpy/parser.py +59 -0
- amzpy/scraper.py +52 -0
- amzpy/utils.py +38 -0
- amzpy-0.1.0.dist-info/METADATA +92 -0
- amzpy-0.1.0.dist-info/RECORD +11 -0
- amzpy-0.1.0.dist-info/WHEEL +5 -0
- amzpy-0.1.0.dist-info/top_level.txt +2 -0
- tests/__init__.py +0 -0
- tests/test_scraper.py +16 -0
amzpy/__init__.py
ADDED
@@ -0,0 +1,21 @@
"""
AmzPy - Amazon Product Scraper
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

A lightweight Python library for scraping product information from Amazon.

Basic usage:
    >>> from amzpy import AmazonScraper
    >>> scraper = AmazonScraper()
    >>> details = scraper.get_product_details("https://www.amazon.com/dp/B0D4J2QDVY")
    >>> print(details)

:copyright: (c) 2025 by Anil Sardiwal.
:license: MIT, see LICENSE for more details.
"""

from .scraper import AmazonScraper

__version__ = "0.1.0"
__author__ = "Anil Sardiwal"
__license__ = "MIT"
amzpy/engine.py
ADDED
@@ -0,0 +1,36 @@
import requests
from typing import Optional
from fake_useragent import UserAgent

class RequestEngine:
    """Handles all HTTP requests to Amazon with anti-bot measures"""

    def __init__(self):
        self.session = requests.Session()
        self.ua = UserAgent(browsers=['Edge', 'Chrome'])
        self.headers = {
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
        }

    def get(self, url: str) -> Optional[str]:
        """
        Make a GET request with anti-bot measures

        Args:
            url (str): URL to fetch

        Returns:
            str: HTML content if successful
            None: If request fails
        """
        try:
            # Update User-Agent for each request
            self.headers['User-Agent'] = self.ua.random
            # A timeout guards against requests that hang indefinitely
            response = self.session.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            return response.text
        except Exception:
            return None
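For reference, `RequestEngine` can also be exercised on its own; a minimal sketch, assuming the package is installed and using a placeholder product URL:

```python
from amzpy.engine import RequestEngine

engine = RequestEngine()
# get() returns the page HTML as a string, or None on any failure
# (network error or non-2xx status); the caller decides what to do
html = engine.get("https://www.amazon.com/dp/B0D4J2QDVY")
if html is None:
    print("Request failed or was blocked")
```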
amzpy/parser.py
ADDED
@@ -0,0 +1,59 @@
from bs4 import BeautifulSoup
from typing import Dict, Optional
from .engine import RequestEngine

def parse_product_page(html_content: str, url: str = None, engine: RequestEngine = None, max_retries: int = 0) -> Optional[Dict]:
    """
    Parse Amazon product page HTML and extract relevant information

    Args:
        html_content (str): Raw HTML content of the product page
        url (str, optional): Product URL for retrying if needed
        engine (RequestEngine, optional): RequestEngine instance for retries
        max_retries (int): Number of remaining retry attempts

    Returns:
        Dict: Extracted product information (title, price, img_url, currency)
        None: If parsing fails after all retries
    """
    if not html_content:
        return None

    soup = BeautifulSoup(html_content, 'html.parser')

    try:
        # Get title
        title = soup.select_one('#productTitle')
        title = title.text.strip() if title else None

        # If title is None and we have retries left, try again
        if title is None and max_retries > 0 and url and engine:
            print(f"Retry attempt {max_retries} - Anti-bot measure detected")
            new_html = engine.get(url)
            return parse_product_page(new_html, url, engine, max_retries - 1)

        # Get price
        price_element = soup.select_one('.a-price-whole')
        price = price_element.text.strip().replace(',', '') if price_element else None

        # Get currency symbol
        currency_element = soup.select_one('.a-price-symbol')
        currency = currency_element.text.strip() if currency_element else None

        # Get main product image
        img_element = soup.select_one('#landingImage') or soup.select_one('#imgBlkFront')
        img_url = img_element.get('src') if img_element else None

        return {
            "title": title,
            "price": price,
            "img_url": img_url,
            "currency": currency
        }
    except Exception:
        # If we have retries left, try again
        if max_retries > 0 and url and engine:
            print(f"Retry attempt {max_retries} - Error occurred")
            new_html = engine.get(url)
            return parse_product_page(new_html, url, engine, max_retries - 1)
        return None
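For illustration, the retry path can be driven directly: when `#productTitle` is missing, the parser re-fetches the URL through the supplied engine (which rotates the User-Agent) and decrements `max_retries`. A minimal sketch, assuming the package is installed and using a placeholder URL:

```python
from amzpy.engine import RequestEngine
from amzpy.parser import parse_product_page

engine = RequestEngine()
url = "https://www.amazon.com/dp/B0D4J2QDVY"

# Initial fetch, then up to 2 re-fetches if parsing hits an
# anti-bot page (detected by a missing #productTitle element)
html = engine.get(url)
details = parse_product_page(html, url=url, engine=engine, max_retries=2)
print(details)
```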
amzpy/scraper.py
ADDED
@@ -0,0 +1,52 @@
"""
The main scraper module for the Amazon Product API.
"""

from typing import Dict, Optional
from .engine import RequestEngine
from .parser import parse_product_page
from .utils import parse_amazon_url

class AmazonScraper:
    """Main scraper class for Amazon product data"""

    def __init__(self, country_code: str = "com"):
        """Initialize the Amazon scraper"""
        self.base_url = f"https://www.amazon.{country_code}/"
        self.engine = RequestEngine()

    def get_product_details(self, url: str, max_retries: int = 3) -> Optional[Dict]:
        """
        Fetch details for a product using its Amazon URL

        Args:
            url (str): Amazon product URL
            max_retries (int): Maximum retry attempts if anti-bot measures detected

        Returns:
            Dict: Product details including title, price, img_url, and currency
            None: If URL is invalid or error occurs
        """
        parsed = parse_amazon_url(url)
        if not parsed:
            return None

        base_url, product_id = parsed

        # Construct product URL and get HTML
        product_url = f"{base_url}dp/{product_id}"
        html_content = self.engine.get(product_url)
        if not html_content:
            return None

        # Parse the product page and return the data
        return parse_product_page(html_content, product_url, self.engine, max_retries)

def main():
    scraper = AmazonScraper()
    url = "https://www.amazon.in/dp/B0D4J2QDVY"
    details = scraper.get_product_details(url, max_retries=5)
    print("Product details:", details)

if __name__ == "__main__":
    main()
amzpy/utils.py
ADDED
@@ -0,0 +1,38 @@
from typing import Tuple, Optional
import re

def parse_amazon_url(url: str) -> Optional[Tuple[str, str]]:
    """
    Parse Amazon product URL to extract base URL and product ID

    Args:
        url (str): Full Amazon product URL

    Returns:
        Tuple[str, str]: (base_url, product_id) if valid
        None: If URL is invalid

    Examples:
        >>> parse_amazon_url("https://www.amazon.com/dp/B0D4J2QDVY")
        ("https://www.amazon.com/", "B0D4J2QDVY")
        >>> parse_amazon_url("https://www.amazon.in/product-name/dp/B0D4J2QDVY/ref=...")
        ("https://www.amazon.in/", "B0D4J2QDVY")
    """
    # Clean up the URL
    url = url.strip()

    # Match Amazon domain and product ID
    pattern = r'https?://(?:www\.)?amazon\.([a-z.]+)(?:/[^/]+)*?/(?:dp|gp/product)/([A-Z0-9]{10})'
    match = re.search(pattern, url)

    if not match:
        return None

    # Extract domain and construct base URL
    domain = match.group(1)
    base_url = f"https://www.amazon.{domain}/"

    # Extract product ID
    product_id = match.group(2)

    return base_url, product_id
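The pattern accepts both `/dp/` and `/gp/product/` URL shapes and ignores extra path segments and trailing noise after the ASIN; a quick check, runnable as-is:

```python
from amzpy.utils import parse_amazon_url

# Both URL shapes resolve to the same (base_url, product_id) pair
print(parse_amazon_url("https://www.amazon.com/gp/product/B0D4J2QDVY"))
# ('https://www.amazon.com/', 'B0D4J2QDVY')
print(parse_amazon_url("https://www.amazon.co.uk/some-product/dp/B0D4J2QDVY/ref=xyz"))
# ('https://www.amazon.co.uk/', 'B0D4J2QDVY')
print(parse_amazon_url("https://example.com/not-amazon"))  # None
```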
amzpy-0.1.0.dist-info/METADATA
ADDED
@@ -0,0 +1,92 @@
Metadata-Version: 2.2
Name: amzpy
Version: 0.1.0
Summary: A lightweight Amazon scraper library.
Home-page: https://github.com/theonlyanil/amzpy
Author: Anil Sardiwal
Author-email: theonlyanil@gmail.com
Keywords: amazon,scraper,web-scraping,product-data,e-commerce
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Internet :: WWW/HTTP :: Dynamic Content
Requires-Python: >=3.6
Description-Content-Type: text/markdown
Requires-Dist: requests
Requires-Dist: beautifulsoup4
Requires-Dist: fake-useragent
Dynamic: author
Dynamic: author-email
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: keywords
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

# AmzPy - Amazon Product Scraper

AmzPy is a lightweight Python library for scraping product information from Amazon. It provides a simple interface to fetch product details like title, price, currency, and image URL while handling anti-bot measures automatically.

## Features

- Easy-to-use API for scraping Amazon product data
- Supports multiple Amazon domains (.com, .in, .co.uk, etc.)
- Built-in anti-bot protection
- Automatic retries on detection
- Clean and typed Python interface

## Installation

Install using pip:

`pip install amzpy`

## Usage

Here's a basic example of how to use AmzPy:

```python
from amzpy import AmazonScraper

scraper = AmazonScraper()
product_details = scraper.get_product_details("https://www.amazon.com/dp/B0D4J2QDVY")
print(product_details)
```

This will output the product details, including title, price, image URL, and currency:

```json
{
    "title": "Product Title",
    "price": "299",
    "currency": "$",
    "img_url": "https://..."
}
```
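Other storefronts work the same way; a sketch for amazon.in, using the scraper's `country_code` option and the same placeholder ASIN:

```python
from amzpy import AmazonScraper

# The target domain comes from the product URL itself;
# country_code sets the scraper's default base domain.
scraper = AmazonScraper(country_code="in")
details = scraper.get_product_details("https://www.amazon.in/dp/B0D4J2QDVY", max_retries=5)
print(details)
```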
Feel free to explore the codebase for more details and advanced features. Happy scraping!

## Requirements

- Python 3.6+
- requests
- beautifulsoup4
- fake-useragent

## License

This project is licensed under the MIT License - see the LICENSE file for details.

## Contributing

Contributions are welcome! Please read [CONTRIBUTING.md](CONTRIBUTING.md) for details on how to contribute to this project.
amzpy-0.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,11 @@
amzpy/__init__.py,sha256=D_HAyk3vKeexDTiuLozKEHMPqpWDX_hecvXdGsRL9LE,535
amzpy/engine.py,sha256=5ZyzdaQnxEZorYZMxav-qJTPdnRHXFyimwDJvicjtbI,1171
amzpy/parser.py,sha256=yLiJsFxPElTM7muvw7EicB6bjfv5d_uf23IX-LimWbA,2348
amzpy/scraper.py,sha256=FmuIKkllHau384nVp1AqnYVk6k8FOaz8nFbGHILo0d4,1692
amzpy/utils.py,sha256=iyrAnhDB_1Lnn49CVYEowFZ1QZhonl8U23ZltR_WyZ4,1132
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
tests/test_scraper.py,sha256=XVbRrjM-b8ihuPjx5XezEU5o5twJ7xSBwp1ez28OcKs,582
amzpy-0.1.0.dist-info/METADATA,sha256=B_jsC5mTALBpXwIMox5dNhEupQhcIogIhNOfy3UvOLI,2632
amzpy-0.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
amzpy-0.1.0.dist-info/top_level.txt,sha256=Srr5VxRSsZN9fziW9RLJtXt4o0dXk-b64YMijWr4y4Y,12
amzpy-0.1.0.dist-info/RECORD,,
tests/__init__.py
ADDED
File without changes
tests/test_scraper.py
ADDED
@@ -0,0 +1,16 @@
import unittest
from amzpy.scraper import AmazonScraper

class TestAmazonScraper(unittest.TestCase):
    def setUp(self):
        # AmazonScraper takes only a country_code; retries are passed per call
        self.scraper = AmazonScraper()

    def test_product_details(self):
        url = "https://www.amazon.in/dp/B09JK1PZ7N"
        result = self.scraper.get_product_details(url, max_retries=5)

        self.assertIsNotNone(result, "Product details should be returned.")
        self.assertIsNotNone(result["title"], "Title should be properly scraped.")
        self.assertIsNotNone(result["price"], "Price should be properly scraped.")

if __name__ == "__main__":
    unittest.main()