amzsc 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- amzsc-0.1.3/LICENSE +21 -0
- amzsc-0.1.3/PKG-INFO +86 -0
- amzsc-0.1.3/README.md +43 -0
- amzsc-0.1.3/pyproject.toml +56 -0
- amzsc-0.1.3/src/amzsc/__init__.py +5 -0
- amzsc-0.1.3/src/amzsc/handlers/__init__.py +5 -0
- amzsc-0.1.3/src/amzsc/handlers/error_handler.py +15 -0
- amzsc-0.1.3/src/amzsc/modules/driver/driver_amazon.py +48 -0
- amzsc-0.1.3/src/amzsc/modules/driver/driver_config.py +66 -0
- amzsc-0.1.3/src/amzsc/modules/driver/driver_manipulator.py +29 -0
- amzsc-0.1.3/src/amzsc/modules/proxy/__init__.py +5 -0
- amzsc-0.1.3/src/amzsc/modules/proxy/proxy.py +15 -0
- amzsc-0.1.3/src/amzsc/modules/proxy/proxy_request.py +54 -0
- amzsc-0.1.3/src/amzsc/scraper.py +192 -0
- amzsc-0.1.3/src/amzsc/utils/__init__.py +7 -0
- amzsc-0.1.3/src/amzsc/utils/constants.py +16 -0
- amzsc-0.1.3/src/amzsc/utils/custom_types.py +5 -0
- amzsc-0.1.3/src/amzsc/utils/file_worker.py +11 -0
- amzsc-0.1.3/src/amzsc/utils/marketplace.py +14 -0
amzsc-0.1.3/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 ryzanbui02
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
amzsc-0.1.3/PKG-INFO
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: amzsc
|
|
3
|
+
Version: 0.1.3
|
|
4
|
+
Summary: Amazon product description scraper
|
|
5
|
+
License: MIT License
|
|
6
|
+
|
|
7
|
+
Copyright (c) 2025 ryzanbui02
|
|
8
|
+
|
|
9
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
10
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
11
|
+
in the Software without restriction, including without limitation the rights
|
|
12
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
13
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
14
|
+
furnished to do so, subject to the following conditions:
|
|
15
|
+
|
|
16
|
+
The above copyright notice and this permission notice shall be included in all
|
|
17
|
+
copies or substantial portions of the Software.
|
|
18
|
+
|
|
19
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
20
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
21
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
22
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
23
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
24
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
25
|
+
SOFTWARE.
|
|
26
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
27
|
+
Classifier: Programming Language :: Python
|
|
28
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
29
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
30
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
31
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
32
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
33
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
34
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
35
|
+
Classifier: Topic :: Software Development :: Testing
|
|
36
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
37
|
+
Requires-Dist: bs4>=0.0.2
|
|
38
|
+
Requires-Dist: fake-useragent>=2.2.0
|
|
39
|
+
Requires-Dist: requests>=2.32.5
|
|
40
|
+
Requires-Dist: selenium>=4.35.0
|
|
41
|
+
Requires-Python: >=3.10
|
|
42
|
+
Description-Content-Type: text/markdown
|
|
43
|
+
|
|
44
|
+
# amzsc
|
|
45
|
+
|
|
46
|
+
[](https://pypi.python.org/pypi/amzsc)
|
|
47
|
+
[](https://img.shields.io/pypi/l/amzsc.svg)
|
|
48
|
+
[](https://img.shields.io/pypi/pyversions/amzsc.svg)
|
|
49
|
+
[](https://github.com/ryzanbui02/amzsc/actions)
|
|
50
|
+
[](https://codecov.io/gh/ryzanbui02/amzsc)
|
|
51
|
+
|
|
52
|
+
`amzsc` is an Amazon product description scraper library that allows you to extract product details such as title, price, description, and reviews using ASINs.
|
|
53
|
+
|
|
54
|
+
## Example Usage
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from amzsc import AmazonScraper
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def main():
|
|
61
|
+
# Initialize the scraper (no Amazon account credentials are required)
|
|
62
|
+
scraper = AmazonScraper()
|
|
63
|
+
asins = ['B08N5WRWNW', 'B07XJ8C8F5'] # Example ASINs
|
|
64
|
+
results = scraper.scrape(asins=asins, marketplace="US")  # list of dicts with scraped data
|
|
65
|
+
print(results)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
if __name__ == "__main__":
|
|
69
|
+
main()
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Installation
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
pip install amzsc
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## Contribution
|
|
79
|
+
|
|
80
|
+
Contributions are welcome! Please open an issue or submit a pull request for any improvements or bug fixes.
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
git clone https://github.com/ryzanbui02/amzsc.git
|
|
84
|
+
cd amzsc
|
|
85
|
+
uv sync
|
|
86
|
+
```
|
amzsc-0.1.3/README.md
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# amzsc
|
|
2
|
+
|
|
3
|
+
[](https://pypi.python.org/pypi/amzsc)
|
|
4
|
+
[](https://img.shields.io/pypi/l/amzsc.svg)
|
|
5
|
+
[](https://img.shields.io/pypi/pyversions/amzsc.svg)
|
|
6
|
+
[](https://github.com/ryzanbui02/amzsc/actions)
|
|
7
|
+
[](https://codecov.io/gh/ryzanbui02/amzsc)
|
|
8
|
+
|
|
9
|
+
`amzsc` is an Amazon product description scraper library that allows you to extract product details such as title, price, description, and reviews using ASINs.
|
|
10
|
+
|
|
11
|
+
## Example Usage
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from amzsc import AmazonScraper
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def main():
|
|
18
|
+
# Initialize the scraper (no Amazon account credentials are required)
|
|
19
|
+
scraper = AmazonScraper()
|
|
20
|
+
asins = ['B08N5WRWNW', 'B07XJ8C8F5'] # Example ASINs
|
|
21
|
+
results = scraper.scrape(asins=asins, marketplace="US")  # list of dicts with scraped data
|
|
22
|
+
print(results)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
if __name__ == "__main__":
|
|
26
|
+
main()
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install amzsc
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Contribution
|
|
36
|
+
|
|
37
|
+
Contributions are welcome! Please open an issue or submit a pull request for any improvements or bug fixes.
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
git clone https://github.com/ryzanbui02/amzsc.git
|
|
41
|
+
cd amzsc
|
|
42
|
+
uv sync
|
|
43
|
+
```
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
build-backend = "uv_build"
|
|
3
|
+
requires = ["uv_build>=0.9.4,<0.10.0"]
|
|
4
|
+
|
|
5
|
+
[dependency-groups]
|
|
6
|
+
dev = [
|
|
7
|
+
"isort>=6.0.1",
|
|
8
|
+
"pre-commit>=4.3.0",
|
|
9
|
+
"pytest>=8.4.1",
|
|
10
|
+
"pytest-cov>=6.2.1",
|
|
11
|
+
"ruff>=0.12.10"
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
[project]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Programming Language :: Python",
|
|
18
|
+
"Programming Language :: Python :: 3.10",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Programming Language :: Python :: 3.13",
|
|
22
|
+
"Programming Language :: Python :: 3.14",
|
|
23
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
24
|
+
"Topic :: Software Development :: Quality Assurance",
|
|
25
|
+
"Topic :: Software Development :: Testing",
|
|
26
|
+
"Topic :: Software Development :: Libraries"
|
|
27
|
+
]
|
|
28
|
+
dependencies = [
|
|
29
|
+
"bs4>=0.0.2",
|
|
30
|
+
"fake-useragent>=2.2.0",
|
|
31
|
+
"requests>=2.32.5",
|
|
32
|
+
"selenium>=4.35.0"
|
|
33
|
+
]
|
|
34
|
+
description = "Amazon product description scraper"
|
|
35
|
+
license = {file = "LICENSE"}
|
|
36
|
+
name = "amzsc"
|
|
37
|
+
readme = "README.md"
|
|
38
|
+
requires-python = ">=3.10"
|
|
39
|
+
version = "0.1.3"
|
|
40
|
+
|
|
41
|
+
[tool.pytest.ini_options]
|
|
42
|
+
addopts = "--cov=amzsc --cov-fail-under=80 --cov-report=html --cov-report=xml --maxfail=1"
|
|
43
|
+
testpaths = "tests"
|
|
44
|
+
|
|
45
|
+
[tool.ruff]
|
|
46
|
+
line-length = 88
|
|
47
|
+
|
|
48
|
+
[tool.ruff.lint]
|
|
49
|
+
ignore = []
|
|
50
|
+
select = ["E", "F", "W", "I001"]
|
|
51
|
+
|
|
52
|
+
[tool.setuptools.packages.find]
|
|
53
|
+
where = ["src"]
|
|
54
|
+
|
|
55
|
+
[tool.uv.build-backend]
|
|
56
|
+
module-name = "amzsc"
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import logging
from functools import wraps

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def safe_method(func):
    """Decorator that turns any exception raised by *func* into a ``None`` return.

    The exception is logged at ERROR level; callers must therefore be
    prepared to receive ``None`` instead of the normal result.
    """

    # functools.wraps preserves func's __name__/__doc__ so logging and
    # introspection report the real function, not "wrapper".
    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # Lazy %-style args: the message is only formatted when emitted.
            logger.error("Error in %s: %s", func.__name__, e)
            return None

    return wrapper
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from selenium.webdriver.common.by import By
|
|
2
|
+
|
|
3
|
+
from amzsc.handlers import safe_method
|
|
4
|
+
from amzsc.modules.driver.driver_manipulator import ChromeManipulator
|
|
5
|
+
from amzsc.utils import Constants, CustomTypes
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class AmazonDriver(ChromeManipulator):
    """Driver wrapper that extracts product-detail sections from an Amazon page.

    Every getter is wrapped in ``safe_method``, so a missing section yields
    ``None`` instead of raising.
    """

    def __init__(self, driver: CustomTypes.DRIVER_TYPE) -> None:
        super().__init__(driver)

    @safe_method
    def get_product_overview(self) -> dict[str, str]:
        """Scrape the product-overview panel as a ``{field: value}`` mapping."""
        container = self.driver.find_element(By.ID, Constants.PRODUCT_OVERVIEW)
        result: dict[str, str] = {}
        for entry in container.find_elements(By.XPATH, "./div"):
            cells = entry.find_elements(By.TAG_NAME, "div")
            result[cells[0].text.strip()] = cells[1].text.strip()
        return result

    @safe_method
    def get_product_specs(self) -> dict[str, str]:
        """Scrape the specifications table as a ``{field: value}`` mapping."""
        table = self.driver.find_element(By.ID, Constants.PRODUCT_SPECS)
        # The table may be hidden; force it visible before reading its rows.
        self.driver.execute_script(Constants.PRODUCT_SPECS_SCRIPT, table)
        result: dict[str, str] = {}
        for tr in table.find_elements(By.TAG_NAME, "tr"):
            if not tr.text.strip():
                continue
            key = tr.find_element(By.TAG_NAME, "th").text.strip()
            result[key] = tr.find_element(By.TAG_NAME, "td").text.strip()
        return result

    @safe_method
    def get_product_micro(self) -> dict[str, str]:
        """Scrape the compact (micro) detail table as a ``{field: value}`` mapping."""
        table = self.driver.find_element(By.CSS_SELECTOR, Constants.PRODUCT_MICRO)
        result: dict[str, str] = {}
        for tr in table.find_elements(By.TAG_NAME, "tr"):
            tds = tr.find_elements(By.TAG_NAME, "td")
            result[tds[0].text] = tds[1].text
        return result
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from selenium.webdriver import Chrome, ChromeOptions, Remote
|
|
2
|
+
|
|
3
|
+
from amzsc.utils import Constants
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ChromeDriverConfig:
    """Factory helpers for building configured Chrome / Remote WebDriver instances."""

    @staticmethod
    def get_options(**kwargs) -> ChromeOptions:
        """Build ChromeOptions from keyword settings.

        Recognized keys: ``user_agent``, ``proxy``, ``headless`` (default True),
        ``position`` ("x,y" window position), ``download_dir``.
        """
        opts = ChromeOptions()
        user_agent = kwargs.get("user_agent")
        if user_agent is not None:
            opts.add_argument(f"user-agent={user_agent}")
        proxy = kwargs.get("proxy")
        if proxy is not None:
            opts.add_argument(f"--proxy-server={proxy}")
        opts.add_argument("--disable-logging")
        opts.add_argument("--disable-dev-shm-usage")
        if kwargs.get("headless", True):
            opts.add_argument("--headless")
        # Flags applied unconditionally, in the same order as before.
        for flag in (
            "--log-level=3",
            "--mute-audio",
            "--no-sandbox",
            "--disable-notifications",
            "--disable-infobars",
            "--disable-features=Translate",
            "--disable-popup-blocking",
            "--autoplay-policy-no-user-gesture-required",
            "--no-default-browser-check",
            "--force-dark-mode",
            "--force-show-cursor",
            "--disable-blink-features=AutomationControlled",
        ):
            opts.add_argument(flag)
        # Reduce the obvious automation fingerprints.
        opts.add_experimental_option("excludeSwitches", ["enable-automation"])
        opts.add_experimental_option("useAutomationExtension", False)
        opts.add_argument("--force-device-scale-factor=0.8")
        position = kwargs.get("position")
        if position is not None:
            opts.add_argument(f"--window-position={position}")
        download_dir = kwargs.get("download_dir")
        if download_dir is not None:
            opts.add_experimental_option(
                "prefs",
                {
                    "download.default_directory": download_dir,
                    "download.prompt_for_download": False,
                    "directory_upgrade": True,
                    "safebrowsing.enabled": True,
                },
            )
        return opts

    @staticmethod
    def get_chrome_driver(options: ChromeOptions) -> Chrome:
        """Start a local Chrome instance with *options*."""
        return Chrome(options=options)

    @staticmethod
    def get_remote_driver(options: ChromeOptions, remote_url: str) -> Remote:
        """Connect to a Selenium Grid hub at *remote_url* with *options*."""
        return Remote(options=options, command_executor=remote_url)

    @staticmethod
    def get_driver_position(thread_id: int, thread_count: int = 10) -> str:
        """Tile browser windows on the monitor; return "x,y" for this thread.

        The screen is divided into a grid of at least 4x4 cells, grown with
        sqrt(thread_count), and windows are placed row-major by thread id.
        """
        base = int(thread_count**0.5)
        rows = max(4, base)
        cols = max(4, base + 1)
        cell_height = Constants.MONITOR_HEIGHT // rows
        cell_width = Constants.MONITOR_WIDTH // cols
        x = cell_width * (thread_id % cols)
        y = cell_height * (thread_id // cols)
        return f"{x},{y}"
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from typing import Callable
|
|
2
|
+
|
|
3
|
+
from selenium.webdriver.remote.webdriver import WebDriver
|
|
4
|
+
from selenium.webdriver.remote.webelement import WebElement
|
|
5
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
|
6
|
+
|
|
7
|
+
from amzsc.utils import CustomTypes
|
|
8
|
+
|
|
9
|
+
# Factory signature for expected-condition builders: takes a (By, selector)
# locator and returns a condition callable usable with WebDriverWait.until().
CONDITION_TYPE = Callable[[tuple[str, str]], Callable[[WebDriver], WebElement | bool]]


class ChromeManipulator:
    """Thin convenience wrapper around a Selenium WebDriver instance."""

    def __init__(self, driver: CustomTypes.DRIVER_TYPE) -> None:
        # Subclasses (e.g. AmazonDriver) read self.driver directly.
        self.driver = driver

    def __str__(self) -> str:
        return "DriverManipulator"

    def get(self, url: str) -> None:
        """Navigate the browser to *url*."""
        self.driver.get(url)

    def refresh(self) -> None:
        """Reload the current page."""
        self.driver.refresh()

    def quit(self) -> None:
        """Terminate the browser session and release the driver."""
        self.driver.quit()

    def wait(self, timeout: int = 10) -> WebDriverWait:
        """Return a WebDriverWait bound to this driver (default 10 s timeout)."""
        return WebDriverWait(self.driver, timeout)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from amzsc.handlers import safe_method
|
|
2
|
+
from amzsc.modules.proxy.proxy_request import ProxyRequest
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@safe_method
def get_proxy(api_key: str) -> str | None:
    """
    Fetch a live proxy via the ProxyRequest API.

    Tries to rotate to a new proxy first, falling back to the currently
    assigned one. Raises ``ValueError`` when neither yields a live proxy,
    which ``safe_method`` converts into a logged ``None`` return.
    """
    client = ProxyRequest(api_key=api_key)
    candidate = client.get_new_proxy()
    if not candidate:
        candidate = client.get_current_proxy()
    if candidate and ProxyRequest.is_proxy_live(candidate):
        return candidate
    raise ValueError("No valid proxy available.")
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
|
|
3
|
+
from amzsc.handlers import safe_method
|
|
4
|
+
from amzsc.utils import Constants
|
|
5
|
+
|
|
6
|
+
RESPONSE_STATUS = {"ERROR": "error", "SUCCESS": "success"}


class ProxyRequest:
    """Client for the proxy-rotation HTTP API."""

    # BUG FIX: requests.get without a timeout can block a worker thread
    # forever if the proxy API stalls. 10 s matches is_proxy_live below.
    _TIMEOUT = 10

    def __init__(self, api_key: str) -> None:
        self.__api_key = api_key

    @property
    def api_key(self) -> str:
        """The API access token sent with every request."""
        return self.__api_key

    def request(self, url: str) -> dict:
        """GET *url* with the access token; raise on HTTP error, return parsed JSON."""
        params = {"access_token": self.api_key, "location": "", "provider": ""}
        response = requests.get(url, params=params, timeout=self._TIMEOUT)
        response.raise_for_status()
        return response.json()

    @safe_method
    def get_new_proxy(self) -> str | None:
        """Rotate to a new proxy; return its address, or None on API error."""
        data = self.request(Constants.GET_NEW_ENDPOINT)
        # .get avoids a KeyError on malformed responses missing "status".
        if data is None or data.get("status") == RESPONSE_STATUS["ERROR"]:
            return None
        return data["data"]["proxy"]

    @safe_method
    def get_current_proxy(self) -> str | None:
        """Return the proxy currently assigned to this key, or None."""
        data = self.request(Constants.GET_CURRENT_ENDPOINT)
        if data is None:
            return None
        return data["data"]["proxy"]

    @staticmethod
    def is_proxy_live(proxy: str | None = None) -> bool:
        """Check *proxy* against the external liveness service."""
        if proxy is None:
            return False

        try:
            response = requests.get(f"{Constants.PROXY_LIVE_URL}/{proxy}", timeout=10)
            response.raise_for_status()
            data = response.json()
            return data.get("status") == "Live"
        except Exception:
            # Any network / HTTP / JSON failure means the proxy is unusable.
            return False
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Literal
|
|
5
|
+
|
|
6
|
+
from fake_useragent import UserAgent
|
|
7
|
+
|
|
8
|
+
from amzsc.modules.driver.driver_amazon import AmazonDriver
|
|
9
|
+
from amzsc.modules.driver.driver_config import ChromeDriverConfig
|
|
10
|
+
from amzsc.modules.proxy import get_proxy
|
|
11
|
+
from amzsc.utils.file_worker import write_to_json
|
|
12
|
+
from amzsc.utils.marketplace import get_zone
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
logger.setLevel(logging.DEBUG)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def scrape_one(client: AmazonDriver, marketplace: str, asin: str) -> dict[str, str]:
    """Load one product page and merge every available detail section.

    Always contains "asin" and "marketplace"; overview/specs/micro fields are
    merged in when the corresponding section could be scraped.
    """
    zone = get_zone(marketplace)
    client.get(f"https://www.amazon.{zone}/dp/{asin}")

    record: dict[str, str] = {"asin": asin, "marketplace": marketplace}
    # Each getter returns None (via safe_method) when its section is absent.
    for section in (
        client.get_product_overview(),
        client.get_product_specs(),
        client.get_product_micro(),
    ):
        if section:
            record |= section

    return record
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def scrape_all(
    marketplaces: list[str],
    asins: list[str],
    thread_id: int,
    thread_count: int = 10,
    proxy_key: str | None = None,
    headless: bool = True,
    is_remote: bool = False,
    remote_url: str | None = None,
    jsonl_output_path: Path | None = None,
) -> list[dict[str, str]]:
    """Worker routine: scrape a batch of ASINs inside a single browser session.

    Any failure aborts the remainder of the batch; rows collected so far are
    still returned and the browser is always closed.
    """
    client: AmazonDriver | None = None
    results: list[dict[str, str]] = []
    try:
        proxy = get_proxy(proxy_key) if proxy_key else None
        position = ChromeDriverConfig.get_driver_position(thread_id, thread_count)
        options = ChromeDriverConfig.get_options(
            proxy=proxy,
            position=position,
            user_agent=UserAgent().random,
            headless=headless,
        )
        if is_remote:
            driver = ChromeDriverConfig.get_remote_driver(options, remote_url)
        else:
            driver = ChromeDriverConfig.get_chrome_driver(options)
        client = AmazonDriver(driver)

        for index, asin in enumerate(asins):
            marketplace = marketplaces[index]
            row = scrape_one(client, marketplace, asin)
            logger.info("Thread %d: ASIN %s from %s", thread_id, asin, marketplace)
            if jsonl_output_path:
                write_to_json(jsonl_output_path, row)
                logger.debug("Thread %d: ASIN %s to JSONL file", thread_id, asin)
            results.append(row)

    except Exception as e:
        # Best-effort batch: log and fall through so partial data is returned.
        logger.error(str(e))

    finally:
        if client is not None:
            client.quit()
    return results
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class AmazonScraper:
|
|
86
|
+
def __init__(
|
|
87
|
+
self,
|
|
88
|
+
proxy_key: str | None = None,
|
|
89
|
+
headless: bool = True,
|
|
90
|
+
is_remote: bool = False,
|
|
91
|
+
remote_url: str | None = None,
|
|
92
|
+
batch_size: int = 10,
|
|
93
|
+
thread_count: int = 10,
|
|
94
|
+
jsonl_output_path: Path | None = None,
|
|
95
|
+
) -> None:
|
|
96
|
+
"""
|
|
97
|
+
Initialize the AmazonScraper.
|
|
98
|
+
|
|
99
|
+
Args:
|
|
100
|
+
proxy_key: If parsed, use proxy to prevent Amazon from block the host
|
|
101
|
+
computer. WWProxy API key. Defaults to None.
|
|
102
|
+
headless: If set to `True`, run the Selenium instances in headless mode.
|
|
103
|
+
Defaults to True.
|
|
104
|
+
is_remote: If set to `True`, use Selenium Grid for the instances.
|
|
105
|
+
Defaults to False.
|
|
106
|
+
remote_url: Selenium Grid remote URL. Required `is_remote` = True to
|
|
107
|
+
activate. Defaults to None.
|
|
108
|
+
batch_size: The number of URLs to be processed in one instance before
|
|
109
|
+
quitting it. Defaults to 10.
|
|
110
|
+
thread_count: The number of threads to use in the process. Defaults to 10.
|
|
111
|
+
jsonl_output_path: If parsed, append results in JSONL type in the parsed
|
|
112
|
+
path. Defaults to None.
|
|
113
|
+
|
|
114
|
+
Raises:
|
|
115
|
+
ValueError: If `thread_count` is not a positive integer.
|
|
116
|
+
"""
|
|
117
|
+
self.__proxy_key = proxy_key
|
|
118
|
+
self.headless = headless
|
|
119
|
+
self.is_remote = is_remote
|
|
120
|
+
self.remote_url = remote_url
|
|
121
|
+
self.batch_size = batch_size
|
|
122
|
+
self.thread_count = thread_count
|
|
123
|
+
if self.thread_count <= 0:
|
|
124
|
+
raise ValueError("thread_count must be a positive integer")
|
|
125
|
+
|
|
126
|
+
# Set up output options
|
|
127
|
+
self.jsonl_output_path = jsonl_output_path
|
|
128
|
+
|
|
129
|
+
logger.debug(
|
|
130
|
+
"Initializing AmazonScraper with thread_count=%d, batch_size=%d"
|
|
131
|
+
% (self.thread_count, self.batch_size),
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
@property
|
|
135
|
+
def proxy_key(self) -> str | None:
|
|
136
|
+
return self.__proxy_key
|
|
137
|
+
|
|
138
|
+
def scrape(
|
|
139
|
+
self,
|
|
140
|
+
asins: list[str],
|
|
141
|
+
marketplaces: list[str] | None = None,
|
|
142
|
+
marketplace: Literal["US", "UK", "DE", "FR", "ES", "IT"] | None = None,
|
|
143
|
+
) -> list[dict[str, str]]:
|
|
144
|
+
"""
|
|
145
|
+
Scrape product data from Amazon for a list of ASINs.
|
|
146
|
+
|
|
147
|
+
Args:
|
|
148
|
+
asins: An array of ASINs to scrape.
|
|
149
|
+
marketplaces: Corresponding array of marketplaces to ASINs. Defaults to None
|
|
150
|
+
Has to be the same length as `asins`.
|
|
151
|
+
marketplace: Marketplace for all ASINs. Defaults to None.
|
|
152
|
+
If `marketplaces` is not set, `marketplace` must be set.
|
|
153
|
+
If `marketplace` is set, it will be used for all ASINs.
|
|
154
|
+
|
|
155
|
+
Raises:
|
|
156
|
+
ValueError: asins must not be an empty list.
|
|
157
|
+
ValueError: marketplaces must be the same length as asins.
|
|
158
|
+
|
|
159
|
+
Returns:
|
|
160
|
+
A pandas DataFrame containing the scraped data.
|
|
161
|
+
"""
|
|
162
|
+
if len(asins) == 0:
|
|
163
|
+
raise ValueError("asins must not be an empty list")
|
|
164
|
+
if marketplace is not None and marketplaces is None:
|
|
165
|
+
marketplaces = [marketplace] * len(asins)
|
|
166
|
+
if marketplaces is None or len(marketplaces) != len(asins):
|
|
167
|
+
raise ValueError("Invalid marketplaces array length")
|
|
168
|
+
|
|
169
|
+
chunks = [
|
|
170
|
+
(marketplaces[i : i + self.batch_size], asins[i : i + self.batch_size])
|
|
171
|
+
for i in range(0, len(asins), self.batch_size)
|
|
172
|
+
]
|
|
173
|
+
args = [
|
|
174
|
+
self.thread_count,
|
|
175
|
+
self.proxy_key,
|
|
176
|
+
self.headless,
|
|
177
|
+
self.is_remote,
|
|
178
|
+
self.remote_url,
|
|
179
|
+
self.jsonl_output_path,
|
|
180
|
+
]
|
|
181
|
+
with ThreadPoolExecutor(max_workers=self.thread_count) as executor:
|
|
182
|
+
futures = [
|
|
183
|
+
executor.submit(
|
|
184
|
+
scrape_all, chunk[0], chunk[1], i + 1 % self.thread_count, *args
|
|
185
|
+
)
|
|
186
|
+
for i, chunk in enumerate(chunks)
|
|
187
|
+
]
|
|
188
|
+
results: list[dict[str, str]] = []
|
|
189
|
+
for future in futures:
|
|
190
|
+
results.extend(future.result())
|
|
191
|
+
|
|
192
|
+
return results
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
class Constants:
    """Project-wide constant values: endpoints, DOM selectors, screen layout."""

    THREAD_MESSAGE = ""
    # External service used to check whether a proxy is still alive.
    PROXY_LIVE_URL = "https://clonebysun.com/api/tienich/checkliveproxy"
    # Proxy-rotation API base URL and its endpoints (used by ProxyRequest).
    PROXY_BASE_URL = "http://proxy.shoplike.vn/Api"
    GET_NEW_ENDPOINT = PROXY_BASE_URL + "/getNewProxy"
    GET_CURRENT_ENDPOINT = PROXY_BASE_URL + "/getCurrentProxy"

    # DOM ids / CSS selectors for the Amazon product detail sections.
    PRODUCT_OVERVIEW = "productOverview_hoc_view_div"
    PRODUCT_SPECS = "productSpecifications-content"
    # JS snippet that un-hides the specifications table before scraping it.
    PRODUCT_SPECS_SCRIPT = "arguments[0].style.display = 'block';"
    PRODUCT_MICRO = ".a-normal.a-spacing-micro"

    # Assumed monitor resolution used to tile browser windows — TODO confirm
    # against the actual display environment.
    MONITOR_WIDTH = 1920
    MONITOR_HEIGHT = 1080

    LOGGING_LEVELS = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import threading
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
# Serializes appends so concurrent scraper threads never interleave lines.
lock = threading.Lock()


def write_to_json(path: Path, row: dict[str, str]) -> None:
    """Append *row* as a single JSON line (JSONL format) to *path*, thread-safely."""
    with lock, open(path, "a", encoding="utf-8") as handle:
        handle.write(json.dumps(row, ensure_ascii=False) + "\n")
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
def get_zone(marketplace: str | None = None) -> str:
|
|
2
|
+
if marketplace is None:
|
|
3
|
+
marketplace = "de"
|
|
4
|
+
zone = {
|
|
5
|
+
"us": "com",
|
|
6
|
+
"usa": "com",
|
|
7
|
+
"uk": "co.uk",
|
|
8
|
+
"gb": "co.uk",
|
|
9
|
+
"de": "de",
|
|
10
|
+
"fr": "fr",
|
|
11
|
+
"es": "es",
|
|
12
|
+
"it": "it",
|
|
13
|
+
}
|
|
14
|
+
return zone.get(marketplace.lower(), "de")
|