amzsc 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
amzsc-0.1.3/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 ryzanbui02
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
amzsc-0.1.3/PKG-INFO ADDED
@@ -0,0 +1,86 @@
1
+ Metadata-Version: 2.3
2
+ Name: amzsc
3
+ Version: 0.1.3
4
+ Summary: Amazon product description scraper
5
+ License: MIT License
6
+
7
+ Copyright (c) 2025 ryzanbui02
8
+
9
+ Permission is hereby granted, free of charge, to any person obtaining a copy
10
+ of this software and associated documentation files (the "Software"), to deal
11
+ in the Software without restriction, including without limitation the rights
12
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13
+ copies of the Software, and to permit persons to whom the Software is
14
+ furnished to do so, subject to the following conditions:
15
+
16
+ The above copyright notice and this permission notice shall be included in all
17
+ copies or substantial portions of the Software.
18
+
19
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25
+ SOFTWARE.
26
+ Classifier: License :: OSI Approved :: MIT License
27
+ Classifier: Programming Language :: Python
28
+ Classifier: Programming Language :: Python :: 3.10
29
+ Classifier: Programming Language :: Python :: 3.11
30
+ Classifier: Programming Language :: Python :: 3.12
31
+ Classifier: Programming Language :: Python :: 3.13
32
+ Classifier: Programming Language :: Python :: 3.14
33
+ Classifier: Programming Language :: Python :: 3 :: Only
34
+ Classifier: Topic :: Software Development :: Quality Assurance
35
+ Classifier: Topic :: Software Development :: Testing
36
+ Classifier: Topic :: Software Development :: Libraries
37
+ Requires-Dist: bs4>=0.0.2
38
+ Requires-Dist: fake-useragent>=2.2.0
39
+ Requires-Dist: requests>=2.32.5
40
+ Requires-Dist: selenium>=4.35.0
41
+ Requires-Python: >=3.10
42
+ Description-Content-Type: text/markdown
43
+
44
+ # amzsc
45
+
46
+ [![image](https://img.shields.io/pypi/v/amzsc.svg)](https://pypi.python.org/pypi/amzsc)
47
+ [![image](https://img.shields.io/pypi/l/amzsc.svg)](https://img.shields.io/pypi/l/amzsc.svg)
48
+ [![image](https://img.shields.io/pypi/pyversions/amzsc.svg)](https://img.shields.io/pypi/pyversions/amzsc.svg)
49
+ [![Actions status](https://github.com/ryzanbui02/amzsc/actions/workflows/test-and-release.yaml/badge.svg)](https://github.com/ryzanbui02/amzsc/actions)
50
+ [![codecov](https://codecov.io/gh/ryzanbui02/amzsc/branch/main/graph/badge.svg)](https://codecov.io/gh/ryzanbui02/amzsc)
51
+
52
+ `amzsc` is an Amazon product description scraper library that allows you to extract product details such as title, price, description, and reviews using ASINs.
53
+
54
+ ## Example Usage
55
+
56
+ ```python
57
+ from amzsc import AmazonScraper
58
+
59
+
60
+ def main():
61
+     # Initialize the AmazonScraper (no Amazon credentials are required)
62
+ scraper = AmazonScraper()
63
+ asins = ['B08N5WRWNW', 'B07XJ8C8F5'] # Example ASINs
64
+ results = scraper.scrape(asins=asins, marketplace="US") # DataFrame with scraped data
65
+ print(results)
66
+
67
+
68
+ if __name__ == "__main__":
69
+ main()
70
+ ```
71
+
72
+ ## Installation
73
+
74
+ ```bash
75
+ pip install amzsc
76
+ ```
77
+
78
+ ## Contribution
79
+
80
+ Contributions are welcome! Please open an issue or submit a pull request for any improvements or bug fixes.
81
+
82
+ ```bash
83
+ git clone https://github.com/ryzanbui02/amzsc.git
84
+ cd amzsc
85
+ uv sync
86
+ ```
amzsc-0.1.3/README.md ADDED
@@ -0,0 +1,43 @@
1
+ # amzsc
2
+
3
+ [![image](https://img.shields.io/pypi/v/amzsc.svg)](https://pypi.python.org/pypi/amzsc)
4
+ [![image](https://img.shields.io/pypi/l/amzsc.svg)](https://img.shields.io/pypi/l/amzsc.svg)
5
+ [![image](https://img.shields.io/pypi/pyversions/amzsc.svg)](https://img.shields.io/pypi/pyversions/amzsc.svg)
6
+ [![Actions status](https://github.com/ryzanbui02/amzsc/actions/workflows/test-and-release.yaml/badge.svg)](https://github.com/ryzanbui02/amzsc/actions)
7
+ [![codecov](https://codecov.io/gh/ryzanbui02/amzsc/branch/main/graph/badge.svg)](https://codecov.io/gh/ryzanbui02/amzsc)
8
+
9
+ `amzsc` is an Amazon product description scraper library that allows you to extract product details such as title, price, description, and reviews using ASINs.
10
+
11
+ ## Example Usage
12
+
13
+ ```python
14
+ from amzsc import AmazonScraper
15
+
16
+
17
+ def main():
18
+     # Initialize the AmazonScraper (no Amazon credentials are required)
19
+ scraper = AmazonScraper()
20
+ asins = ['B08N5WRWNW', 'B07XJ8C8F5'] # Example ASINs
21
+ results = scraper.scrape(asins=asins, marketplace="US") # DataFrame with scraped data
22
+ print(results)
23
+
24
+
25
+ if __name__ == "__main__":
26
+ main()
27
+ ```
28
+
29
+ ## Installation
30
+
31
+ ```bash
32
+ pip install amzsc
33
+ ```
34
+
35
+ ## Contribution
36
+
37
+ Contributions are welcome! Please open an issue or submit a pull request for any improvements or bug fixes.
38
+
39
+ ```bash
40
+ git clone https://github.com/ryzanbui02/amzsc.git
41
+ cd amzsc
42
+ uv sync
43
+ ```
@@ -0,0 +1,56 @@
1
+ [build-system]
2
+ build-backend = "uv_build"
3
+ requires = ["uv_build>=0.9.4,<0.10.0"]
4
+
5
+ [dependency-groups]
6
+ dev = [
7
+ "isort>=6.0.1",
8
+ "pre-commit>=4.3.0",
9
+ "pytest>=8.4.1",
10
+ "pytest-cov>=6.2.1",
11
+ "ruff>=0.12.10"
12
+ ]
13
+
14
+ [project]
15
+ classifiers = [
16
+ "License :: OSI Approved :: MIT License",
17
+ "Programming Language :: Python",
18
+ "Programming Language :: Python :: 3.10",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Programming Language :: Python :: 3.13",
22
+ "Programming Language :: Python :: 3.14",
23
+ "Programming Language :: Python :: 3 :: Only",
24
+ "Topic :: Software Development :: Quality Assurance",
25
+ "Topic :: Software Development :: Testing",
26
+ "Topic :: Software Development :: Libraries"
27
+ ]
28
+ dependencies = [
29
+ "bs4>=0.0.2",
30
+ "fake-useragent>=2.2.0",
31
+ "requests>=2.32.5",
32
+ "selenium>=4.35.0"
33
+ ]
34
+ description = "Amazon product description scraper"
35
+ license = {file = "LICENSE"}
36
+ name = "amzsc"
37
+ readme = "README.md"
38
+ requires-python = ">=3.10"
39
+ version = "0.1.3"
40
+
41
+ [tool.pytest.ini_options]
42
+ addopts = "--cov=amzsc --cov-fail-under=80 --cov-report=html --cov-report=xml --maxfail=1"
43
+ testpaths = "tests"
44
+
45
+ [tool.ruff]
46
+ line-length = 88
47
+
48
+ [tool.ruff.lint]
49
+ ignore = []
50
+ select = ["E", "F", "W", "I001"]
51
+
52
+ [tool.setuptools.packages.find]
53
+ where = ["src"]
54
+
55
+ [tool.uv.build-backend]
56
+ module-name = "amzsc"
@@ -0,0 +1,5 @@
1
+ from .scraper import AmazonScraper
2
+
3
+ __all__ = [
4
+ "AmazonScraper",
5
+ ]
@@ -0,0 +1,5 @@
1
+ from .error_handler import safe_method
2
+
3
+ __all__ = [
4
+ "safe_method",
5
+ ]
@@ -0,0 +1,15 @@
1
import functools
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def safe_method(func):
    """Decorator that converts any exception into a logged ``None`` return.

    Used across the scraper so that a missing page element (or any other
    failure) degrades to "no data" instead of aborting the whole run.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that returns ``func``'s result, or ``None`` if it raised.
    """

    # functools.wraps preserves __name__/__doc__ so logs and debuggers
    # report the wrapped function, not "wrapper".
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            # logger.exception records the full traceback; lazy %-args
            # avoid formatting when the handler filters the record out.
            logger.exception("Error in %s", func.__name__)
            return None

    return wrapper
@@ -0,0 +1,48 @@
1
+ from selenium.webdriver.common.by import By
2
+
3
+ from amzsc.handlers import safe_method
4
+ from amzsc.modules.driver.driver_manipulator import ChromeManipulator
5
+ from amzsc.utils import Constants, CustomTypes
6
+
7
+
8
class AmazonDriver(ChromeManipulator):
    """Driver wrapper with helpers for reading Amazon product-page sections.

    Every getter is wrapped in @safe_method, so a missing page section
    yields None instead of raising.
    """

    def __init__(self, driver: CustomTypes.DRIVER_TYPE) -> None:
        super().__init__(driver)

    @safe_method
    def get_product_overview(self) -> dict[str, str]:
        """Return field/value pairs from the product-overview block."""
        container = self.driver.find_element(By.ID, Constants.PRODUCT_OVERVIEW)
        entries = container.find_elements(By.XPATH, "./div")
        cell_pairs = (entry.find_elements(By.TAG_NAME, "div") for entry in entries)
        return {cells[0].text.strip(): cells[1].text.strip() for cells in cell_pairs}

    @safe_method
    def get_product_specs(self) -> dict[str, str]:
        """Return field/value pairs from the (possibly hidden) specs table."""
        table = self.driver.find_element(By.ID, Constants.PRODUCT_SPECS)
        # The specifications table may be collapsed; force it visible first.
        self.driver.execute_script(Constants.PRODUCT_SPECS_SCRIPT, table)
        specs: dict[str, str] = {}
        for row in table.find_elements(By.TAG_NAME, "tr"):
            if not row.text.strip():
                continue  # skip empty spacer rows
            header = row.find_element(By.TAG_NAME, "th").text.strip()
            specs[header] = row.find_element(By.TAG_NAME, "td").text.strip()
        return specs

    @safe_method
    def get_product_micro(self) -> dict[str, str]:
        """Return field/value pairs from the compact (micro) details table."""
        table = self.driver.find_element(By.CSS_SELECTOR, Constants.PRODUCT_MICRO)
        row_cells = (
            row.find_elements(By.TAG_NAME, "td")
            for row in table.find_elements(By.TAG_NAME, "tr")
        )
        return {cells[0].text: cells[1].text for cells in row_cells}
@@ -0,0 +1,66 @@
1
+ from selenium.webdriver import Chrome, ChromeOptions, Remote
2
+
3
+ from amzsc.utils import Constants
4
+
5
+
6
class ChromeDriverConfig:
    """Factory helpers for building configured Chrome/Remote drivers."""

    @staticmethod
    def get_options(**kwargs) -> ChromeOptions:
        """Build ChromeOptions from keyword settings.

        Recognized kwargs: user_agent, proxy, headless (default True),
        position ("x,y" string), download_dir.
        """
        options = ChromeOptions()
        if (user_agent := kwargs.get("user_agent")) is not None:
            options.add_argument(f"user-agent={user_agent}")
        if (proxy := kwargs.get("proxy")) is not None:
            options.add_argument(f"--proxy-server={proxy}")
        for flag in ("--disable-logging", "--disable-dev-shm-usage"):
            options.add_argument(flag)
        if kwargs.get("headless", True):
            options.add_argument("--headless")
        # Quiet, automation-hardened defaults applied unconditionally.
        for flag in (
            "--log-level=3",
            "--mute-audio",
            "--no-sandbox",
            "--disable-notifications",
            "--disable-infobars",
            "--disable-features=Translate",
            "--disable-popup-blocking",
            "--autoplay-policy-no-user-gesture-required",
            "--no-default-browser-check",
            "--force-dark-mode",
            "--force-show-cursor",
            "--disable-blink-features=AutomationControlled",
        ):
            options.add_argument(flag)
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option("useAutomationExtension", False)
        options.add_argument("--force-device-scale-factor=0.8")
        if (position := kwargs.get("position")) is not None:
            options.add_argument(f"--window-position={position}")
        if (download_dir := kwargs.get("download_dir")) is not None:
            options.add_experimental_option(
                "prefs",
                {
                    "download.default_directory": download_dir,
                    "download.prompt_for_download": False,
                    "directory_upgrade": True,
                    "safebrowsing.enabled": True,
                },
            )
        return options

    @staticmethod
    def get_chrome_driver(options: ChromeOptions) -> Chrome:
        """Create a local Chrome driver with the given options."""
        return Chrome(options=options)

    @staticmethod
    def get_remote_driver(options: ChromeOptions, remote_url: str) -> Remote:
        """Create a Selenium Grid driver pointed at *remote_url*."""
        return Remote(options=options, command_executor=remote_url)

    @staticmethod
    def get_driver_position(thread_id: int, thread_count: int = 10) -> str:
        """Return an "x,y" window position tiling threads across the monitor."""
        side = int(thread_count**0.5)
        row_count = max(4, side)  # at least 4 rows
        col_count = max(4, side + 1)  # at least 4 columns
        cell_height = Constants.MONITOR_HEIGHT // row_count
        cell_width = Constants.MONITOR_WIDTH // col_count
        row, col = divmod(thread_id, col_count)
        return f"{cell_width * col},{cell_height * row}"
@@ -0,0 +1,29 @@
1
+ from typing import Callable
2
+
3
+ from selenium.webdriver.remote.webdriver import WebDriver
4
+ from selenium.webdriver.remote.webelement import WebElement
5
+ from selenium.webdriver.support.ui import WebDriverWait
6
+
7
+ from amzsc.utils import CustomTypes
8
+
9
+ CONDITION_TYPE = Callable[[tuple[str, str]], Callable[[WebDriver], WebElement | bool]]
10
+
11
+
12
class ChromeManipulator:
    """Thin convenience wrapper around a Selenium Chrome/Remote WebDriver."""

    def __init__(self, driver: CustomTypes.DRIVER_TYPE) -> None:
        # The wrapped Selenium driver; all methods delegate to it.
        self.driver = driver

    def __str__(self) -> str:
        return "DriverManipulator"

    def get(self, url: str) -> None:
        """Navigate the browser to *url*."""
        self.driver.get(url)

    def refresh(self) -> None:
        """Reload the current page."""
        self.driver.refresh()

    def quit(self) -> None:
        """Close the browser and end the WebDriver session."""
        self.driver.quit()

    def wait(self, timeout: int = 10) -> WebDriverWait:
        """Return a WebDriverWait bound to this driver (default 10 s)."""
        return WebDriverWait(self.driver, timeout)
@@ -0,0 +1,5 @@
1
+ from .proxy import get_proxy
2
+
3
+ __all__ = [
4
+ "get_proxy",
5
+ ]
@@ -0,0 +1,15 @@
1
+ from amzsc.handlers import safe_method
2
+ from amzsc.modules.proxy.proxy_request import ProxyRequest
3
+
4
+
5
@safe_method
def get_proxy(api_key: str) -> str | None:
    """Fetch a working proxy, preferring a freshly rotated one.

    Falls back to the currently assigned proxy when rotation yields
    nothing. Raises ValueError (converted to None by @safe_method) when
    no live proxy can be obtained.
    """
    client = ProxyRequest(api_key=api_key)
    candidate = client.get_new_proxy() or client.get_current_proxy()
    if candidate and ProxyRequest.is_proxy_live(candidate):
        return candidate
    raise ValueError("No valid proxy available.")
@@ -0,0 +1,54 @@
1
+ import requests
2
+
3
+ from amzsc.handlers import safe_method
4
+ from amzsc.utils import Constants
5
+
6
+ RESPONSE_STATUS = {"ERROR": "error", "SUCCESS": "success"}
7
+
8
+
9
class ProxyRequest:
    """Small client for the proxy-rotation HTTP API."""

    def __init__(self, api_key: str) -> None:
        self.__api_key = api_key

    @property
    def api_key(self) -> str:
        """Access token sent with every API request (read-only)."""
        return self.__api_key

    def request(self, url: str) -> dict:
        """GET *url* with auth params and return the decoded JSON body."""
        response = requests.get(
            url,
            params={"access_token": self.api_key, "location": "", "provider": ""},
        )
        response.raise_for_status()
        return response.json()

    @safe_method
    def get_new_proxy(self) -> str | None:
        """Ask the API to rotate and return a fresh proxy, or None."""
        payload = self.request(Constants.GET_NEW_ENDPOINT)
        if payload is None or payload["status"] == RESPONSE_STATUS["ERROR"]:
            return None
        return payload["data"]["proxy"]

    @safe_method
    def get_current_proxy(self) -> str | None:
        """Return the proxy currently assigned to this key, or None."""
        payload = self.request(Constants.GET_CURRENT_ENDPOINT)
        return None if payload is None else payload["data"]["proxy"]

    @staticmethod
    def is_proxy_live(proxy: str | None = None) -> bool:
        """Return True when the checker endpoint reports the proxy as Live."""
        if proxy is None:
            return False
        try:
            response = requests.get(f"{Constants.PROXY_LIVE_URL}/{proxy}", timeout=10)
            response.raise_for_status()
            return response.json()["status"] == "Live"
        except Exception:
            # Any network error or unexpected payload counts as "not live".
            return False
@@ -0,0 +1,192 @@
1
+ import logging
2
+ from concurrent.futures import ThreadPoolExecutor
3
+ from pathlib import Path
4
+ from typing import Literal
5
+
6
+ from fake_useragent import UserAgent
7
+
8
+ from amzsc.modules.driver.driver_amazon import AmazonDriver
9
+ from amzsc.modules.driver.driver_config import ChromeDriverConfig
10
+ from amzsc.modules.proxy import get_proxy
11
+ from amzsc.utils.file_worker import write_to_json
12
+ from amzsc.utils.marketplace import get_zone
13
+
14
+ logger = logging.getLogger(__name__)
15
+ logger.setLevel(logging.DEBUG)
16
+
17
+
18
def scrape_one(client: AmazonDriver, marketplace: str, asin: str) -> dict[str, str]:
    """Scrape one product page and return its merged field/value dict.

    The result always contains 'asin' and 'marketplace'; section getters
    that return None (section absent on the page) are skipped.
    """
    record = {"asin": asin, "marketplace": marketplace}
    client.get(f"https://www.amazon.{get_zone(marketplace)}/dp/{asin}")

    for section in (
        client.get_product_overview(),
        client.get_product_specs(),
        client.get_product_micro(),
    ):
        if section:
            record = record | section

    return record
37
+
38
+
39
def scrape_all(
    marketplaces: list[str],
    asins: list[str],
    thread_id: int,
    thread_count: int = 10,
    proxy_key: str | None = None,
    headless: bool = True,
    is_remote: bool = False,
    remote_url: str | None = None,
    jsonl_output_path: Path | None = None,
) -> list[dict[str, str]]:
    """Scrape a batch of ASINs in a single browser session.

    Creates one Chrome (local or remote) instance, scrapes each
    (asin, marketplace) pair in order, optionally appending every row
    to a JSONL file, and always quits the browser on exit.

    Args:
        marketplaces: Marketplace codes, one per ASIN (same length as asins).
        asins: ASINs to scrape in this batch.
        thread_id: Worker index; used only to position the browser window.
        thread_count: Total worker count, for window tiling. Defaults to 10.
        proxy_key: Optional proxy-API key; when set, traffic goes via proxy.
        headless: Run Chrome headless when True. Defaults to True.
        is_remote: Use a Selenium Grid node when True. Defaults to False.
        remote_url: Grid URL; required when `is_remote` is True.
        jsonl_output_path: When set, append each scraped row to this file.

    Returns:
        The rows scraped before any failure (possibly an empty list).
    """
    client = None
    data: list[dict[str, str]] = []
    try:
        proxy = get_proxy(proxy_key) if proxy_key else None
        position = ChromeDriverConfig.get_driver_position(thread_id, thread_count)
        options = ChromeDriverConfig.get_options(
            proxy=proxy,
            position=position,
            user_agent=UserAgent().random,
            headless=headless,
        )
        if is_remote:
            # NOTE(review): remote_url may be None here; Remote() would then
            # fail and be logged below — confirm callers always supply it.
            driver = ChromeDriverConfig.get_remote_driver(options, remote_url)
        else:
            driver = ChromeDriverConfig.get_chrome_driver(options)
        client = AmazonDriver(driver)
        # strict=True surfaces a marketplaces/asins length mismatch
        # explicitly (the previous index-based loop raised IndexError).
        for asin, marketplace in zip(asins, marketplaces, strict=True):
            row = scrape_one(client, marketplace, asin)
            logger.info("Thread %d: ASIN %s from %s", thread_id, asin, marketplace)
            if jsonl_output_path:
                write_to_json(jsonl_output_path, row)
                logger.debug("Thread %d: ASIN %s to JSONL file", thread_id, asin)
            data.append(row)

    except Exception:
        # Best-effort batch: log with traceback and return what we have.
        logger.exception("Thread %d: batch aborted", thread_id)

    finally:
        if client is not None:
            client.quit()
    return data
83
+
84
+
85
class AmazonScraper:
    """High-level entry point: scrape Amazon product pages for many ASINs
    using a pool of threaded Selenium sessions."""

    def __init__(
        self,
        proxy_key: str | None = None,
        headless: bool = True,
        is_remote: bool = False,
        remote_url: str | None = None,
        batch_size: int = 10,
        thread_count: int = 10,
        jsonl_output_path: Path | None = None,
    ) -> None:
        """
        Initialize the AmazonScraper.

        Args:
            proxy_key: Proxy-service API key. When given, traffic is routed
                through a proxy to reduce the chance of Amazon blocking the
                host. Defaults to None.
            headless: If True, run the Selenium instances in headless mode.
                Defaults to True.
            is_remote: If True, use Selenium Grid for the instances.
                Defaults to False.
            remote_url: Selenium Grid remote URL. Only used when `is_remote`
                is True. Defaults to None.
            batch_size: Number of URLs processed by one browser instance
                before it is quit. Defaults to 10.
            thread_count: Number of threads used for scraping. Defaults to 10.
            jsonl_output_path: If given, append results as JSON Lines to this
                path. Defaults to None.

        Raises:
            ValueError: If `thread_count` is not a positive integer.
        """
        # Validate up front so a bad value can't leave a half-initialized
        # instance behind (the original assigned state before checking).
        if thread_count <= 0:
            raise ValueError("thread_count must be a positive integer")

        self.__proxy_key = proxy_key
        self.headless = headless
        self.is_remote = is_remote
        self.remote_url = remote_url
        self.batch_size = batch_size
        self.thread_count = thread_count

        # Set up output options
        self.jsonl_output_path = jsonl_output_path

        # Lazy %-style args: the message is only formatted if DEBUG is on.
        logger.debug(
            "Initializing AmazonScraper with thread_count=%d, batch_size=%d",
            self.thread_count,
            self.batch_size,
        )

    @property
    def proxy_key(self) -> str | None:
        """The proxy-service API key supplied at construction (read-only)."""
        return self.__proxy_key

    def scrape(
        self,
        asins: list[str],
        marketplaces: list[str] | None = None,
        marketplace: Literal["US", "UK", "DE", "FR", "ES", "IT"] | None = None,
    ) -> list[dict[str, str]]:
        """
        Scrape product data from Amazon for a list of ASINs.

        Args:
            asins: An array of ASINs to scrape.
            marketplaces: Corresponding array of marketplaces to ASINs.
                Has to be the same length as `asins`. Defaults to None.
            marketplace: Marketplace for all ASINs. Defaults to None.
                If `marketplaces` is not set, `marketplace` must be set.
                If `marketplace` is set, it will be used for all ASINs.

        Raises:
            ValueError: asins must not be an empty list.
            ValueError: marketplaces must be the same length as asins.

        Returns:
            A list of dictionaries, one per scraped ASIN.
        """
        if not asins:
            raise ValueError("asins must not be an empty list")
        if marketplace is not None and marketplaces is None:
            marketplaces = [marketplace] * len(asins)
        if marketplaces is None or len(marketplaces) != len(asins):
            raise ValueError("Invalid marketplaces array length")

        # One chunk per browser session: (marketplaces, asins) slice pairs.
        chunks = [
            (marketplaces[i : i + self.batch_size], asins[i : i + self.batch_size])
            for i in range(0, len(asins), self.batch_size)
        ]
        shared_args = [
            self.thread_count,
            self.proxy_key,
            self.headless,
            self.is_remote,
            self.remote_url,
            self.jsonl_output_path,
        ]
        with ThreadPoolExecutor(max_workers=self.thread_count) as executor:
            futures = [
                executor.submit(
                    scrape_all,
                    chunk_marketplaces,
                    chunk_asins,
                    # Bug fix: the original `i + 1 % self.thread_count`
                    # parsed as `i + (1 % tc)`, so thread ids grew without
                    # bound and window positions drifted off-screen.
                    (i + 1) % self.thread_count,
                    *shared_args,
                )
                for i, (chunk_marketplaces, chunk_asins) in enumerate(chunks)
            ]
            results: list[dict[str, str]] = []
            for future in futures:
                results.extend(future.result())

        return results
@@ -0,0 +1,7 @@
1
+ from .constants import Constants
2
+ from .custom_types import CustomTypes
3
+
4
+ __all__ = [
5
+ "Constants",
6
+ "CustomTypes",
7
+ ]
@@ -0,0 +1,16 @@
1
class Constants:
    """Project-wide constant values: endpoints, selectors, screen layout."""

    THREAD_MESSAGE = ""

    # Proxy-service endpoints.
    PROXY_LIVE_URL = "https://clonebysun.com/api/tienich/checkliveproxy"
    PROXY_BASE_URL = "http://proxy.shoplike.vn/Api"
    GET_NEW_ENDPOINT = f"{PROXY_BASE_URL}/getNewProxy"
    GET_CURRENT_ENDPOINT = f"{PROXY_BASE_URL}/getCurrentProxy"

    # Amazon product-page element locators.
    PRODUCT_OVERVIEW = "productOverview_hoc_view_div"
    PRODUCT_SPECS = "productSpecifications-content"
    PRODUCT_SPECS_SCRIPT = "arguments[0].style.display = 'block';"
    PRODUCT_MICRO = ".a-normal.a-spacing-micro"

    # Monitor resolution assumed when tiling browser windows.
    MONITOR_WIDTH = 1920
    MONITOR_HEIGHT = 1080

    LOGGING_LEVELS = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
@@ -0,0 +1,5 @@
1
+ from selenium.webdriver import Chrome, Remote
2
+
3
+
4
class CustomTypes:
    """Shared type aliases for the package."""

    # Accepts either a local Chrome driver or a Remote (Selenium Grid) one.
    DRIVER_TYPE = Chrome | Remote
@@ -0,0 +1,11 @@
1
import json
import threading
from pathlib import Path

# Serializes appends so concurrent scraper threads don't interleave lines.
lock = threading.Lock()


def write_to_json(path: Path, row: dict[str, str]) -> None:
    """Append *row* to *path* as a single JSON Lines record (thread-safe)."""
    line = json.dumps(row, ensure_ascii=False) + "\n"
    with lock, open(path, "a", encoding="utf-8") as handle:
        handle.write(line)
@@ -0,0 +1,14 @@
1
+ def get_zone(marketplace: str | None = None) -> str:
2
+ if marketplace is None:
3
+ marketplace = "de"
4
+ zone = {
5
+ "us": "com",
6
+ "usa": "com",
7
+ "uk": "co.uk",
8
+ "gb": "co.uk",
9
+ "de": "de",
10
+ "fr": "fr",
11
+ "es": "es",
12
+ "it": "it",
13
+ }
14
+ return zone.get(marketplace.lower(), "de")