amzsc 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- amzsc/__init__.py +5 -0
- amzsc/handlers/__init__.py +5 -0
- amzsc/handlers/error_handler.py +15 -0
- amzsc/modules/driver/driver_amazon.py +48 -0
- amzsc/modules/driver/driver_config.py +66 -0
- amzsc/modules/driver/driver_manipulator.py +29 -0
- amzsc/modules/proxy/__init__.py +5 -0
- amzsc/modules/proxy/proxy.py +15 -0
- amzsc/modules/proxy/proxy_request.py +54 -0
- amzsc/scraper.py +192 -0
- amzsc/utils/__init__.py +7 -0
- amzsc/utils/constants.py +16 -0
- amzsc/utils/custom_types.py +5 -0
- amzsc/utils/file_worker.py +11 -0
- amzsc/utils/marketplace.py +14 -0
- amzsc-0.1.1.dist-info/METADATA +54 -0
- amzsc-0.1.1.dist-info/RECORD +18 -0
- amzsc-0.1.1.dist-info/WHEEL +4 -0
amzsc/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import functools
import logging

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)


def safe_method(func):
    """Decorator that turns any exception into a logged ``None`` return.

    Used across the scraper so that a single failing element lookup does
    not abort a whole scraping run.

    Args:
        func: The callable to wrap.

    Returns:
        A wrapper that returns ``func``'s result, or ``None`` after logging
        if ``func`` raised any exception.
    """

    # functools.wraps preserves func.__name__/__doc__ on the wrapper; without
    # it, the error log below would always report "wrapper" for decorated
    # methods inspected elsewhere.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # Lazy %-style args: message is only formatted if the record is emitted.
            logger.error("Error in %s: %s", func.__name__, e)
            return None

    return wrapper
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
from selenium.webdriver.common.by import By
|
|
2
|
+
|
|
3
|
+
from amzsc.handlers import safe_method
|
|
4
|
+
from amzsc.modules.driver.driver_manipulator import ChromeManipulator
|
|
5
|
+
from amzsc.utils import Constants, CustomTypes
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class AmazonDriver(ChromeManipulator):
    """Amazon product-page scraping helpers on top of ChromeManipulator.

    Every getter is wrapped in ``@safe_method``: on any scraping error it
    logs and returns ``None`` instead of raising.
    """

    def __init__(self, driver: CustomTypes.DRIVER_TYPE) -> None:
        super().__init__(driver)

    @safe_method
    def get_product_overview(self) -> dict[str, str]:
        """Return field -> value pairs from the product overview section.

        Reads the container identified by ``Constants.PRODUCT_OVERVIEW``;
        each direct child div is expected to hold a label div and a value div.
        """
        data: dict[str, str] = {}
        parent_div = self.driver.find_element(By.ID, Constants.PRODUCT_OVERVIEW)
        for child_div in parent_div.find_elements(By.XPATH, "./div"):
            divs = child_div.find_elements(By.TAG_NAME, "div")
            if len(divs) < 2:
                # Skip malformed rows instead of raising IndexError, which
                # @safe_method would turn into losing the whole section.
                continue
            data[divs[0].text.strip()] = divs[1].text.strip()
        return data

    @safe_method
    def get_product_specs(self) -> dict[str, str]:
        """Return field -> value pairs from the specifications table.

        The table may be hidden, so it is force-displayed via
        ``Constants.PRODUCT_SPECS_SCRIPT`` before its rows are read.
        """
        data: dict[str, str] = {}
        data_table = self.driver.find_element(By.ID, Constants.PRODUCT_SPECS)
        self.driver.execute_script(Constants.PRODUCT_SPECS_SCRIPT, data_table)
        for row in data_table.find_elements(By.TAG_NAME, "tr"):
            if row.text.strip() == "":
                continue  # blank separator rows carry no data
            field = row.find_element(By.TAG_NAME, "th").text.strip()
            value = row.find_element(By.TAG_NAME, "td").text.strip()
            data[field] = value
        return data

    @safe_method
    def get_product_micro(self) -> dict[str, str]:
        """Return field -> value pairs from the compact details table.

        Cell text is stripped for consistency with the other getters
        (the original left raw whitespace on this section only).
        """
        data: dict[str, str] = {}
        data_table = self.driver.find_element(By.CSS_SELECTOR, Constants.PRODUCT_MICRO)
        for row in data_table.find_elements(By.TAG_NAME, "tr"):
            cells = row.find_elements(By.TAG_NAME, "td")
            if len(cells) < 2:
                continue  # skip rows without a label/value pair
            data[cells[0].text.strip()] = cells[1].text.strip()
        return data
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from selenium.webdriver import Chrome, ChromeOptions, Remote
|
|
2
|
+
|
|
3
|
+
from amzsc.utils import Constants
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ChromeDriverConfig:
    """Factory helpers for building configured Chrome / Remote WebDriver instances."""

    # Flags applied to every session, in this fixed order.
    _COMMON_FLAGS = (
        "--log-level=3",
        "--mute-audio",
        "--no-sandbox",
        "--disable-notifications",
        "--disable-infobars",
        "--disable-features=Translate",
        "--disable-popup-blocking",
        "--autoplay-policy-no-user-gesture-required",
        "--no-default-browser-check",
        "--force-dark-mode",
        "--force-show-cursor",
        "--disable-blink-features=AutomationControlled",
    )

    @staticmethod
    def get_options(**kwargs) -> ChromeOptions:
        """Build a ChromeOptions object from optional keyword settings.

        Recognized kwargs: ``user_agent``, ``proxy``, ``headless`` (default
        True), ``position`` ("x,y" string), ``download_dir``.
        """
        opts = ChromeOptions()
        user_agent = kwargs.get("user_agent")
        if user_agent is not None:
            opts.add_argument(f"user-agent={user_agent}")
        proxy = kwargs.get("proxy")
        if proxy is not None:
            opts.add_argument(f"--proxy-server={proxy}")
        opts.add_argument("--disable-logging")
        opts.add_argument("--disable-dev-shm-usage")
        if kwargs.get("headless", True):
            opts.add_argument("--headless")
        for flag in ChromeDriverConfig._COMMON_FLAGS:
            opts.add_argument(flag)
        # Reduce automation fingerprinting surface.
        opts.add_experimental_option("excludeSwitches", ["enable-automation"])
        opts.add_experimental_option("useAutomationExtension", False)
        opts.add_argument("--force-device-scale-factor=0.8")
        position = kwargs.get("position")
        if position is not None:
            opts.add_argument(f"--window-position={position}")
        download_dir = kwargs.get("download_dir")
        if download_dir is not None:
            opts.add_experimental_option(
                "prefs",
                {
                    "download.default_directory": download_dir,
                    "download.prompt_for_download": False,
                    "directory_upgrade": True,
                    "safebrowsing.enabled": True,
                },
            )
        return opts

    @staticmethod
    def get_chrome_driver(options: ChromeOptions) -> Chrome:
        """Create a local Chrome driver with the given options."""
        return Chrome(options=options)

    @staticmethod
    def get_remote_driver(options: ChromeOptions, remote_url: str) -> Remote:
        """Create a Selenium Grid driver against ``remote_url``."""
        return Remote(options=options, command_executor=remote_url)

    @staticmethod
    def get_driver_position(thread_id: int, thread_count: int = 10) -> str:
        """Return an ``"x,y"`` window position tiling threads across the monitor.

        The grid is at least 4x4; larger thread counts grow it roughly with
        the square root of ``thread_count``.
        """
        base = int(thread_count**0.5)
        rows = max(4, base)
        cols = max(4, base + 1)
        cell_height = Constants.MONITOR_HEIGHT // rows
        cell_width = Constants.MONITOR_WIDTH // cols
        x = cell_width * (thread_id % cols)
        y = cell_height * (thread_id // cols)
        return f"{x},{y}"
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
from typing import Callable
|
|
2
|
+
|
|
3
|
+
from selenium.webdriver.remote.webdriver import WebDriver
|
|
4
|
+
from selenium.webdriver.remote.webelement import WebElement
|
|
5
|
+
from selenium.webdriver.support.ui import WebDriverWait
|
|
6
|
+
|
|
7
|
+
from amzsc.utils import CustomTypes
|
|
8
|
+
|
|
9
|
+
CONDITION_TYPE = Callable[[tuple[str, str]], Callable[[WebDriver], WebElement | bool]]
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ChromeManipulator:
    """Thin wrapper around a Selenium WebDriver exposing common actions.

    Site-specific drivers subclass this and inherit the wrapped ``driver``
    attribute plus the navigation helpers below.
    """

    def __init__(self, driver: CustomTypes.DRIVER_TYPE) -> None:
        # The wrapped Selenium driver; every method delegates to it.
        self.driver = driver

    def __str__(self) -> str:
        return "DriverManipulator"

    def get(self, url: str) -> None:
        """Navigate the browser to ``url``."""
        self.driver.get(url)

    def refresh(self) -> None:
        """Reload the current page."""
        self.driver.refresh()

    def quit(self) -> None:
        """Close the browser and end the WebDriver session."""
        self.driver.quit()

    def wait(self, timeout: int = 10) -> WebDriverWait:
        """Return a WebDriverWait bound to this driver (``timeout`` in seconds)."""
        return WebDriverWait(self.driver, timeout)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from amzsc.handlers import safe_method
|
|
2
|
+
from amzsc.modules.proxy.proxy_request import ProxyRequest
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
@safe_method
def get_proxy(api_key: str) -> str | None:
    """Fetch a working proxy address from the proxy service.

    Tries to obtain a fresh proxy first and falls back to the currently
    assigned one; the chosen proxy is then live-checked. Because of
    ``@safe_method``, failures surface as ``None`` rather than exceptions.
    """
    client = ProxyRequest(api_key=api_key)
    candidate = client.get_new_proxy() or client.get_current_proxy()
    if candidate and ProxyRequest.is_proxy_live(candidate):
        return candidate
    raise ValueError("No valid proxy available.")
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
|
|
3
|
+
from amzsc.handlers import safe_method
|
|
4
|
+
from amzsc.utils import Constants
|
|
5
|
+
|
|
6
|
+
RESPONSE_STATUS = {"ERROR": "error", "SUCCESS": "success"}

# Seconds before an unresponsive proxy API request is abandoned.
REQUEST_TIMEOUT = 10


class ProxyRequest:
    """HTTP client for the proxy-rotation API (see ``Constants.PROXY_BASE_URL``)."""

    def __init__(self, api_key: str) -> None:
        self.__api_key = api_key

    @property
    def api_key(self) -> str:
        """Access token sent with every API request."""
        return self.__api_key

    def request(self, url: str) -> dict:
        """GET ``url`` with the access token and return the decoded JSON body.

        Raises:
            requests.HTTPError: On a non-2xx response.
            requests.Timeout: If the API does not answer within REQUEST_TIMEOUT
                seconds (added so a dead endpoint cannot block a worker forever;
                ``is_proxy_live`` already used a timeout).
        """
        params = {"access_token": self.api_key, "location": "", "provider": ""}
        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        return response.json()

    @safe_method
    def get_new_proxy(self) -> str | None:
        """Request a fresh proxy; ``None`` on API error (or via @safe_method)."""
        data = self.request(Constants.GET_NEW_ENDPOINT)
        # Defensive None check kept from the original; request() itself raises
        # rather than returning None.
        if data is None or data["status"] == RESPONSE_STATUS["ERROR"]:
            return None
        return data["data"]["proxy"]

    @safe_method
    def get_current_proxy(self) -> str | None:
        """Return the proxy currently assigned to this key; ``None`` on error."""
        data = self.request(Constants.GET_CURRENT_ENDPOINT)
        if data is None:
            return None
        return data["data"]["proxy"]

    @staticmethod
    def is_proxy_live(proxy: str | None = None) -> bool:
        """Check ``proxy`` against the external live-check service.

        Returns False for ``None``, on any network error, or when the
        service does not report status "Live".
        """
        if proxy is None:
            return False

        try:
            response = requests.get(f"{Constants.PROXY_LIVE_URL}/{proxy}", timeout=10)
            response.raise_for_status()
            data = response.json()
            return data["status"] == "Live"
        except requests.RequestException:
            return False
        except Exception:
            # Malformed JSON / unexpected payload shape also counts as dead.
            return False
|
amzsc/scraper.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
import logging
|
|
2
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Literal
|
|
5
|
+
|
|
6
|
+
from fake_useragent import UserAgent
|
|
7
|
+
|
|
8
|
+
from amzsc.modules.driver.driver_amazon import AmazonDriver
|
|
9
|
+
from amzsc.modules.driver.driver_config import ChromeDriverConfig
|
|
10
|
+
from amzsc.modules.proxy import get_proxy
|
|
11
|
+
from amzsc.utils.file_worker import write_to_json
|
|
12
|
+
from amzsc.utils.marketplace import get_zone
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
logger.setLevel(logging.DEBUG)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def scrape_one(client: AmazonDriver, marketplace: str, asin: str) -> dict[str, str]:
    """Scrape a single product page and return its merged field dict.

    Always contains the ``asin`` and ``marketplace`` keys; overview, specs,
    and micro-table fields are merged in when each section is found
    (each getter returns None on failure and is then skipped).
    """
    record = {"asin": asin, "marketplace": marketplace}
    zone = get_zone(marketplace)
    client.get(f"https://www.amazon.{zone}/dp/{asin}")

    # Getters run in this order; later sections overwrite duplicate keys.
    sections = (
        client.get_product_overview(),
        client.get_product_specs(),
        client.get_product_micro(),
    )
    for section in sections:
        if section:
            record.update(section)

    return record
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def scrape_all(
    marketplaces: list[str],
    asins: list[str],
    thread_id: int,
    thread_count: int = 10,
    proxy_key: str | None = None,
    headless: bool = True,
    is_remote: bool = False,
    remote_url: str | None = None,
    jsonl_output_path: Path | None = None,
) -> list[dict[str, str]]:
    """Scrape a batch of ASINs inside a single browser session.

    Args:
        marketplaces: Marketplace code per ASIN; same length as ``asins``.
        asins: ASINs to scrape in this batch.
        thread_id: Worker index, used to tile browser windows on screen.
        thread_count: Total worker threads. Defaults to 10.
        proxy_key: WWProxy API key; when set, a proxy is fetched and used.
        headless: Run Chrome headless. Defaults to True.
        is_remote: Use a Selenium Grid driver instead of a local Chrome.
        remote_url: Selenium Grid URL; only used when ``is_remote`` is True.
        jsonl_output_path: When set, each row is also appended to this JSONL file.

    Returns:
        The rows scraped so far. Errors are logged, not raised, so a failing
        batch returns partial results; the driver is always quit.
    """
    client = None
    data: list[dict[str, str]] = []
    try:
        proxy = get_proxy(proxy_key) if proxy_key else None
        position = ChromeDriverConfig.get_driver_position(thread_id, thread_count)
        options = ChromeDriverConfig.get_options(
            proxy=proxy,
            position=position,
            user_agent=UserAgent().random,
            headless=headless,
        )
        if is_remote:
            driver = ChromeDriverConfig.get_remote_driver(options, remote_url)
        else:
            driver = ChromeDriverConfig.get_chrome_driver(options)
        client = AmazonDriver(driver)
        # zip pairs each ASIN with its marketplace (replaces the original
        # index-based range(len(...)) loop).
        for marketplace, asin in zip(marketplaces, asins):
            row = scrape_one(client, marketplace, asin)
            logger.info("Thread %d: ASIN %s from %s", thread_id, asin, marketplace)
            if jsonl_output_path:
                write_to_json(jsonl_output_path, row)
                logger.debug("Thread %d: ASIN %s to JSONL file", thread_id, asin)
            data.append(row)

    except Exception as e:
        # Best-effort by design: log and fall through to return partial data.
        logger.error(str(e))

    finally:
        if client is not None:
            client.quit()
    return data
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
class AmazonScraper:
    """High-level, multi-threaded Amazon product-description scraper."""

    def __init__(
        self,
        proxy_key: str | None = None,
        headless: bool = True,
        is_remote: bool = False,
        remote_url: str | None = None,
        batch_size: int = 10,
        thread_count: int = 10,
        jsonl_output_path: Path | None = None,
    ) -> None:
        """
        Initialize the AmazonScraper.

        Args:
            proxy_key: If given, use a proxy to prevent Amazon from blocking
                the host computer. WWProxy API key. Defaults to None.
            headless: If set to `True`, run the Selenium instances in headless
                mode. Defaults to True.
            is_remote: If set to `True`, use Selenium Grid for the instances.
                Defaults to False.
            remote_url: Selenium Grid remote URL. Requires `is_remote` = True
                to take effect. Defaults to None.
            batch_size: The number of URLs to be processed in one instance
                before quitting it. Defaults to 10.
            thread_count: The number of threads to use in the process.
                Defaults to 10.
            jsonl_output_path: If given, append results in JSONL format to the
                given path. Defaults to None.

        Raises:
            ValueError: If `thread_count` is not a positive integer.
        """
        self.__proxy_key = proxy_key
        self.headless = headless
        self.is_remote = is_remote
        self.remote_url = remote_url
        self.batch_size = batch_size
        self.thread_count = thread_count
        if self.thread_count <= 0:
            raise ValueError("thread_count must be a positive integer")

        # Set up output options
        self.jsonl_output_path = jsonl_output_path

        # Lazy %-args: formatting only happens when DEBUG records are emitted
        # (the original formatted eagerly with the % operator).
        logger.debug(
            "Initializing AmazonScraper with thread_count=%d, batch_size=%d",
            self.thread_count,
            self.batch_size,
        )

    @property
    def proxy_key(self) -> str | None:
        """WWProxy API key (read-only)."""
        return self.__proxy_key

    def scrape(
        self,
        asins: list[str],
        marketplaces: list[str] | None = None,
        marketplace: Literal["US", "UK", "DE", "FR", "ES", "IT"] | None = None,
    ) -> list[dict[str, str]]:
        """
        Scrape product data from Amazon for a list of ASINs.

        Args:
            asins: An array of ASINs to scrape.
            marketplaces: Corresponding array of marketplaces to ASINs.
                Defaults to None. Has to be the same length as `asins`.
            marketplace: Marketplace for all ASINs. Defaults to None.
                If `marketplaces` is not set, `marketplace` must be set.
                If `marketplace` is set, it will be used for all ASINs.

        Raises:
            ValueError: asins must not be an empty list.
            ValueError: marketplaces must be the same length as asins.

        Returns:
            A list of dicts (one per ASIN) containing the scraped data.
        """
        if len(asins) == 0:
            raise ValueError("asins must not be an empty list")
        if marketplace is not None and marketplaces is None:
            marketplaces = [marketplace] * len(asins)
        if marketplaces is None or len(marketplaces) != len(asins):
            raise ValueError("Invalid marketplaces array length")

        # Split the work into (marketplaces, asins) batches of batch_size.
        chunks = [
            (marketplaces[i : i + self.batch_size], asins[i : i + self.batch_size])
            for i in range(0, len(asins), self.batch_size)
        ]
        args = [
            self.thread_count,
            self.proxy_key,
            self.headless,
            self.is_remote,
            self.remote_url,
            self.jsonl_output_path,
        ]
        with ThreadPoolExecutor(max_workers=self.thread_count) as executor:
            futures = [
                # BUG FIX: the original `i + 1 % self.thread_count` parsed as
                # `i + (1 % thread_count)` due to operator precedence, so
                # thread ids grew without bound and window positions drifted
                # off-screen. `(i + 1) % thread_count` cycles them correctly.
                executor.submit(
                    scrape_all, chunk[0], chunk[1], (i + 1) % self.thread_count, *args
                )
                for i, chunk in enumerate(chunks)
            ]
            results: list[dict[str, str]] = []
            for future in futures:
                results.extend(future.result())

        return results
|
amzsc/utils/__init__.py
ADDED
amzsc/utils/constants.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
class Constants:
    """Project-wide constant values: endpoints, page locators, geometry."""

    THREAD_MESSAGE = ""

    # Proxy services.
    PROXY_LIVE_URL = "https://clonebysun.com/api/tienich/checkliveproxy"
    PROXY_BASE_URL = "http://proxy.shoplike.vn/Api"
    GET_NEW_ENDPOINT = f"{PROXY_BASE_URL}/getNewProxy"
    GET_CURRENT_ENDPOINT = f"{PROXY_BASE_URL}/getCurrentProxy"

    # Amazon product-page locators.
    PRODUCT_OVERVIEW = "productOverview_hoc_view_div"  # element id
    PRODUCT_SPECS = "productSpecifications-content"  # element id
    PRODUCT_SPECS_SCRIPT = "arguments[0].style.display = 'block';"
    PRODUCT_MICRO = ".a-normal.a-spacing-micro"  # CSS selector

    # Screen geometry used for tiling browser windows.
    MONITOR_WIDTH = 1920
    MONITOR_HEIGHT = 1080

    # Accepted logging level names.
    LOGGING_LEVELS = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import threading
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
# Guards concurrent appends from the scraper's worker threads.
lock = threading.Lock()


def write_to_json(path: Path, row: dict[str, str]) -> None:
    """Append ``row`` as one JSON line (JSONL) to ``path``, thread-safely.

    Args:
        path: Target JSONL file; created on first write, appended thereafter.
        row: The record to serialize. Non-ASCII text is written as-is
            (``ensure_ascii=False``).
    """
    # Serialize outside the lock so only the actual file append is serialized.
    line = json.dumps(row, ensure_ascii=False) + "\n"
    # Single combined `with` replaces the original nested context managers.
    with lock, open(path, "a", encoding="utf-8") as f:
        f.write(line)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
def get_zone(marketplace: str | None = None) -> str:
|
|
2
|
+
if marketplace is None:
|
|
3
|
+
marketplace = "de"
|
|
4
|
+
zone = {
|
|
5
|
+
"us": "com",
|
|
6
|
+
"usa": "com",
|
|
7
|
+
"uk": "co.uk",
|
|
8
|
+
"gb": "co.uk",
|
|
9
|
+
"de": "de",
|
|
10
|
+
"fr": "fr",
|
|
11
|
+
"es": "es",
|
|
12
|
+
"it": "it",
|
|
13
|
+
}
|
|
14
|
+
return zone.get(marketplace.lower(), "de")
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: amzsc
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: Amazon product description scraper
|
|
5
|
+
Requires-Dist: bs4>=0.0.2
|
|
6
|
+
Requires-Dist: fake-useragent>=2.2.0
|
|
7
|
+
Requires-Dist: requests>=2.32.5
|
|
8
|
+
Requires-Dist: selenium>=4.35.0
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
|
|
12
|
+
# amzsc
|
|
13
|
+
|
|
14
|
+
[](https://pypi.python.org/pypi/amzsc)
|
|
15
|
+
[](https://pypi.python.org/pypi/amzsc)
|
|
16
|
+
[](https://pypi.python.org/pypi/amzsc)
|
|
17
|
+
[](https://github.com/zenovate-org/amzsc/actions)
|
|
18
|
+
[](https://codecov.io/gh/zenovate-org/amzsc)
|
|
19
|
+
|
|
20
|
+
`amzsc` is an Amazon product description scraper library that allows you to extract product details such as title, price, description, and reviews using ASINs.
|
|
21
|
+
|
|
22
|
+
## Example Usage
|
|
23
|
+
|
|
24
|
+
```python
|
|
25
|
+
from amzsc import AmazonScraper
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def main():
|
|
29
|
+
    # Initialize the AmazonScraper (no credentials required; optional proxy/config kwargs)
|
|
30
|
+
scraper = AmazonScraper()
|
|
31
|
+
asins = ['B08N5WRWNW', 'B07XJ8C8F5'] # Example ASINs
|
|
32
|
+
    results = scraper.scrape(asins=asins, marketplace="US")  # list of dicts with scraped data
|
|
33
|
+
print(results)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
if __name__ == "__main__":
|
|
37
|
+
main()
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Installation
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
pip install amzsc
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## Contribution
|
|
47
|
+
|
|
48
|
+
Contributions are welcome! Please open an issue or submit a pull request for any improvements or bug fixes.
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
git clone https://github.com/zenovate-org/amzsc.git
|
|
52
|
+
cd amzsc
|
|
53
|
+
uv sync
|
|
54
|
+
```
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
amzsc/__init__.py,sha256=-yg9RGVckvgcFa0HmWx1sqpvJdt59vO9Dwv_pnFAaZI,71
|
|
2
|
+
amzsc/handlers/__init__.py,sha256=soGrhbsh0slb0kl-4iO-vtonozkP9exOPgqY1A2LmV4,73
|
|
3
|
+
amzsc/handlers/error_handler.py,sha256=PxB9dX0qPRCB2wbryP7JUe0VSMVVJuhV_nrw_ybja8E,331
|
|
4
|
+
amzsc/modules/driver/driver_amazon.py,sha256=ehs_2yaZHkHvGV5AcVblWuOFKDcNEGflxCfyPIUqIJ4,1828
|
|
5
|
+
amzsc/modules/driver/driver_config.py,sha256=zTbafH31bYizEP_DLXaitJaQGLkirB7veEeyfnoW0Dk,2897
|
|
6
|
+
amzsc/modules/driver/driver_manipulator.py,sha256=WJjbK1Y-m1Scz1-vO63RjK34bdkQs_k4_srpwrdWIiA,808
|
|
7
|
+
amzsc/modules/proxy/__init__.py,sha256=4kMiT_LvHyFTtXzD_P_mbQvZPrw7miKlAnDj-9zt0Gs,61
|
|
8
|
+
amzsc/modules/proxy/proxy.py,sha256=DI9aIjn4A0DkdD2aojelPcmfX-r45bJGpP7UbFzye4o,533
|
|
9
|
+
amzsc/modules/proxy/proxy_request.py,sha256=FE2WiVPk1G2d-moTTnyfAJ6UndozFpRkGmrfjaaWXXI,1575
|
|
10
|
+
amzsc/scraper.py,sha256=kUC8uz70M_keLrEq6A7e_7gYHs5p-GRnDK8F0wv82xw,6702
|
|
11
|
+
amzsc/utils/__init__.py,sha256=TqrZ-Yz7jvCNiTbJ_nC2TQfE9K0yLG4m2j-C4VHB0yg,122
|
|
12
|
+
amzsc/utils/constants.py,sha256=BlchriYSVT4nxDwkVN1EOJoxPfGwSnke8SZtZ1BL3F0,630
|
|
13
|
+
amzsc/utils/custom_types.py,sha256=gBZySjlhASAZwVdXKLT84kLrVIzENt8hki61Jzc7kwQ,101
|
|
14
|
+
amzsc/utils/file_worker.py,sha256=2iMYSGeUWKErLhTzRhSTeA3vyEUNIAeM5GV-5rEzhnQ,273
|
|
15
|
+
amzsc/utils/marketplace.py,sha256=K87efMSvsrYv9euefIXsPV5rycdIlaSlFOteUn-3HgE,343
|
|
16
|
+
amzsc-0.1.1.dist-info/WHEEL,sha256=DpNsHFUm_gffZe1FgzmqwuqiuPC6Y-uBCzibcJcdupM,78
|
|
17
|
+
amzsc-0.1.1.dist-info/METADATA,sha256=FtAZbcuUllECEfrxpg6M86vq654v6qxJAHmJv2aVEfw,1645
|
|
18
|
+
amzsc-0.1.1.dist-info/RECORD,,
|