mcmaster_scraper-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcmaster_scraper/__init__.py +5 -0
- mcmaster_scraper/_api/_text_parser.py +60 -0
- mcmaster_scraper/_api/scraper.py +56 -0
- mcmaster_scraper/_api/table_parser.py +49 -0
- mcmaster_scraper/_utils/event_loop_wrapper.py +55 -0
- mcmaster_scraper/_utils/page_provider.py +39 -0
- mcmaster_scraper/async_api.py +49 -0
- mcmaster_scraper/py.typed +0 -0
- mcmaster_scraper/sync_api.py +42 -0
- mcmaster_scraper-0.1.0.dist-info/METADATA +113 -0
- mcmaster_scraper-0.1.0.dist-info/RECORD +13 -0
- mcmaster_scraper-0.1.0.dist-info/WHEEL +4 -0
- mcmaster_scraper-0.1.0.dist-info/licenses/LICENSE +21 -0
mcmaster_scraper/_api/_text_parser.py (new file, 60 lines):

```python
from fractions import Fraction

_STANDARD_HEADERS = {
    "PART_NUMBER": "Part Number",
    "PRICING": "Price",
}


def get_header_text(col_id: int, meta: dict):
    # Column ID -> Column Metadata
    column_metas = meta["ColumnIdToMetadata"]
    column_meta = column_metas[col_id]

    header_type = column_meta.get("Type")
    if isinstance(header_type, str) and header_type in _STANDARD_HEADERS:
        return _STANDARD_HEADERS[header_type]
    else:
        # Column Metadata -> Header
        return _extract_text(column_meta)


def get_cell_text(cell_id: int, meta: dict):
    # Cell ID -> Value Metadata ID
    cell_metas = meta["CellIdToCellMetadata"]
    cell_meta = cell_metas[cell_id]
    value_meta_id = cell_meta["ValueMetadataIds"][0]

    # Value Metadata ID -> Value Metadata
    value_metas = meta["ValueMetadataIdToValueMetadata"]
    value_meta = value_metas[value_meta_id]

    # Value Metadata -> Value
    return _extract_text(value_meta)


def _extract_text(meta_item: dict):
    components = meta_item["Name"]["Components"]
    text = " ".join(c["Text"] for c in components)

    return _parse_number(text)


def _parse_number(text: str):
    t = text.replace('"', "").strip()

    if t == "":
        return t

    try:
        return float(t)
    except ValueError:
        pass

    try:
        fraction = sum(Fraction(part) for part in t.split())
        return float(fraction)
    except ValueError:
        pass

    return text
```
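For illustration, here is how `_parse_number` handles typical catalog values: plain decimals, single fractions, mixed numbers with inch marks, and non-numeric text. The inputs below are made up, and the example assumes the private module is importable from an installed wheel.

```python
# Illustrative inputs only; _parse_number is a private helper of the package.
from mcmaster_scraper._api._text_parser import _parse_number

print(_parse_number("0.190"))    # 0.19  (plain float)
print(_parse_number("1/4"))      # 0.25  (single fraction)
print(_parse_number('1 1/2"'))   # 1.5   (mixed number; the inch mark is stripped)
print(_parse_number("18-8 Stainless Steel"))  # returned unchanged as text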
mcmaster_scraper/_api/scraper.py (new file, 56 lines):

```python
import json
import logging
import re

from .._utils.page_provider import get_page

logger = logging.getLogger(__name__)


async def get_product_api_response(url: str) -> dict:
    if not _is_valid_url(url):
        raise ValueError("Not a McMaster-Carr URL")

    logger.info("Finding API for product page...")
    # Using Playwright because the API can only be discovered by loading the JavaScript
    page = await get_page()

    await page.goto(url)

    # If the JSON is too large, the response will be evicted
    # from the inspector cache before we can access it

    # As a workaround, we can navigate to the API URL
    # and extract the response from the page's body

    product_api = "**/ProdPageWebPart.aspx?**"
    async with page.expect_request(product_api, timeout=5000) as request:
        value = await request.value
        api_url = value.url

    logger.info("Getting API response...")
    await page.goto(api_url)

    res = await page.locator("body").text_content()
    assert res is not None

    data = _extract_json_from_response(res)

    await page.close()
    return data


def _extract_json_from_response(res: str) -> dict:
    start = res.find("{")
    end = res.rfind("}")

    if start == -1 or end == -1:
        raise ValueError("No JSON found in API response")

    json_str = res[start : end + 1]
    return json.loads(json_str)


def _is_valid_url(url: str) -> bool:
    pattern = re.compile(r"^https?://(www\.)?mcmaster\.com(/\S*)?$")
    return bool(pattern.match(url))
```
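The body text of the navigated API page can carry rendering chrome around the JSON, so the parser keeps only the span between the outermost braces. A standalone sketch of that trimming step (the wrapper text here is hypothetical):

```python
# Minimal sketch of the same trimming logic as _extract_json_from_response.
import json


def extract_json(res: str) -> dict:
    # Keep only the span from the first "{" to the last "}".
    start, end = res.find("{"), res.rfind("}")
    if start == -1 or end == -1:
        raise ValueError("No JSON found in API response")
    return json.loads(res[start : end + 1])


body_text = 'viewer chrome {"ProductPresentations": []} trailing text'
print(extract_json(body_text))  # {'ProductPresentations': []}
```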
mcmaster_scraper/_api/table_parser.py (new file, 49 lines):

```python
from pandas import DataFrame

from ._text_parser import get_cell_text, get_header_text


def get_product_tables(json: dict) -> dict[str, DataFrame]:
    tables = _find_pivot_tables(json)
    dataframes = {k: _parse_pivot_table(v) for k, v in tables.items()}
    return dataframes


def _find_pivot_tables(root: dict) -> dict:
    stack = [root]
    while stack:
        node = stack.pop()

        if isinstance(node, dict):
            if node.get("Name") == "ProductPresentations":
                return {
                    product["Display"]["Title"]: product["Table"]
                    for product in node["Data"]
                }
            else:
                stack.extend(node.values())

        elif isinstance(node, list):
            stack.extend(node)

    raise KeyError("The McMaster URL provided does not have a visible product table.")


def _parse_pivot_table(table: dict) -> DataFrame:
    col_ids = table["ColumnIds"]
    rows = table["Rows"]
    meta = table["Metadata"]

    # Build headers
    headers = [get_header_text(col_id, meta) for col_id in col_ids]

    # Build row data
    def get_row_data(row: dict):
        cell_ids = row["ColumnIdToCellIdMap"]
        return {
            header: get_cell_text(cell_ids[col_id], meta)
            for col_id, header in zip(col_ids, headers)
        }

    data = [get_row_data(row) for row in rows]
    return DataFrame(data)
```
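To make the indirection concrete, here is a hand-built payload in the shape `_parse_pivot_table` expects: column IDs map to header metadata, each row maps column IDs to cell IDs, and cells resolve to value metadata. All names and IDs below are illustrative, not actual API output.

```python
# Illustrative payload only; the real API response has the same key structure.
from mcmaster_scraper._api.table_parser import get_product_tables

payload = {
    "Name": "ProductPresentations",
    "Data": [
        {
            "Display": {"Title": "Socket Head Screws"},
            "Table": {
                "ColumnIds": [0, 1],
                "Metadata": {
                    "ColumnIdToMetadata": {
                        0: {"Type": "PART_NUMBER"},  # mapped via _STANDARD_HEADERS
                        1: {"Name": {"Components": [{"Text": "Length"}]}},
                    },
                    "CellIdToCellMetadata": {
                        10: {"ValueMetadataIds": [100]},
                        11: {"ValueMetadataIds": [101]},
                    },
                    "ValueMetadataIdToValueMetadata": {
                        100: {"Name": {"Components": [{"Text": "91251A537"}]}},
                        101: {"Name": {"Components": [{"Text": '1 1/2"'}]}},
                    },
                },
                "Rows": [{"ColumnIdToCellIdMap": {0: 10, 1: 11}}],
            },
        }
    ],
}

tables = get_product_tables(payload)
print(tables["Socket Head Screws"])
#   Part Number  Length
# 0   91251A537     1.5
```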
mcmaster_scraper/_utils/event_loop_wrapper.py (new file, 55 lines):

```python
import asyncio
import sys
import threading
from asyncio import AbstractEventLoop, CancelledError
from concurrent.futures import Future
from typing import Any, Coroutine, TypeVar, Union

T = TypeVar("T")

_loop: Union[AbstractEventLoop, None] = None
_started = threading.Event()
_lock = threading.Lock()


async def run_in_loop_async(func: Coroutine[Any, Any, T]) -> T:
    c_future = _run_in_loop(func)
    a_future = asyncio.wrap_future(c_future)
    try:
        return await a_future
    except CancelledError:
        c_future.cancel()
        raise


def run_in_loop_sync(func: Coroutine[Any, Any, T]) -> T:
    return _run_in_loop(func).result()


def _run_in_loop(func: Coroutine[Any, Any, T]) -> Future[T]:
    loop = _ensure_loop()
    return asyncio.run_coroutine_threadsafe(func, loop)


def _ensure_loop() -> AbstractEventLoop:
    global _loop
    with _lock:
        if _loop is None:
            t = threading.Thread(target=_run_loop, daemon=True)
            t.start()
            _started.wait()

    assert _loop is not None
    return _loop


def _run_loop():
    global _loop
    if sys.platform.startswith("win"):
        loop = asyncio.ProactorEventLoop()
    else:
        loop = asyncio.SelectorEventLoop()
    asyncio.set_event_loop(loop)
    _loop = loop
    _started.set()
    loop.run_forever()
```
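This module keeps one daemon thread running a private event loop so coroutines can be driven from synchronous code, including code that already has a running loop (as in Jupyter). A minimal sketch of the sync entry point with a toy coroutine (the coroutine is illustrative, not part of the package):

```python
# Minimal sketch: driving a coroutine from plain sync code via the shared loop.
import asyncio

from mcmaster_scraper._utils.event_loop_wrapper import run_in_loop_sync


async def add(a: int, b: int) -> int:
    await asyncio.sleep(0)  # stand-in for real async work
    return a + b


print(run_in_loop_sync(add(1, 2)))  # 3, blocks until the coroutine finishes
```

`asyncio.run_coroutine_threadsafe` hands the coroutine to the background loop and returns a `concurrent.futures.Future`, which `run_in_loop_sync` simply blocks on; this sidesteps `asyncio.run`, which raises if called while another loop is already running.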
mcmaster_scraper/_utils/page_provider.py (new file, 39 lines):

```python
import asyncio
from typing import Union

from playwright.async_api import (
    Browser,
    BrowserContext,
    Page,
    Playwright,
    async_playwright,
)

_browser: Union[Browser, None] = None
_browser_context: Union[BrowserContext, None] = None
_playwright: Union[Playwright, None] = None
_lock = asyncio.Lock()


async def _ensure_browser_context() -> BrowserContext:
    global _browser, _browser_context, _playwright

    async with _lock:
        if _browser_context:
            return _browser_context

        _playwright = await async_playwright().start()
        assert _playwright

        _browser = await _playwright.chromium.launch()
        assert _browser

        _browser_context = await _browser.new_context()
        assert _browser_context

        return _browser_context


async def get_page() -> Page:
    browser_context = await _ensure_browser_context()
    return await browser_context.new_page()
```
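For reference, a minimal sketch of how a caller uses the provider; this mirrors what `scraper.py` does and assumes Playwright's browsers have been installed via `playwright install` (the target URL is illustrative):

```python
# Every get_page() call shares one lazily-launched Chromium context.
import asyncio

from mcmaster_scraper._utils.page_provider import get_page


async def main():
    page = await get_page()  # first call starts Playwright and launches Chromium
    await page.goto("https://example.com")
    print(await page.title())
    await page.close()  # the shared browser context stays alive for reuse


asyncio.run(main())
```

The `asyncio.Lock` guard makes the lazy initialization safe when several coroutines request pages at once.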
mcmaster_scraper/async_api.py (new file, 49 lines):

```python
import hashlib
from asyncio import create_task, gather

import diskcache as dc
import platformdirs
from pandas import DataFrame, concat

from ._api.scraper import get_product_api_response
from ._api.table_parser import get_product_tables
from ._utils.event_loop_wrapper import run_in_loop_async


async def get_products_from_url(url: str, refresh: bool = False) -> DataFrame:
    """Gets product tables from a McMaster-Carr URL.

    See Also
    --------
    sync_api.get_products_from_url
    """
    cache_dir = platformdirs.user_cache_dir(
        appname="mcmaster-scraper", appauthor=False, ensure_exists=True
    )
    cache = dc.Cache(cache_dir, eviction_policy="least-recently-used")
    key = hashlib.md5(url.encode()).hexdigest()

    if key in cache and not refresh:
        json = cache[key]
    else:
        json = await run_in_loop_async(get_product_api_response(url))
        cache[key] = json

    tables = get_product_tables(json)
    tables_with_product_type = [
        table.assign(**{"Product Type": product}) for product, table in tables.items()
    ]
    return concat(tables_with_product_type, ignore_index=True)


async def get_products_from_urls(
    urls: list[str], refresh: bool = False
) -> list[DataFrame]:
    """Gets product tables from a list of McMaster-Carr URLs.

    See Also
    --------
    sync_api.get_products_from_urls
    """
    tasks = [create_task(get_products_from_url(url, refresh)) for url in urls]
    return await gather(*tasks)
```
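These coroutine functions can be driven with `asyncio.run` from a plain script. A minimal sketch fetching several tables concurrently, reusing the catalog URL from the README (substitute your own list):

```python
# Fetch multiple product tables concurrently; gather preserves input order.
import asyncio

from mcmaster_scraper.async_api import get_products_from_urls

urls = [
    "https://www.mcmaster.com/products/screws/socket-head-screws-2~/steel-socket-head-screws~~/",
    # ...more McMaster-Carr URLs
]

frames = asyncio.run(get_products_from_urls(urls))  # one DataFrame per URL
for frame in frames:
    print(frame.shape)
```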
mcmaster_scraper/py.typed (empty marker file, no diff content)
mcmaster_scraper/sync_api.py (new file, 42 lines):

```python
from pandas import DataFrame

from . import async_api
from ._utils.event_loop_wrapper import run_in_loop_sync


def get_products_from_url(url: str, refresh: bool = False) -> DataFrame:
    """Gets product tables from a McMaster-Carr URL.

    If there are multiple product tables, they will be merged,
    and an additional "Product Type" column will be added.

    Parameters
    ----------
    url : str
        The URL to scrape.
        Must be a valid McMaster-Carr URL.
        The product tables must be visible on the webpage.
    refresh : bool, optional
        Whether to refresh the cached data. Default is False.

    Returns
    -------
    DataFrame
        A pandas DataFrame containing the combined product tables.

    Raises
    ------
    ValueError
        If the URL is not a valid McMaster-Carr URL.
    """
    return run_in_loop_sync(async_api.get_products_from_url(url, refresh))


def get_products_from_urls(urls: list[str], refresh: bool = False) -> list[DataFrame]:
    """Gets product tables from a list of McMaster-Carr URLs.

    See Also
    --------
    get_products_from_url
    """
    return run_in_loop_sync(async_api.get_products_from_urls(urls, refresh))
```
mcmaster_scraper-0.1.0.dist-info/METADATA (new file, 113 lines):

Metadata-Version: 2.4
Name: mcmaster-scraper
Version: 0.1.0
Summary: Fetch product tables from a McMaster-Carr URL as a DataFrame for complex filtering and calculations
Keywords: mcmaster,mcmaster-carr,scraper,extractor,parser,fetcher,part,catalog,product
Author: Alex
Author-email: Alex <thedjchidev@gmail.com>
License-Expression: MIT
License-File: LICENSE
Classifier: Development Status :: 3 - Alpha
Classifier: Intended Audience :: Manufacturing
Classifier: Intended Audience :: Science/Research
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Classifier: Programming Language :: Python :: 3.13
Classifier: Programming Language :: Python :: 3.14
Classifier: Topic :: Scientific/Engineering
Requires-Dist: diskcache>=5.6.3
Requires-Dist: pandas>=2.3.3
Requires-Dist: platformdirs>=3.5.1
Requires-Dist: playwright>=1.14.0
Requires-Python: >=3.10
Project-URL: github, https://github.com/thedjchi/mcmaster-scraper
Description-Content-Type: text/markdown

# McMaster-Scraper

A Python library for fetching product tables from a [McMaster-Carr](https://www.mcmaster.com) URL as a [DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) for complex filtering and calculations.




## Features

- Caches data locally to speed up future calls
- Supports both sync and async APIs
- Works in Python files and Jupyter notebooks
- Includes convenience functions to quickly retrieve product tables from multiple URLs
- Typed functions for type-checking compatibility

## Install

McMaster-Scraper is available on PyPI:

`pip install mcmaster-scraper`

McMaster-Scraper requires [Playwright](https://playwright.dev/python) to fetch the product tables. It is already included as a dependency. However, you will need to install the browsers manually:

`playwright install`

## Quick Start

To use the Sync API, import the `sync_api` module and call `get_products_from_url(s)`:

```python
from mcmaster_scraper.sync_api import get_products_from_url

url = "https://www.mcmaster.com/products/screws/socket-head-screws-2~/steel-socket-head-screws~~/"
data = get_products_from_url(url)  # Returns a DataFrame with all the products from the URL

...  # Do stuff with the DataFrame (filter, perform calculations, etc.)
```

Using the Async API is similar: import the `async_api` module and `await` the function call:

```python
from mcmaster_scraper.async_api import get_products_from_url

url = "https://www.mcmaster.com/products/screws/socket-head-screws-2~/steel-socket-head-screws~~/"
data = await get_products_from_url(url)  # Returns a DataFrame with all the products from the URL

...  # Do stuff with the DataFrame (filter, perform calculations, etc.)
```

## Docs

### API Reference

The API reference can be found on [GitHub Pages](https://thedjchi.github.io/mcmaster-scraper/mcmaster_scraper.html).

### Examples

An example script can be found in [docs/example.py](https://github.com/thedjchi/mcmaster-scraper/blob/master/docs/example.py).

## Disclaimer

This library is for responsible data extraction only. Do not:

- Scrape beyond reasonable rates
- Violate Terms of Service
- Circumvent access controls
- Use data for unauthorized commercial purposes

## Legal Notice

This library is provided as-is. The authors are not liable for any legal, technical, or business consequences resulting from misuse of this library. Users assume full responsibility for compliance with applicable laws, regulations, and website policies.

**By using this library, you acknowledge and agree to these responsibilities.**

## License

[MIT](https://github.com/thedjchi/mcmaster-scraper/blob/master/LICENSE)
mcmaster_scraper-0.1.0.dist-info/RECORD (new file, 13 lines):

mcmaster_scraper/__init__.py,sha256=ZS1B55NxS-IgvmJg4Z__BQxNVGp9Jw_eHLl0Ued2M7k,75
mcmaster_scraper/_api/_text_parser.py,sha256=D3QD36I5LrAJCM35VB5KW33yWb_0NSKOLj8atB1jsHM,1461
mcmaster_scraper/_api/scraper.py,sha256=MNel-80oMD4aF-tuy6UyPAFi9yXcEAQZdH-KseJ3GGk,1510
mcmaster_scraper/_api/table_parser.py,sha256=e26BS8d5SVPIFJIsIujpto3AYoYyOVhY5fWpSHK_0l4,1405
mcmaster_scraper/_utils/event_loop_wrapper.py,sha256=UkYw5OmjiQ-wq1sf_MZ8SQAXZh6CHkSsZstCjGxWh4U,1323
mcmaster_scraper/_utils/page_provider.py,sha256=nUFaKSQZvovM_RlVWzBRKpy5XgMhdI76umELaW_OuHE,929
mcmaster_scraper/async_api.py,sha256=dusD84Cn8blWQWjy4jUmlShQj2qtoNDzVHqE9mPeqOI,1484
mcmaster_scraper/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
mcmaster_scraper/sync_api.py,sha256=OZFmmar0y09Ne2rAs65CpUl3bKyIhs6GDZEvGYNVSt8,1195
mcmaster_scraper-0.1.0.dist-info/licenses/LICENSE,sha256=dT4IySfb5VBHiFbHfgF7UXboXtMlw273PdJeyBA8U2A,1065
mcmaster_scraper-0.1.0.dist-info/WHEEL,sha256=iCTolw4aw2dP3yfM-EQCGTDsFCXL_ymmbYnBRVH7plA,81
mcmaster_scraper-0.1.0.dist-info/METADATA,sha256=JxgVTwXozhDCWukDRvcfWEuUc-TgyZl06Xdp7WrFARM,4101
mcmaster_scraper-0.1.0.dist-info/RECORD,,
mcmaster_scraper-0.1.0.dist-info/licenses/LICENSE (new file, 21 lines):

MIT License

Copyright (c) 2026 thedjchi

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.