cert-host-scraper 0.3.0__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cert_host_scraper/__init__.py CHANGED
@@ -1,78 +1,5 @@
- import logging
- from dataclasses import dataclass
- from typing import List
+ from pathlib import Path

- import requests
- from bs4 import BeautifulSoup
+ from single_source import get_version

- logger = logging.getLogger(__name__)
-
-
- @dataclass
- class Options:
-     timeout: int
-     clean: bool
-
-
- @dataclass
- class UrlResult:
-     url: str
-     status_code: int
-
-
- @dataclass
- class Result:
-     scraped: List[UrlResult]
-
-     def filter_by_status_code(self, status_code: int) -> List[UrlResult]:
-         return [result for result in self.scraped if result.status_code == status_code]
-
-
- def fetch_site_information(url: str, timeout: int) -> int:
-     try:
-         return requests.get(url, timeout=timeout).status_code
-     except Exception as e:
-         logger.debug(e)
-         return -1
-
-
- def fetch_site(search: str) -> str:
-     url = f"https://crt.sh/?q={search}"
-     result = requests.get(url)
-     result.raise_for_status()
-
-     return result.content.decode()
-
-
- def scrape_urls(contents: str, options: Options) -> List[str]:
-     soup = BeautifulSoup(contents, features="html.parser")
-     tables = soup.findAll("table")
-
-     if len(tables) <= 2:
-         return []
-
-     results_table = tables[2]
-
-     total_urls = []
-     for row in results_table.findAll("tr"):
-         cells = row.findAll("td")
-         if len(cells) == 0:
-             continue
-
-         matching_identity = cells[4].decode_contents()
-         if options.clean and "*" in matching_identity:
-             continue
-
-         total_urls.append(f"https://{matching_identity}")
-
-     return list(set(total_urls))
-
-
- def validate_url(url: str, options: Options) -> UrlResult:
-     return UrlResult(url, fetch_site_information(url, options.timeout))
-
-
- def fetch_urls(site: str, options: Options) -> List[str]:
-     contents = fetch_site(site)
-     urls = scrape_urls(contents, options)
-     return urls
+ __version__ = get_version("cert_host_scraper", Path(__file__).parent.parent)
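Note on the new `__init__.py`: `single_source.get_version` resolves the version string at import time so it is defined in exactly one place. A rough sketch of the resolution it performs here (an approximation of the library's behaviour, not its actual code, and the lookup order shown is an assumption):

```python
# Approximate resolution performed by single_source.get_version (a sketch):
# prefer installed-package metadata, fall back to a pyproject.toml next to
# the package.
from importlib.metadata import PackageNotFoundError, version
from pathlib import Path


def resolve_version(package: str, root: Path) -> str | None:
    try:
        return version(package)  # metadata of the installed distribution
    except PackageNotFoundError:
        pass
    pyproject = root / "pyproject.toml"
    if pyproject.is_file():
        import tomllib  # stdlib from Python 3.11 onward; an assumption here

        data = tomllib.loads(pyproject.read_text())
        return data.get("project", {}).get("version")
    return None
```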
cert_host_scraper/cli.py CHANGED
@@ -1,17 +1,84 @@
+ import asyncio
+ import json
  import logging
  import sys

  import click
  from requests import RequestException
+ from rich import box
  from rich.console import Console
  from rich.progress import track
  from rich.table import Table

- from cert_host_scraper import Options, Result, fetch_urls, validate_url
+ from cert_host_scraper import __version__
+ from cert_host_scraper.scraper import (
+     Options,
+     Result,
+     UrlResult,
+     fetch_urls,
+     validate_url,
+ )
+ from cert_host_scraper.utils import divide_chunks, strip_url
+
+ NO_STATUS_CODE_FILTER = 0
+
+
+ def process_urls(
+     urls: list[str], options: Options, batch_size: int, show_progress: bool
+ ) -> list[UrlResult]:
+     """
+     Process a list of URLs concurrently and return the results.
+     """
+     results = []
+     try:
+         loop = asyncio.get_running_loop()
+     except RuntimeError:
+         loop = asyncio.new_event_loop()
+         asyncio.set_event_loop(loop)
+
+     chunks = list(divide_chunks(urls, batch_size))
+
+     progress_iterable = range(len(chunks))
+     if show_progress:
+         progress_iterable = track(progress_iterable, "Checking URLs")
+
+     for chunk_index in progress_iterable:
+         chunk = chunks[chunk_index]
+         chunk_result = loop.run_until_complete(
+             asyncio.gather(*[validate_url(url, options) for url in chunk])
+         )
+         results.extend(chunk_result)
+
+     return results
+
+
+ def validate_status_code(
+     _ctx: click.core.Context, _param: click.core.Option, value: str
+ ):
+     try:
+         status_code = int(value)
+         if not (100 <= status_code <= 599):
+             raise click.BadParameter("status code must be between 100 and 599")
+
+         return status_code
+     except ValueError as e:
+         raise click.BadParameter("must be an integer") from e
+     except TypeError:
+         return NO_STATUS_CODE_FILTER
+
+
+ class Output:
+     TABLE = "table"
+     JSON = "json"
+
+     @classmethod
+     def values(cls) -> list:
+         return [cls.TABLE, cls.JSON]


  @click.group()
  @click.option("--debug", is_flag=True, help="Whether to enable debug level output")
+ @click.version_option(__version__, message="%(version)s")
  def cli(debug: bool):
      log_level = logging.DEBUG if debug else logging.INFO
      logging.basicConfig(level=log_level)
@@ -19,41 +86,93 @@ def cli(debug: bool):

  @cli.command()
  @click.argument("search")
- @click.option("--status-code", help="Pass the HTTP status code to filter results on")
+ @click.option(
+     "--status-code",
+     help="Pass the HTTP status code to filter results on",
+     callback=validate_status_code,
+ )
  @click.option("--timeout", help="Seconds before timing out on each request", default=2)
  @click.option(
      "--clean/--no-clean", is_flag=True, help="Clean wildcard results", default=True
  )
- def search(search: str, status_code: int, timeout: int, clean: bool):
+ @click.option(
+     "--strip/--no-strip",
+     is_flag=True,
+     help="Remove protocol and leading www from search",
+     default=True,
+ )
+ @click.option(
+     "--batch-size",
+     help="Number of URLs to process at once",
+     default=20,
+ )
+ @click.option(
+     "--output", type=click.Choice(Output.values()), required=True, default="table"
+ )
+ def search(
+     search: str,
+     status_code: int,
+     timeout: int,
+     clean: bool,
+     strip: bool,
+     batch_size: int,
+     output: str,
+ ):
      """
      Search the certificate transparency log.
      """
-     click.echo(f"Searching for {search}")
+     if strip:
+         search = strip_url(search)
+
+     display_json = output == Output.JSON
+
+     if not display_json:
+         click.echo(f"Searching for {search}")
      options = Options(timeout, clean)
-     results = []
+
      try:
          urls = fetch_urls(search, options)
-         click.echo(f"Found {len(urls)} URLs for {search}")
-         for url in track(urls, "Checking URLs"):
-             results.append(validate_url(url, options))
      except RequestException as e:
          click.echo(f"Failed to search for results: {e}")
          sys.exit(1)

-     result = Result(results)
-     if status_code:
-         display = result.filter_by_status_code(int(status_code))
+     if not display_json:
+         click.echo(f"Found {len(urls)} URLs for {search}")
+
+     scraped_results = process_urls(
+         urls, options, batch_size, show_progress=not display_json
+     )
+
+     result = Result(scraped_results)
+     if status_code != NO_STATUS_CODE_FILTER:
+         display = result.filter_by_status_code(status_code)
      else:
          display = result.scraped

-     table = Table(show_header=True, header_style="bold")
-     table.add_column("URL")
-     table.add_column("Status Code")
-     for url_result in display:
-         table.add_row(url_result.url, str(url_result.status_code))
+     if display_json:
+         json_output = [
+             {"url": url_result.url, "status_code": url_result.status_code}
+             for url_result in display
+         ]
+         click.echo(json.dumps(json_output, indent=2))
+     else:
+         table = Table(show_header=True, header_style="bold", box=box.MINIMAL)
+         table.add_column("URL")
+         table.add_column("Status Code")
+         for url_result in display:
+             display_code = str(url_result.status_code)
+             if url_result.status_code == -1:
+                 display_code = "-"
+
+             url = url_result.url
+             if url_result.status_code == 200:
+                 display_code = f"[green]{display_code}[/green]"
+                 url = f"[green]{url}[/green]"
+
+             table.add_row(url, display_code)

-     console = Console()
-     console.print(table)
+         console = Console()
+         console.print(table)


  if __name__ == "__main__":
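The new `process_urls` replaces the old one-request-per-iteration loop: it slices the URL list into `--batch-size` chunks and drives each chunk's `validate_url` coroutines through `asyncio.gather`, so at most one batch is in flight at a time. A minimal self-contained sketch of the same pattern; `check` is a hypothetical stand-in for `validate_url`:

```python
import asyncio


def divide_chunks(objects: list, size: int):
    # Fixed-size slices, mirroring cert_host_scraper.utils.divide_chunks.
    for i in range(0, len(objects), size):
        yield objects[i : i + size]


async def check(url: str) -> tuple[str, int]:
    # Hypothetical stand-in; the real validate_url runs a blocking
    # requests.get in a worker thread via asyncio.to_thread.
    await asyncio.sleep(0)
    return url, 200


urls = [f"https://host-{n}.example.com" for n in range(50)]
loop = asyncio.new_event_loop()
try:
    results = []
    for chunk in divide_chunks(urls, 20):  # batches of 20, 20 and 10
        results.extend(loop.run_until_complete(asyncio.gather(*(check(u) for u in chunk))))
finally:
    loop.close()

print(len(results))  # 50
```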
cert_host_scraper/scraper.py ADDED
@@ -0,0 +1,87 @@
+ import asyncio
+ import logging
+ from dataclasses import dataclass
+
+ import requests
+ import urllib3
+ from tenacity import (
+     before_log,
+     retry,
+     retry_if_exception_type,
+     stop_after_attempt,
+     wait_exponential,
+ )
+
+ logger = logging.getLogger(__name__)
+ logging.getLogger("urllib3").setLevel(logging.ERROR)
+
+ urllib3.disable_warnings()
+
+
+ @dataclass
+ class Options:
+     timeout: int
+     clean: bool
+
+
+ @dataclass
+ class UrlResult:
+     url: str
+     status_code: int
+
+
+ @dataclass
+ class Result:
+     scraped: list[UrlResult]
+
+     def filter_by_status_code(self, status_code: int) -> list[UrlResult]:
+         return [result for result in self.scraped if result.status_code == status_code]
+
+
+ def fetch_site_information(url: str, timeout: int) -> int:
+     try:
+         return requests.get(url, timeout=timeout).status_code
+     except Exception as e:
+         logger.debug(e)
+         return -1
+
+
+ async def async_fetch_site_information(url: str, timeout: int) -> int:
+     return await asyncio.to_thread(fetch_site_information, url, timeout)
+
+
+ @retry(
+     retry=retry_if_exception_type(requests.HTTPError),
+     stop=stop_after_attempt(3),
+     wait=wait_exponential(multiplier=1, min=1, max=10),
+     reraise=True,
+     before=before_log(logger, logging.DEBUG),
+ )
+ def fetch_site(search: str) -> list[dict]:
+     url = f"https://crt.sh/?q={search}&output=json"
+     result = requests.get(url)
+     result.raise_for_status()
+
+     return result.json()
+
+
+ def scrape_urls(results: list[dict], options: Options) -> list[str]:
+     total_urls = []
+     for result in results:
+         common_name = result["common_name"]
+
+         if options.clean and "*" in common_name:
+             continue
+
+         total_urls.append(f"https://{common_name}")
+
+     return list(set(total_urls))
+
+
+ def fetch_urls(site: str, options: Options) -> list[str]:
+     results = fetch_site(site)
+     return scrape_urls(results, options)
+
+
+ async def validate_url(url: str, options: Options) -> UrlResult:
+     return UrlResult(url, await async_fetch_site_information(url, options.timeout))
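`scraper.py` now queries crt.sh's JSON endpoint (`&output=json`) instead of parsing the HTML results table, retrying up to three times on `HTTPError` with exponential backoff. `scrape_urls` then keys on each entry's `common_name`, skipping wildcards when clean is enabled and de-duplicating through a `set`. A small worked example, assuming cert-host-scraper 0.10.x is installed; the sample entries are hand-written, and a real crt.sh response carries more fields (`issuer_name`, `not_before`, ...) that this code ignores:

```python
from cert_host_scraper.scraper import Options, scrape_urls

# Hand-written stand-in for a crt.sh JSON response.
sample = [
    {"common_name": "example.com"},
    {"common_name": "*.example.com"},  # wildcard: dropped when clean=True
    {"common_name": "example.com"},    # duplicate: collapsed by set()
]

print(scrape_urls(sample, Options(timeout=2, clean=True)))
# ['https://example.com']
```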
cert_host_scraper/utils.py ADDED
@@ -0,0 +1,13 @@
+ import re
+ from collections.abc import Iterable
+
+
+ def strip_url(url: str) -> str:
+     url = re.sub(r"https?://", "", url)
+     url = re.sub(r"www.", "", url)
+     return re.sub(r"/.*", "", url)
+
+
+ def divide_chunks(objects: list, size: int) -> Iterable[list[str]]:
+     for i in range(0, len(objects), size):
+         yield objects[i : i + size]
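One caveat in `strip_url`: the `r"www."` pattern is neither anchored nor escaped, so the `.` matches any character and the substitution can fire mid-hostname (e.g. `wwwest.example.com` collapses to `st.example.com`). A sketch of a stricter variant; this is a suggested fix, not what 0.10.3 ships:

```python
import re


def strip_url_strict(url: str) -> str:
    # Anchor both prefixes and escape the dot so only a literal
    # leading "www." is removed.
    url = re.sub(r"^https?://", "", url)
    url = re.sub(r"^www\.", "", url)
    return re.sub(r"/.*", "", url)


assert strip_url_strict("https://www.example.com/path") == "example.com"
assert strip_url_strict("wwwest.example.com") == "wwwest.example.com"
```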
cert_host_scraper-0.10.3.dist-info/METADATA ADDED
@@ -0,0 +1,68 @@
+ Metadata-Version: 2.4
+ Name: cert-host-scraper
+ Version: 0.10.3
+ Summary: Query the certificate transparency log from crt.sh by a given keyword and return the status code of each matched result, optionally filtering the results by status code.
+ Project-URL: homepage, https://github.com/inverse/cert-host-scraper
+ Project-URL: repository, https://github.com/inverse/cert-host-scraper
+ Author-email: Malachi Soord <inverse.chi@gmail.com>
+ License-Expression: MIT
+ License-File: LICENSE
+ Requires-Python: <4,>=3.10
+ Requires-Dist: click<9,>=8.1.8
+ Requires-Dist: requests<3,>=2.27.1
+ Requires-Dist: rich<15,>=11
+ Requires-Dist: single-source<0.5,>=0.4.0
+ Requires-Dist: tenacity<10,>=9.0.0
+ Description-Content-Type: text/markdown
+
+ # Cert Host Scraper
+
+ [![CI](https://github.com/inverse/cert-host-scraper/actions/workflows/ci.yml/badge.svg)](https://github.com/inverse/cert-host-scraper/actions/workflows/ci.yml)
+ [![PyPI version](https://badge.fury.io/py/cert-host-scraper.svg)](https://badge.fury.io/py/cert-host-scraper)
+ ![PyPI downloads](https://img.shields.io/pypi/dm/cert-host-scraper?label=pypi%20downloads)
+ [![codecov](https://codecov.io/github/inverse/cert-host-scraper/graph/badge.svg?token=TLO58M5UC5)](https://codecov.io/github/inverse/cert-host-scraper)
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
+ ![Static Badge](https://img.shields.io/badge/type%20checked-mypy-039dfc)
+ [![License](https://img.shields.io/github/license/inverse/cert-host-scraper.svg)](LICENSE)
+
+ Query the certificate transparency log from [crt.sh](https://crt.sh) by a given keyword and return the status code of each matched result, optionally filtering the results by status code.
+
+ <img alt="Demo of cert-host-scraper" src="https://vhs.charm.sh/vhs-7fKWanXXcalG2oS28DVyZC.gif" width="800" />
+
+ ## Usage
+
+ ```bash
+ cert-host-scraper search your-domain.com [--status-code 200]
+ ```
+
+ ## Installation
+
+ With pipx:
+
+ ```bash
+ pipx install cert-host-scraper
+ ```
+
+ With pip:
+
+ ```bash
+ pip install cert-host-scraper
+ ```
+
+ ## Development
+
+ Requires [uv][0] and Python 3.10+.
+
+ ```bash
+ uv sync
+ uv run python -m cert_host_scraper.cli
+ ```
+
+ All dev tooling is managed by [mise][1], as defined in the provided `mise.toml` and `.python-version` files.
+
+ ## License
+
+ MIT
+
+ [0]: https://github.com/astral-sh/uv
+ [1]: https://github.com/jdx/mise
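The README shows only the table mode; per cli.py above, `--output json` suppresses the progress and "Searching for" output and emits a plain JSON document instead. A sketch of the shape, with illustrative values:

```python
import json

# Shape emitted by `cert-host-scraper search example.com --output json`,
# reconstructed from cli.py above; the URLs and codes are illustrative.
# A status_code of -1 marks a request that failed or timed out.
json_output = [
    {"url": "https://example.com", "status_code": 200},
    {"url": "https://mail.example.com", "status_code": -1},
]
print(json.dumps(json_output, indent=2))
```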
cert_host_scraper-0.10.3.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+ cert_host_scraper/__init__.py,sha256=BUkbAFUgGLjxexWjDqSMZA-YAY55fxoD-lym6D6yDHw,142
+ cert_host_scraper/cli.py,sha256=Hxu5XzC7O5eWph6Wrer5LcZEUkkxEeyCfIMyAoeuDWE,4685
+ cert_host_scraper/scraper.py,sha256=NPOGRk15HRDChhBS7ubJKbVD4z4sb7yJ-0MBorskL40,2037
+ cert_host_scraper/utils.py,sha256=p_t8LIrehMMLKNEiSDYGi4TehH9z0neDz3EdoOAeEeQ,340
+ cert_host_scraper-0.10.3.dist-info/METADATA,sha256=LpsmRHAB79H9uDq6V9Dt6rYWeh4EOosGYVDebRI0_pg,2398
+ cert_host_scraper-0.10.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ cert_host_scraper-0.10.3.dist-info/entry_points.txt,sha256=i8Io-t4dA1B2V2IGhht4QiaT_IavR-tGSZE_vmwYhvo,64
+ cert_host_scraper-0.10.3.dist-info/licenses/LICENSE,sha256=x2zGqcA4IWCXX5UKMH144zM_rK2NMXSXHN5Qn8cg6yY,1070
+ cert_host_scraper-0.10.3.dist-info/RECORD,,
cert_host_scraper-0.10.3.dist-info/WHEEL CHANGED
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry 1.0.8
+ Generator: hatchling 1.27.0
  Root-Is-Purelib: true
  Tag: py3-none-any
cert_host_scraper-0.10.3.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ cert-host-scraper = cert_host_scraper.cli:cli
cert_host_scraper-0.3.0.dist-info/METADATA DELETED
@@ -1,54 +0,0 @@
- Metadata-Version: 2.1
- Name: cert-host-scraper
- Version: 0.3.0
- Summary:
- Home-page: https://github.com/inverse/cert-host-scraper
- License: MIT
- Author: Malachi Soord
- Author-email: inverse.chi@gmail.com
- Requires-Python: >=3.10,<4.0
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.10
- Requires-Dist: aiohttp[speedups] (>=3.8.1,<4.0.0)
- Requires-Dist: beautifulsoup4 (>=4.10.0,<5.0.0)
- Requires-Dist: click (>=8.0.3,<9.0.0)
- Requires-Dist: requests (>=2.27.1,<3.0.0)
- Requires-Dist: rich (>=11.0.0,<12.0.0)
- Project-URL: Repository, https://github.com/inverse/cert-host-scraper
- Description-Content-Type: text/markdown
-
- # Cert Host Scraper
-
- ![CI](https://github.com/inverse/cert-host-scraper/workflows/CI/badge.svg)
- [![PyPI version](https://badge.fury.io/py/cert-host-scraper.svg)](https://badge.fury.io/py/cert-host-scraper)
- ![PyPI downloads](https://img.shields.io/pypi/dm/cert-host-scraper?label=pypi%20downloads)
- [![License](https://img.shields.io/github/license/inverse/cert-host-scraper.svg)](LICENSE)
- [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
-
- Query the certificate transparency log for a keyword and check the status code of the results. Optionally filtering out based on the code.
-
- ## Usage
-
- ```bash
- cert-host-scraper search your-domain.com [--status-code 200]
- ```
-
- ## Installation
-
- With pipx:
-
- ```bash
- pipx install cert-host-scraper
- ```
-
- With pip:
-
- ```bash
- pip install cert-host-scraper
- ```
-
- ## Licence
-
- MIT
-
cert_host_scraper-0.3.0.dist-info/RECORD DELETED
@@ -1,7 +0,0 @@
- cert_host_scraper/__init__.py,sha256=rxKmbcZRR3VtiwjJuOb-gziYvmlQJvHWjJFvew8ysHo,1768
- cert_host_scraper/cli.py,sha256=V3ek-4_z8UJNAElm1GpO3fhRKjNL09-FQaqMSOcEOGo,1805
- cert_host_scraper-0.3.0.dist-info/entry_points.txt,sha256=GlQNSNbnSjw_MDZrOzhqcATBJ7C4otv7Adrz2yaeK0w,63
- cert_host_scraper-0.3.0.dist-info/LICENSE,sha256=x2zGqcA4IWCXX5UKMH144zM_rK2NMXSXHN5Qn8cg6yY,1070
- cert_host_scraper-0.3.0.dist-info/WHEEL,sha256=DA86_h4QwwzGeRoz62o1svYt5kGEXpoUTuTtwzoTb30,83
- cert_host_scraper-0.3.0.dist-info/METADATA,sha256=An6mxNNP2ozqz-2TuaQ3CNz1GYV6glhEJ4asOp9E1vc,1575
- cert_host_scraper-0.3.0.dist-info/RECORD,,
cert_host_scraper-0.3.0.dist-info/entry_points.txt DELETED
@@ -1,3 +0,0 @@
- [console_scripts]
- cert-host-scraper=cert_host_scraper.cli:cli
-