cert-host-scraper 0.3.0__py3-none-any.whl → 0.10.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cert_host_scraper/__init__.py CHANGED
@@ -1,78 +1,5 @@
- import logging
- from dataclasses import dataclass
- from typing import List
+ from pathlib import Path

- import requests
- from bs4 import BeautifulSoup
+ from single_source import get_version

- logger = logging.getLogger(__name__)
-
-
- @dataclass
- class Options:
-     timeout: int
-     clean: bool
-
-
- @dataclass
- class UrlResult:
-     url: str
-     status_code: int
-
-
- @dataclass
- class Result:
-     scraped: List[UrlResult]
-
-     def filter_by_status_code(self, status_code: int) -> List[UrlResult]:
-         return [result for result in self.scraped if result.status_code == status_code]
-
-
- def fetch_site_information(url: str, timeout: int) -> int:
-     try:
-         return requests.get(url, timeout=timeout).status_code
-     except Exception as e:
-         logger.debug(e)
-         return -1
-
-
- def fetch_site(search: str) -> str:
-     url = f"https://crt.sh/?q={search}"
-     result = requests.get(url)
-     result.raise_for_status()
-
-     return result.content.decode()
-
-
- def scrape_urls(contents: str, options: Options) -> List[str]:
-     soup = BeautifulSoup(contents, features="html.parser")
-     tables = soup.findAll("table")
-
-     if len(tables) <= 2:
-         return []
-
-     results_table = tables[2]
-
-     total_urls = []
-     for row in results_table.findAll("tr"):
-         cells = row.findAll("td")
-         if len(cells) == 0:
-             continue
-
-         matching_identity = cells[4].decode_contents()
-         if options.clean and "*" in matching_identity:
-             continue
-
-         total_urls.append(f"https://{matching_identity}")
-
-     return list(set(total_urls))
-
-
- def validate_url(url: str, options: Options) -> UrlResult:
-     return UrlResult(url, fetch_site_information(url, options.timeout))
-
-
- def fetch_urls(site: str, options: Options) -> List[str]:
-     contents = fetch_site(site)
-     urls = scrape_urls(contents, options)
-     return urls
+ __version__ = get_version("cert_host_scraper", Path(__file__).parent.parent)
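Note on the new `__init__.py`: `single_source.get_version` resolves the version string at import time so it is defined in exactly one place. A rough sketch of the resolution it performs here (an approximation of the library's behaviour, not its actual code, and the lookup order shown is an assumption):

```python
# Approximate resolution performed by single_source.get_version (a sketch):
# prefer installed-package metadata, fall back to a pyproject.toml next to
# the package.
from importlib.metadata import PackageNotFoundError, version
from pathlib import Path


def resolve_version(package: str, root: Path) -> str | None:
    try:
        return version(package)  # metadata of the installed distribution
    except PackageNotFoundError:
        pass
    pyproject = root / "pyproject.toml"
    if pyproject.is_file():
        import tomllib  # stdlib from Python 3.11 onward; an assumption here

        data = tomllib.loads(pyproject.read_text())
        return data.get("project", {}).get("version")
    return None
```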
cert_host_scraper/cli.py CHANGED
@@ -1,17 +1,84 @@
+ import asyncio
+ import json
  import logging
  import sys

  import click
  from requests import RequestException
+ from rich import box
  from rich.console import Console
  from rich.progress import track
  from rich.table import Table

- from cert_host_scraper import Options, Result, fetch_urls, validate_url
+ from cert_host_scraper import __version__
+ from cert_host_scraper.scraper import (
+     Options,
+     Result,
+     UrlResult,
+     fetch_urls,
+     validate_url,
+ )
+ from cert_host_scraper.utils import divide_chunks, strip_url
+
+ NO_STATUS_CODE_FILTER = 0
+
+
+ def process_urls(
+     urls: list[str], options: Options, batch_size: int, show_progress: bool
+ ) -> list[UrlResult]:
+     """
+     Process a list of URLs concurrently and return the results.
+     """
+     results = []
+     try:
+         loop = asyncio.get_running_loop()
+     except RuntimeError:
+         loop = asyncio.new_event_loop()
+         asyncio.set_event_loop(loop)
+
+     chunks = list(divide_chunks(urls, batch_size))
+
+     progress_iterable = range(len(chunks))
+     if show_progress:
+         progress_iterable = track(progress_iterable, "Checking URLs")
+
+     for chunk_index in progress_iterable:
+         chunk = chunks[chunk_index]
+         chunk_result = loop.run_until_complete(
+             asyncio.gather(*[validate_url(url, options) for url in chunk])
+         )
+         results.extend(chunk_result)
+
+     return results
+
+
+ def validate_status_code(
+     _ctx: click.core.Context, _param: click.core.Option, value: str
+ ):
+     try:
+         status_code = int(value)
+         if not (100 <= status_code <= 599):
+             raise click.BadParameter("status code must be between 100 and 599")
+
+         return status_code
+     except ValueError as e:
+         raise click.BadParameter("must be an integer") from e
+     except TypeError:
+         return NO_STATUS_CODE_FILTER
+
+
+ class Output:
+     TABLE = "table"
+     JSON = "json"
+
+     @classmethod
+     def values(cls) -> list:
+         return [cls.TABLE, cls.JSON]


  @click.group()
  @click.option("--debug", is_flag=True, help="Whether to enable debug level output")
+ @click.version_option(__version__, message="%(version)s")
  def cli(debug: bool):
      log_level = logging.DEBUG if debug else logging.INFO
      logging.basicConfig(level=log_level)
@@ -19,41 +86,93 @@ def cli(debug: bool):

  @cli.command()
  @click.argument("search")
- @click.option("--status-code", help="Pass the HTTP status code to filter results on")
+ @click.option(
+     "--status-code",
+     help="Pass the HTTP status code to filter results on",
+     callback=validate_status_code,
+ )
  @click.option("--timeout", help="Seconds before timing out on each request", default=2)
  @click.option(
      "--clean/--no-clean", is_flag=True, help="Clean wildcard results", default=True
  )
- def search(search: str, status_code: int, timeout: int, clean: bool):
+ @click.option(
+     "--strip/--no-strip",
+     is_flag=True,
+     help="Remove protocol and leading www from search",
+     default=True,
+ )
+ @click.option(
+     "--batch-size",
+     help="Number of URLs to process at once",
+     default=20,
+ )
+ @click.option(
+     "--output", type=click.Choice(Output.values()), required=True, default="table"
+ )
+ def search(
+     search: str,
+     status_code: int,
+     timeout: int,
+     clean: bool,
+     strip: bool,
+     batch_size: int,
+     output: str,
+ ):
      """
      Search the certificate transparency log.
      """
-     click.echo(f"Searching for {search}")
+     if strip:
+         search = strip_url(search)
+
+     display_json = output == Output.JSON
+
+     if not display_json:
+         click.echo(f"Searching for {search}")
      options = Options(timeout, clean)
-     results = []
+
      try:
          urls = fetch_urls(search, options)
-         click.echo(f"Found {len(urls)} URLs for {search}")
-         for url in track(urls, "Checking URLs"):
-             results.append(validate_url(url, options))
      except RequestException as e:
          click.echo(f"Failed to search for results: {e}")
          sys.exit(1)

-     result = Result(results)
-     if status_code:
-         display = result.filter_by_status_code(int(status_code))
+     if not display_json:
+         click.echo(f"Found {len(urls)} URLs for {search}")
+
+     scraped_results = process_urls(
+         urls, options, batch_size, show_progress=not display_json
+     )
+
+     result = Result(scraped_results)
+     if status_code != NO_STATUS_CODE_FILTER:
+         display = result.filter_by_status_code(status_code)
      else:
          display = result.scraped

-     table = Table(show_header=True, header_style="bold")
-     table.add_column("URL")
-     table.add_column("Status Code")
-     for url_result in display:
-         table.add_row(url_result.url, str(url_result.status_code))
+     if display_json:
+         json_output = [
+             {"url": url_result.url, "status_code": url_result.status_code}
+             for url_result in display
+         ]
+         click.echo(json.dumps(json_output, indent=2))
+     else:
+         table = Table(show_header=True, header_style="bold", box=box.MINIMAL)
+         table.add_column("URL")
+         table.add_column("Status Code")
+         for url_result in display:
+             display_code = str(url_result.status_code)
+             if url_result.status_code == -1:
+                 display_code = "-"
+
+             url = url_result.url
+             if url_result.status_code == 200:
+                 display_code = f"[green]{display_code}[/green]"
+                 url = f"[green]{url}[/green]"
+
+             table.add_row(url, display_code)

-     console = Console()
-     console.print(table)
+         console = Console()
+         console.print(table)


  if __name__ == "__main__":
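The new `process_urls` replaces the old one-request-per-iteration loop: it slices the URL list into `--batch-size` chunks and drives each chunk's `validate_url` coroutines through `asyncio.gather`, so at most one batch is in flight at a time. A minimal self-contained sketch of the same pattern; `check` is a hypothetical stand-in for `validate_url`:

```python
import asyncio


def divide_chunks(objects: list, size: int):
    # Fixed-size slices, mirroring cert_host_scraper.utils.divide_chunks.
    for i in range(0, len(objects), size):
        yield objects[i : i + size]


async def check(url: str) -> tuple[str, int]:
    # Hypothetical stand-in; the real validate_url runs a blocking
    # requests.get in a worker thread via asyncio.to_thread.
    await asyncio.sleep(0)
    return url, 200


urls = [f"https://host-{n}.example.com" for n in range(50)]
loop = asyncio.new_event_loop()
try:
    results = []
    for chunk in divide_chunks(urls, 20):  # batches of 20, 20 and 10
        results.extend(loop.run_until_complete(asyncio.gather(*(check(u) for u in chunk))))
finally:
    loop.close()

print(len(results))  # 50
```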
cert_host_scraper/scraper.py ADDED
@@ -0,0 +1,87 @@
+ import asyncio
+ import logging
+ from dataclasses import dataclass
+
+ import requests
+ import urllib3
+ from tenacity import (
+     before_log,
+     retry,
+     retry_if_exception_type,
+     stop_after_attempt,
+     wait_exponential,
+ )
+
+ logger = logging.getLogger(__name__)
+ logging.getLogger("urllib3").setLevel(logging.ERROR)
+
+ urllib3.disable_warnings()
+
+
+ @dataclass
+ class Options:
+     timeout: int
+     clean: bool
+
+
+ @dataclass
+ class UrlResult:
+     url: str
+     status_code: int
+
+
+ @dataclass
+ class Result:
+     scraped: list[UrlResult]
+
+     def filter_by_status_code(self, status_code: int) -> list[UrlResult]:
+         return [result for result in self.scraped if result.status_code == status_code]
+
+
+ def fetch_site_information(url: str, timeout: int) -> int:
+     try:
+         return requests.get(url, timeout=timeout).status_code
+     except Exception as e:
+         logger.debug(e)
+         return -1
+
+
+ async def async_fetch_site_information(url: str, timeout: int) -> int:
+     return await asyncio.to_thread(fetch_site_information, url, timeout)
+
+
+ @retry(
+     retry=retry_if_exception_type(requests.HTTPError),
+     stop=stop_after_attempt(3),
+     wait=wait_exponential(multiplier=1, min=1, max=10),
+     reraise=True,
+     before=before_log(logger, logging.DEBUG),
+ )
+ def fetch_site(search: str) -> list[dict]:
+     url = f"https://crt.sh/?q={search}&output=json"
+     result = requests.get(url)
+     result.raise_for_status()
+
+     return result.json()
+
+
+ def scrape_urls(results: list[dict], options: Options) -> list[str]:
+     total_urls = []
+     for result in results:
+         common_name = result["common_name"]
+
+         if options.clean and "*" in common_name:
+             continue
+
+         total_urls.append(f"https://{common_name}")
+
+     return list(set(total_urls))
+
+
+ def fetch_urls(site: str, options: Options) -> list[str]:
+     results = fetch_site(site)
+     return scrape_urls(results, options)
+
+
+ async def validate_url(url: str, options: Options) -> UrlResult:
+     return UrlResult(url, await async_fetch_site_information(url, options.timeout))
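`scraper.py` now queries crt.sh's JSON endpoint (`&output=json`) instead of parsing the HTML results table, retrying up to three times on `HTTPError` with exponential backoff. `scrape_urls` then keys on each entry's `common_name`, skipping wildcards when clean is enabled and de-duplicating through a `set`. A small worked example, assuming cert-host-scraper 0.10.x is installed; the sample entries are hand-written, and a real crt.sh response carries more fields (`issuer_name`, `not_before`, ...) that this code ignores:

```python
from cert_host_scraper.scraper import Options, scrape_urls

# Hand-written stand-in for a crt.sh JSON response.
sample = [
    {"common_name": "example.com"},
    {"common_name": "*.example.com"},  # wildcard: dropped when clean=True
    {"common_name": "example.com"},    # duplicate: collapsed by set()
]

print(scrape_urls(sample, Options(timeout=2, clean=True)))
# ['https://example.com']
```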
cert_host_scraper/utils.py ADDED
@@ -0,0 +1,13 @@
+ import re
+ from collections.abc import Iterable
+
+
+ def strip_url(url: str) -> str:
+     url = re.sub(r"https?://", "", url)
+     url = re.sub(r"www.", "", url)
+     return re.sub(r"/.*", "", url)
+
+
+ def divide_chunks(objects: list, size: int) -> Iterable[list[str]]:
+     for i in range(0, len(objects), size):
+         yield objects[i : i + size]
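One caveat in `strip_url`: the `r"www."` pattern is neither anchored nor escaped, so the `.` matches any character and the substitution can fire mid-hostname (e.g. `wwwest.example.com` collapses to `st.example.com`). A sketch of a stricter variant; this is a suggested fix, not what 0.10.3 ships:

```python
import re


def strip_url_strict(url: str) -> str:
    # Anchor both prefixes and escape the dot so only a literal
    # leading "www." is removed.
    url = re.sub(r"^https?://", "", url)
    url = re.sub(r"^www\.", "", url)
    return re.sub(r"/.*", "", url)


assert strip_url_strict("https://www.example.com/path") == "example.com"
assert strip_url_strict("wwwest.example.com") == "wwwest.example.com"
```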
cert_host_scraper-0.10.3.dist-info/METADATA ADDED
@@ -0,0 +1,68 @@
+ Metadata-Version: 2.4
+ Name: cert-host-scraper
+ Version: 0.10.3
+ Summary: Query the certificate transparency log from crt.sh by a given keyword and return the status code of each matched result, optionally filtering the results by status code.
+ Project-URL: homepage, https://github.com/inverse/cert-host-scraper
+ Project-URL: repository, https://github.com/inverse/cert-host-scraper
+ Author-email: Malachi Soord <inverse.chi@gmail.com>
+ License-Expression: MIT
+ License-File: LICENSE
+ Requires-Python: <4,>=3.10
+ Requires-Dist: click<9,>=8.1.8
+ Requires-Dist: requests<3,>=2.27.1
+ Requires-Dist: rich<15,>=11
+ Requires-Dist: single-source<0.5,>=0.4.0
+ Requires-Dist: tenacity<10,>=9.0.0
+ Description-Content-Type: text/markdown
+
+ # Cert Host Scraper
+
+ [![CI](https://github.com/inverse/cert-host-scraper/actions/workflows/ci.yml/badge.svg)](https://github.com/inverse/cert-host-scraper/actions/workflows/ci.yml)
+ [![PyPI version](https://badge.fury.io/py/cert-host-scraper.svg)](https://badge.fury.io/py/cert-host-scraper)
+ ![PyPI downloads](https://img.shields.io/pypi/dm/cert-host-scraper?label=pypi%20downloads)
+ [![codecov](https://codecov.io/github/inverse/cert-host-scraper/graph/badge.svg?token=TLO58M5UC5)](https://codecov.io/github/inverse/cert-host-scraper)
+ [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
+ ![Static Badge](https://img.shields.io/badge/type%20checked-mypy-039dfc)
+ [![License](https://img.shields.io/github/license/inverse/cert-host-scraper.svg)](LICENSE)
+
+ Query the certificate transparency log from [crt.sh](https://crt.sh) by a given keyword and return the status code of each matched result, optionally filtering the results by status code.
+
+ <img alt="Demo of cert-host-scraper" src="https://vhs.charm.sh/vhs-7fKWanXXcalG2oS28DVyZC.gif" width="800" />
+
+ ## Usage
+
+ ```bash
+ cert-host-scraper search your-domain.com [--status-code 200]
+ ```
+
+ ## Installation
+
+ With pipx:
+
+ ```bash
+ pipx install cert-host-scraper
+ ```
+
+ With pip:
+
+ ```bash
+ pip install cert-host-scraper
+ ```
+
+ ## Development
+
+ Requires [uv][0] and Python 3.10+.
+
+ ```bash
+ uv sync
+ uv run python -m cert_host_scraper.cli
+ ```
+
+ All dev tooling is managed by [mise][1], as defined in the provided `mise.toml` and `.python-version` files.
+
+ ## License
+
+ MIT
+
+ [0]: https://github.com/astral-sh/uv
+ [1]: https://github.com/jdx/mise
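The README shows only the table mode; per cli.py above, `--output json` suppresses the progress and "Searching for" output and emits a plain JSON document instead. A sketch of the shape, with illustrative values:

```python
import json

# Shape emitted by `cert-host-scraper search example.com --output json`,
# reconstructed from cli.py above; the URLs and codes are illustrative.
# A status_code of -1 marks a request that failed or timed out.
json_output = [
    {"url": "https://example.com", "status_code": 200},
    {"url": "https://mail.example.com", "status_code": -1},
]
print(json.dumps(json_output, indent=2))
```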
cert_host_scraper-0.10.3.dist-info/RECORD ADDED
@@ -0,0 +1,9 @@
+ cert_host_scraper/__init__.py,sha256=BUkbAFUgGLjxexWjDqSMZA-YAY55fxoD-lym6D6yDHw,142
+ cert_host_scraper/cli.py,sha256=Hxu5XzC7O5eWph6Wrer5LcZEUkkxEeyCfIMyAoeuDWE,4685
+ cert_host_scraper/scraper.py,sha256=NPOGRk15HRDChhBS7ubJKbVD4z4sb7yJ-0MBorskL40,2037
+ cert_host_scraper/utils.py,sha256=p_t8LIrehMMLKNEiSDYGi4TehH9z0neDz3EdoOAeEeQ,340
+ cert_host_scraper-0.10.3.dist-info/METADATA,sha256=LpsmRHAB79H9uDq6V9Dt6rYWeh4EOosGYVDebRI0_pg,2398
+ cert_host_scraper-0.10.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ cert_host_scraper-0.10.3.dist-info/entry_points.txt,sha256=i8Io-t4dA1B2V2IGhht4QiaT_IavR-tGSZE_vmwYhvo,64
+ cert_host_scraper-0.10.3.dist-info/licenses/LICENSE,sha256=x2zGqcA4IWCXX5UKMH144zM_rK2NMXSXHN5Qn8cg6yY,1070
+ cert_host_scraper-0.10.3.dist-info/RECORD,,
cert_host_scraper-0.10.3.dist-info/WHEEL CHANGED
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry 1.0.8
+ Generator: hatchling 1.27.0
  Root-Is-Purelib: true
  Tag: py3-none-any
cert_host_scraper-0.10.3.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
+ [console_scripts]
+ cert-host-scraper = cert_host_scraper.cli:cli
cert_host_scraper-0.3.0.dist-info/METADATA DELETED
@@ -1,54 +0,0 @@
- Metadata-Version: 2.1
- Name: cert-host-scraper
- Version: 0.3.0
- Summary:
- Home-page: https://github.com/inverse/cert-host-scraper
- License: MIT
- Author: Malachi Soord
- Author-email: inverse.chi@gmail.com
- Requires-Python: >=3.10,<4.0
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Programming Language :: Python :: 3
- Classifier: Programming Language :: Python :: 3.10
- Requires-Dist: aiohttp[speedups] (>=3.8.1,<4.0.0)
- Requires-Dist: beautifulsoup4 (>=4.10.0,<5.0.0)
- Requires-Dist: click (>=8.0.3,<9.0.0)
- Requires-Dist: requests (>=2.27.1,<3.0.0)
- Requires-Dist: rich (>=11.0.0,<12.0.0)
- Project-URL: Repository, https://github.com/inverse/cert-host-scraper
- Description-Content-Type: text/markdown
-
- # Cert Host Scraper
-
- ![CI](https://github.com/inverse/cert-host-scraper/workflows/CI/badge.svg)
- [![PyPI version](https://badge.fury.io/py/cert-host-scraper.svg)](https://badge.fury.io/py/cert-host-scraper)
- ![PyPI downloads](https://img.shields.io/pypi/dm/cert-host-scraper?label=pypi%20downloads)
- [![License](https://img.shields.io/github/license/inverse/cert-host-scraper.svg)](LICENSE)
- [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
-
- Query the certificate transparency log for a keyword and check the status code of the results. Optionally filtering out based on the code.
-
- ## Usage
-
- ```bash
- cert-host-scraper search your-domain.com [--status-code 200]
- ```
-
- ## Installation
-
- With pipx:
-
- ```bash
- pipx install cert-host-scraper
- ```
-
- With pip:
-
- ```bash
- pip install cert-host-scraper
- ```
-
- ## Licence
-
- MIT
-
cert_host_scraper-0.3.0.dist-info/RECORD DELETED
@@ -1,7 +0,0 @@
- cert_host_scraper/__init__.py,sha256=rxKmbcZRR3VtiwjJuOb-gziYvmlQJvHWjJFvew8ysHo,1768
- cert_host_scraper/cli.py,sha256=V3ek-4_z8UJNAElm1GpO3fhRKjNL09-FQaqMSOcEOGo,1805
- cert_host_scraper-0.3.0.dist-info/entry_points.txt,sha256=GlQNSNbnSjw_MDZrOzhqcATBJ7C4otv7Adrz2yaeK0w,63
- cert_host_scraper-0.3.0.dist-info/LICENSE,sha256=x2zGqcA4IWCXX5UKMH144zM_rK2NMXSXHN5Qn8cg6yY,1070
- cert_host_scraper-0.3.0.dist-info/WHEEL,sha256=DA86_h4QwwzGeRoz62o1svYt5kGEXpoUTuTtwzoTb30,83
- cert_host_scraper-0.3.0.dist-info/METADATA,sha256=An6mxNNP2ozqz-2TuaQ3CNz1GYV6glhEJ4asOp9E1vc,1575
- cert_host_scraper-0.3.0.dist-info/RECORD,,
cert_host_scraper-0.3.0.dist-info/entry_points.txt DELETED
@@ -1,3 +0,0 @@
- [console_scripts]
- cert-host-scraper=cert_host_scraper.cli:cli
-