staticweb 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
staticweb/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# staticweb package
|
staticweb/cli.py
ADDED
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import requests
|
|
3
|
+
from bs4 import BeautifulSoup
|
|
4
|
+
import sys
|
|
5
|
+
import json
|
|
6
|
+
import time
|
|
7
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
8
|
+
|
|
9
|
+
from rich.progress import Progress, BarColumn, TimeRemainingColumn, SpinnerColumn
|
|
10
|
+
from rich.console import Console
|
|
11
|
+
from rich.table import Table
|
|
12
|
+
from rich.panel import Panel
|
|
13
|
+
from rich.syntax import Syntax
|
|
14
|
+
from rich.markdown import Markdown
|
|
15
|
+
|
|
16
|
+
console = Console()
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def fetch_soup(url, insecure=False):
    """Download *url* and parse it into a BeautifulSoup document.

    On any failure (network error, timeout, non-2xx status) returns an
    "[ERROR] ..." string instead of raising — callers distinguish the two
    outcomes with isinstance(result, str).

    insecure=True disables TLS certificate verification.
    """
    try:
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            verify=not insecure,
            timeout=10,
        )
        response.raise_for_status()
        return BeautifulSoup(response.text, "html.parser")
    except Exception as exc:
        return f"[ERROR] {url} -> {exc}"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def extract_links(soup):
    """Return every anchor in the document as a {"text", "href"} dict."""
    links = []
    for anchor in soup.find_all("a"):
        links.append({"text": anchor.get_text(strip=True), "href": anchor.get("href")})
    return links
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def extract_paragraphs(soup):
    """Return the whitespace-stripped text of every <p> element."""
    texts = []
    for paragraph in soup.find_all("p"):
        texts.append(paragraph.get_text(strip=True))
    return texts
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def extract_all_text(soup):
    """Return the page's full visible text, one text node per line."""
    full_text = soup.get_text(separator="\n", strip=True)
    return full_text
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def extract_tag(soup, tag):
    """Return the whitespace-stripped text of every element matching *tag*."""
    collected = []
    for element in soup.find_all(tag):
        collected.append(element.get_text(strip=True))
    return collected
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def process_url(url, args):
    """Fetch *url* and run the extraction selected by the CLI flags.

    Returns the extracted data on success, or an "[ERROR] ..." string
    when the fetch failed or no extraction flag was supplied.
    """
    soup = fetch_soup(url, args.insecure)

    # fetch_soup signals failure by returning an error string.
    if isinstance(soup, str):
        return soup

    # Flag priority mirrors the original elif chain: link > para > all > tag.
    if args.link:
        return extract_links(soup)
    if args.para:
        return extract_paragraphs(soup)
    if args.all:
        return extract_all_text(soup)
    if args.tag:
        return extract_tag(soup, args.tag)
    return "[ERROR] No valid option"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def show_help():
    """Render the CLI usage guide to the terminal as rich Markdown.

    Used instead of argparse's built-in help (the parser is created with
    add_help=False so --help routes here).
    """
    # NOTE: this text is rendered at runtime; keep it in sync with the
    # flags registered in main().
    help_text = """
# ⚡ StaticWeb CLI

A fast, multi-threaded static web scraper.

## Usage
python staticweb.py <urls> [options]

## Options
- --link Extract all links
- --para Extract paragraphs
- --all Extract full text
- --tag <tag> Extract specific HTML tag

## Output
- --json Output in JSON format
- --save <file> Save output to file

## Performance
- --threads N Number of threads (default: 1)
- --insecure Disable SSL verification

## Dev Tools
- --codeofit Show source code of this tool

## Example
python staticweb.py https://example.com --link --threads 5
"""
    console.print(Markdown(help_text))
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def show_code():
    """Pretty-print this module's own source with syntax highlighting."""
    try:
        with open(__file__, "r", encoding="utf-8") as source_file:
            source_text = source_file.read()
        console.print(
            Syntax(source_text, "python", theme="monokai", line_numbers=True)
        )
    except Exception as exc:
        console.print(f"[red][ERROR] Cannot read source: {exc}[/red]")
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def output_result(results, args):
    """Render per-URL scrape results to the console; optionally save as JSON.

    *results* maps url -> extracted data: a list (from --link/--para/--tag),
    a plain string (full page text from --all), or an "[ERROR] ..." string.
    """
    console.print("\n[bold cyan]=== RESULTS ===[/bold cyan]\n")

    for url, result in results.items():
        console.print(Panel(f"[bold yellow]{url}[/bold yellow]", expand=False))

        # BUG FIX: --all legitimately returns a plain string, which the old
        # `isinstance(result, str)` check conflated with error strings —
        # full-page text was printed in red and skipped. Error strings in
        # this module always carry the "[ERROR]" sentinel prefix.
        if isinstance(result, str) and result.startswith("[ERROR]"):
            console.print(f"[red]{result}[/red]")
            continue

        if args.json:
            console.print_json(json.dumps(result))
            continue

        if isinstance(result, str):
            # Full-page text (--all): print verbatim, not character-by-character.
            console.print(result)
        elif args.link:
            table = Table(title="Links", show_lines=True)
            table.add_column("Text", style="cyan", overflow="fold")
            table.add_column("Href", style="green", overflow="fold")

            for item in result:
                # href may be None for anchors without one; stringify for rich.
                table.add_row(item["text"], str(item["href"]))

            console.print(table)
        else:
            for i, item in enumerate(result, 1):
                console.print(f"[green]{i}.[/green] {item}")

    if args.save:
        try:
            with open(args.save, "w", encoding="utf-8") as f:
                json.dump(results, f, indent=2)
            console.print(f"\n[bold green][+] Saved to {args.save}[/bold green]")
        except Exception as e:
            console.print(f"[red][ERROR] Save failed: {e}[/red]")
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def main():
    """CLI entry point: parse arguments, scrape every URL concurrently,
    then hand the collected results to output_result()."""
    # add_help=False so --help is handled by our Markdown help screen.
    parser = argparse.ArgumentParser(add_help=False)

    parser.add_argument("urls", nargs="*", help="One or more URLs")

    # Extraction modes (priority resolved in process_url).
    parser.add_argument("--link", action="store_true")
    parser.add_argument("--para", action="store_true")
    parser.add_argument("--all", action="store_true")
    parser.add_argument("--tag", type=str)

    # Output options.
    parser.add_argument("--json", action="store_true")
    parser.add_argument("--save", type=str)

    # Performance / transport.
    parser.add_argument("--threads", type=int, default=1)
    parser.add_argument("--insecure", action="store_true")

    # Dev / help flags.
    parser.add_argument("--help", action="store_true")
    parser.add_argument("--codeofit", action="store_true")

    args = parser.parse_args()

    if args.help:
        show_help()
        sys.exit(0)

    if args.codeofit:
        show_code()
        sys.exit(0)

    if not args.urls:
        console.print("[red]No URL provided. Use --help[/red]")
        sys.exit(1)

    console.print(Panel("[bold magenta]⚡ StaticWeb Scraper Starting...[/bold magenta]"))

    results = {}

    progress_columns = (
        SpinnerColumn(style="bold magenta"),
        "[progress.description]{task.description}",
        BarColumn(bar_width=40),
        "[progress.percentage]{task.percentage:>3.0f}%",
        TimeRemainingColumn(),
    )

    with Progress(*progress_columns, console=console) as progress:
        task = progress.add_task("[cyan]Scraping URLs...", total=len(args.urls))

        with ThreadPoolExecutor(max_workers=args.threads) as executor:
            pending = {
                executor.submit(process_url, url, args): url for url in args.urls
            }

            for future in as_completed(pending):
                results[pending[future]] = future.result()
                progress.update(task, advance=1)
                # Brief pause so the progress animation stays visible.
                time.sleep(0.02)

    output_result(results, args)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def run():
    # Console-script entry point registered in the wheel's entry_points.txt;
    # simply delegates to main().
    main()
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
staticweb/__init__.py,sha256=KvrRCfAMKwZz7BDtebzzc-_1VyM-inYE-OFc7DIX0WU,19
|
|
2
|
+
staticweb/cli.py,sha256=r888efjVNEmZnESPgJkbgPZOuJgETqRwbgncU8FsGl0,5637
|
|
3
|
+
staticweb-0.1.0.dist-info/METADATA,sha256=TXuUVzylEcxzxehn6AmmCR3ZUg737N74k0Wz9V8jBc8,187
|
|
4
|
+
staticweb-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
5
|
+
staticweb-0.1.0.dist-info/entry_points.txt,sha256=Lzz08NMLX3-TrUwgLhEaHDG02sRRZXHgrOcc-WXtvIM,41
|
|
6
|
+
staticweb-0.1.0.dist-info/top_level.txt,sha256=W0nceJfb8idUeZDMitfOrJe1aAppQA-_vIIEAq5DVdg,10
|
|
7
|
+
staticweb-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
staticweb
|