sophhub 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/sophhub.js +21 -0
- package/package.json +32 -0
- package/skills/VERSIONS.md +27 -0
- package/skills/builtin/clawhub/SKILL.md +77 -0
- package/skills/builtin/flight-booking/SKILL.md +288 -0
- package/skills/builtin/flight-booking/scripts/flight_booking.py +1232 -0
- package/skills/builtin/inventory-management/SKILL.md +241 -0
- package/skills/builtin/inventory-management/scripts/inventory.py +1844 -0
- package/skills/builtin/schedule-reminder/SKILL.md +619 -0
- package/skills/builtin/schedule-reminder/schedule_template.md +68 -0
- package/skills/builtin/schedule-reminder/scripts/append_event.py +204 -0
- package/skills/builtin/schedule-reminder/scripts/create_reminders.sh +163 -0
- package/skills/builtin/schedule-reminder/scripts/daily_activate.sh +175 -0
- package/skills/builtin/schedule-reminder/scripts/parse_schedule.py +704 -0
- package/skills/builtin/schedule-reminder/scripts/setup.sh +242 -0
- package/skills/builtin/schedule-reminder//347/224/250/346/210/267/346/214/207/345/215/227.md +311 -0
- package/skills/builtin/skill-creator/SKILL.md +370 -0
- package/skills/builtin/skill-creator/license.txt +202 -0
- package/skills/builtin/skill-creator/scripts/init_skill.py +378 -0
- package/skills/builtin/skill-creator/scripts/package_skill.py +111 -0
- package/skills/builtin/skill-creator/scripts/quick_validate.py +101 -0
- package/skills/builtin/sophnet-customer-management/SKILL.md +271 -0
- package/skills/builtin/sophnet-customer-management/pyproject.toml +15 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_cli/__init__.py +2 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_cli/__main__.py +5 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_cli/cli.py +67 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_cli/commands/__init__.py +2 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_cli/commands/customer.py +60 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_cli/commands/export_file.py +18 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_cli/commands/import_file.py +15 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_cli/commands/reminder.py +26 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_cli/commands/schema.py +28 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_cli/config.py +54 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_core/__init__.py +2 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_core/exporter.py +85 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_core/models.py +84 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_core/normalizer.py +144 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_core/parser.py +241 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_core/query.py +109 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_core/reminder.py +121 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_core/repository.py +397 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_core/schema.py +106 -0
- package/skills/builtin/sophnet-customer-management/src/customer_mgmt_core/service.py +565 -0
- package/skills/builtin/sophnet-customer-management/uv.lock +48 -0
- package/skills/builtin/sophnet-customized-marketing/SKILL.md +144 -0
- package/skills/builtin/sophnet-customized-marketing/playbooks/campaign-planning.md +187 -0
- package/skills/builtin/sophnet-customized-marketing/playbooks/content-generation.md +124 -0
- package/skills/builtin/sophnet-customized-marketing/playbooks/marketing-calendar.md +59 -0
- package/skills/builtin/sophnet-customized-marketing/playbooks/multi-channel-bundle.md +94 -0
- package/skills/builtin/sophnet-customized-marketing/playbooks/poster-generation.md +182 -0
- package/skills/builtin/sophnet-customized-marketing/playbooks/style-profile-workflow.md +103 -0
- package/skills/builtin/sophnet-customized-marketing/pyproject.toml +9 -0
- package/skills/builtin/sophnet-customized-marketing/references/campaign-mechanics.md +168 -0
- package/skills/builtin/sophnet-customized-marketing/references/content-safety.md +26 -0
- package/skills/builtin/sophnet-customized-marketing/references/marketing-date-checklist.md +99 -0
- package/skills/builtin/sophnet-customized-marketing/references/platform-writing-guidelines.md +88 -0
- package/skills/builtin/sophnet-customized-marketing/references/quality-checklist.md +44 -0
- package/skills/builtin/sophnet-customized-marketing/scripts/generate_poster.py +585 -0
- package/skills/builtin/sophnet-customized-marketing/scripts/style_profile.py +215 -0
- package/skills/builtin/sophnet-face-search/SKILL.md +115 -0
- package/skills/builtin/sophnet-face-search/pyproject.toml +11 -0
- package/skills/builtin/sophnet-face-search/scripts/face_search.py +336 -0
- package/skills/builtin/sophnet-face-search/uv.lock +508 -0
- package/skills/builtin/sophnet-image-edit/SKILL.md +140 -0
- package/skills/builtin/sophnet-image-edit/pyproject.toml +9 -0
- package/skills/builtin/sophnet-image-edit/scripts/edit_and_preview.sh +68 -0
- package/skills/builtin/sophnet-image-edit/scripts/edit_image.py +279 -0
- package/skills/builtin/sophnet-image-edit/uv.lock +234 -0
- package/skills/builtin/sophnet-image-generate/SKILL.md +62 -0
- package/skills/builtin/sophnet-image-generate/pyproject.toml +9 -0
- package/skills/builtin/sophnet-image-generate/scripts/generate_image.py +156 -0
- package/skills/builtin/sophnet-image-generate/uv.lock +234 -0
- package/skills/builtin/sophnet-image-ocr/SKILL.md +167 -0
- package/skills/builtin/sophnet-image-ocr/pyproject.toml +13 -0
- package/skills/builtin/sophnet-image-ocr/scripts/ocr.py +226 -0
- package/skills/builtin/sophnet-image-ocr/uv.lock +234 -0
- package/skills/builtin/sophnet-infinite-talk/SKILL.md +140 -0
- package/skills/builtin/sophnet-infinite-talk/pyproject.toml +9 -0
- package/skills/builtin/sophnet-infinite-talk/scripts/gen.py +172 -0
- package/skills/builtin/sophnet-oss/SKILL.md +109 -0
- package/skills/builtin/sophnet-oss/pyproject.toml +8 -0
- package/skills/builtin/sophnet-oss/scripts/upload_file.py +43 -0
- package/skills/builtin/sophnet-qa-install/SKILL.md +210 -0
- package/skills/builtin/sophnet-qa-install/pyproject.toml +6 -0
- package/skills/builtin/sophnet-qa-install/scripts/backup_md.py +35 -0
- package/skills/builtin/sophnet-qa-install/scripts/check_installed.py +143 -0
- package/skills/builtin/sophnet-qa-install/scripts/update_config.py +142 -0
- package/skills/builtin/sophnet-qa-install/scripts/update_md.py +73 -0
- package/skills/builtin/sophnet-training-install/SKILL.md +211 -0
- package/skills/builtin/sophnet-training-install/pyproject.toml +6 -0
- package/skills/builtin/sophnet-training-install/scripts/backup_md.py +35 -0
- package/skills/builtin/sophnet-training-install/scripts/check_installed.py +144 -0
- package/skills/builtin/sophnet-training-install/scripts/update_config.py +142 -0
- package/skills/builtin/sophnet-training-install/scripts/update_md.py +73 -0
- package/skills/builtin/sophnet-tts/SKILL.md +79 -0
- package/skills/builtin/sophnet-tts/pyproject.toml +9 -0
- package/skills/builtin/sophnet-tts/scripts/gen_tts.py +130 -0
- package/skills/builtin/sophnet-video-generate/SKILL.md +116 -0
- package/skills/builtin/sophnet-video-generate/scripts/gen_video.py +304 -0
- package/skills/builtin/video-understand/SKILL.md +79 -0
- package/skills/builtin/video-understand/scripts/video_understand.py +204 -0
- package/skills/builtin/weather/SKILL.md +112 -0
- package/skills/builtin/web-scraper/SKILL.md +101 -0
- package/skills/builtin/web-scraper/scripts/scrape.py +270 -0
- package/skills/builtin/website-builder/SKILL.md +266 -0
- package/skills/builtin/website-builder/scripts/deploy_site.sh +46 -0
- package/skills/store/didi-ride/SKILL.md +309 -0
- package/skills/store/didi-ride/_meta.json +6 -0
- package/skills/store/didi-ride/assets/PREFERENCE.md +58 -0
- package/skills/store/didi-ride/package.json +15 -0
- package/skills/store/didi-ride/references/api_references.md +171 -0
- package/skills/store/didi-ride/references/error_handling.md +68 -0
- package/skills/store/didi-ride/references/setup.md +73 -0
- package/skills/store/didi-ride/references/workflow.md +150 -0
- package/skills/store/flyai/SKILL.md +119 -0
- package/skills/store/flyai/references/fliggy-fast-search.md +53 -0
- package/skills/store/flyai/references/search-flight.md +89 -0
- package/skills/store/flyai/references/search-hotels.md +57 -0
- package/skills/store/flyai/references/search-poi.md +49 -0
- package/src/commands/download.js +103 -0
- package/src/commands/list.js +67 -0
- package/src/utils/config.js +24 -0
- package/src/utils/gitlab.js +67 -0
- package/src/utils/paths.js +19 -0
- package/src/utils/versions.js +38 -0
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: web-scraper
|
|
3
|
+
description: Scrape web pages on the server side. Supports static HTTP scraping (curl) and dynamic JavaScript rendering (Chrome print-to-pdf + PyMuPDF). Zero extra pip dependencies. Use when web_fetch returns empty or minimal content.
|
|
4
|
+
metadata: { "openclaw": { "emoji": "🕸️", "requires": { "bins": ["python3", "curl"] } } }
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# Web Scraper
|
|
8
|
+
|
|
9
|
+
Server-side web page scraping with two modes. **No extra pip packages needed** -- uses only curl, Chrome, and PyMuPDF (already in the Docker image).
|
|
10
|
+
|
|
11
|
+
## When to Use
|
|
12
|
+
|
|
13
|
+
- `web_fetch` returned empty or very short content (common with SPA/JS-heavy sites)
|
|
14
|
+
- The user explicitly asks to scrape or crawl a URL
|
|
15
|
+
|
|
16
|
+
## Quick Start
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
# Static scrape (fast, curl + text extraction, works for most pages)
|
|
20
|
+
python3 {baseDir}/scripts/scrape.py "https://example.com"
|
|
21
|
+
|
|
22
|
+
# Dynamic scrape (Chrome renders JS, saves PDF, PyMuPDF extracts text)
|
|
23
|
+
python3 {baseDir}/scripts/scrape.py "https://www.bitmain.com/" --mode dynamic
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Options
|
|
27
|
+
|
|
28
|
+
| Option | Default | Description |
|
|
29
|
+
| ------------------------ | -------- | ---------------------------------------------------------------------------------- |
|
|
30
|
+
| `--mode static\|dynamic` | `static` | `static`: curl fetch + regex text extract. `dynamic`: Chrome PDF render + PyMuPDF. |
|
|
31
|
+
| `--timeout <seconds>` | `15` | Request/render timeout |
|
|
32
|
+
| `--max-chars <n>` | `50000` | Truncate output beyond this length |
|
|
33
|
+
|
|
34
|
+
## Decision Logic
|
|
35
|
+
|
|
36
|
+
1. **Try static mode first** (fast, < 2 seconds).
|
|
37
|
+
2. If the result `text` is very short (< 200 characters) or empty, the page likely needs JavaScript.
|
|
38
|
+
3. **Retry with `--mode dynamic`** to render JavaScript via headless Chrome.
|
|
39
|
+
4. If dynamic mode fails (no Chrome), report the error to the user.
|
|
40
|
+
|
|
41
|
+
## Output Format
|
|
42
|
+
|
|
43
|
+
JSON to stdout:
|
|
44
|
+
|
|
45
|
+
```json
|
|
46
|
+
{
|
|
47
|
+
"status": "ok",
|
|
48
|
+
"mode": "dynamic",
|
|
49
|
+
"url": "https://www.bitmain.com/",
|
|
50
|
+
"title": "BITMAIN",
|
|
51
|
+
"text": "BITMAIN\nProducts\nBitcoin Miner S23 Hyd...",
|
|
52
|
+
"length": 1779
|
|
53
|
+
}
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Modes Explained
|
|
57
|
+
|
|
58
|
+
### Static Mode (`--mode static`)
|
|
59
|
+
|
|
60
|
+
- Uses `curl` to fetch raw HTML
|
|
61
|
+
- Regex-based tag stripping for text extraction
|
|
62
|
+
- Fast (< 2 seconds), no Python dependencies needed
|
|
63
|
+
- Does NOT execute JavaScript
|
|
64
|
+
- Good for: documentation, blogs, news, static content
|
|
65
|
+
|
|
66
|
+
### Dynamic Mode (`--mode dynamic`)
|
|
67
|
+
|
|
68
|
+
- Chrome `--headless=new --print-to-pdf` renders the page including all JavaScript
|
|
69
|
+
- PyMuPDF (`fitz`) extracts text from the resulting PDF
|
|
70
|
+
- Slower (15-40 seconds) but captures dynamically generated content
|
|
71
|
+
- Requires `google-chrome` or `chromium` on PATH
|
|
72
|
+
- Good for: SPA (React/Vue/Angular), JS-rendered dashboards
|
|
73
|
+
|
|
74
|
+
## Examples
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
# Basic static scrape
|
|
78
|
+
python3 {baseDir}/scripts/scrape.py "https://docs.python.org/3/tutorial/index.html"
|
|
79
|
+
|
|
80
|
+
# SPA page (needs JavaScript rendering)
|
|
81
|
+
python3 {baseDir}/scripts/scrape.py "https://www.bitmain.com/" --mode dynamic
|
|
82
|
+
|
|
83
|
+
# Longer timeout for slow pages
|
|
84
|
+
python3 {baseDir}/scripts/scrape.py "https://slow-site.example.com" --mode dynamic --timeout 30
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## Dependencies
|
|
88
|
+
|
|
89
|
+
| Tool | Required For | Already Installed |
|
|
90
|
+
| ---------------------------- | ------------------------ | ---------------------- |
|
|
91
|
+
| `python3` | Script runtime | Yes (Docker image) |
|
|
92
|
+
| `curl` | Static mode | Yes (Docker image) |
|
|
93
|
+
| `google-chrome` / `chromium` | Dynamic mode only | Depends on image |
|
|
94
|
+
| `PyMuPDF` (fitz) | Dynamic mode PDF reading | Yes (requirements.txt) |
|
|
95
|
+
| `pdfminer` | Fallback PDF reading | Yes (requirements.txt) |
|
|
96
|
+
|
|
97
|
+
## Security
|
|
98
|
+
|
|
99
|
+
- Only `http://` and `https://` URLs are allowed
|
|
100
|
+
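- Requests to `localhost`, private IPs, and reserved addresses are blocked (SSRF protection). Only the host written in the URL is checked; DNS resolution results and redirect targets are not re-validated.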
|
|
101
|
+
- Output is truncated at `--max-chars` to keep the JSON result (and the agent's context) bounded; the full page is still fetched before truncation
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Web scraper with two modes: static (curl + regex text extraction) and dynamic (headless Chrome print-to-pdf + PyMuPDF text extraction).
|
|
4
|
+
No requests/beautifulsoup4 needed -- only uses packages already in the Docker image.
|
|
5
|
+
Outputs JSON to stdout for agent consumption.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import argparse
|
|
9
|
+
import ipaddress
|
|
10
|
+
import json
|
|
11
|
+
import os
|
|
12
|
+
import shutil
|
|
13
|
+
import subprocess
|
|
14
|
+
import sys
|
|
15
|
+
import tempfile
|
|
16
|
+
from typing import Dict, Optional
|
|
17
|
+
from urllib.parse import urlparse
|
|
18
|
+
|
|
19
|
+
# ---------------------------------------------------------------------------
|
|
20
|
+
# SSRF guard
|
|
21
|
+
# ---------------------------------------------------------------------------
|
|
22
|
+
|
|
23
|
+
# Hostnames that are always refused. Entries are compared against the
# lower-cased host with any trailing dot removed. "::1" is listed without
# brackets because urlparse().hostname strips them; the bracketed spelling is
# kept for backward compatibility with code that inspects this set.
BLOCKED_HOSTS = {"localhost", "127.0.0.1", "0.0.0.0", "::1", "[::1]", "metadata.google.internal"}


def is_private_ip(hostname):
    # type: (str) -> bool
    """Return True when *hostname* is an IP literal that must not be fetched.

    Recognises standard IPv4/IPv6 literals and, as a fallback, the legacy
    numeric IPv4 spellings accepted by inet_aton (for example "2130706433",
    "127.1" or "0x7f.0.0.1"), which would otherwise slip past the check.
    Anything that is not an IP literal (an ordinary DNS name) returns False;
    names are NOT resolved here.
    """
    import socket

    # Drop an IPv6 zone id ("fe80::1%eth0"); it does not change the address class.
    candidate = hostname.split("%", 1)[0]
    try:
        addr = ipaddress.ip_address(candidate)
    except ValueError:
        try:
            # inet_aton understands decimal / hex / octal / shortened IPv4 forms.
            addr = ipaddress.ip_address(socket.inet_aton(candidate))
        except (OSError, ValueError, TypeError):
            return False
    return bool(
        addr.is_private
        or addr.is_loopback
        or addr.is_link_local
        or addr.is_reserved
        or addr.is_multicast
        or addr.is_unspecified
    )


def validate_url(url):
    # type: (str) -> Optional[str]
    """Check that *url* is an http(s) URL pointing at a non-internal host.

    Returns None when the URL is acceptable, otherwise a human-readable
    error string. Only the host written in the URL is inspected; DNS
    resolution and redirect targets are outside the scope of this check.
    """
    try:
        parsed = urlparse(url)
        hostname = parsed.hostname or ""
    except ValueError as exc:
        # urlparse raises on malformed authorities such as "http://[::1".
        return "Invalid URL: %s" % exc
    if parsed.scheme not in ("http", "https"):
        return "Unsupported scheme: %s. Only http/https allowed." % parsed.scheme
    # "localhost." is the same host as "localhost"; normalise before matching.
    hostname = hostname.rstrip(".")
    if not hostname:
        return "Missing host in URL"
    if hostname in BLOCKED_HOSTS or hostname.endswith(".localhost"):
        return "Blocked host: %s" % hostname
    if is_private_ip(hostname):
        return "Private/reserved IP blocked: %s" % hostname
    return None
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# ---------------------------------------------------------------------------
|
|
49
|
+
# Chrome binary detection
|
|
50
|
+
# ---------------------------------------------------------------------------
|
|
51
|
+
|
|
52
|
+
# Executable names probed on PATH, in order of preference.
CHROME_CANDIDATES = ["google-chrome", "google-chrome-stable", "chromium", "chromium-browser"]


def find_chrome():
    # type: () -> Optional[str]
    """Locate a Chrome/Chromium executable.

    Returns the path reported by shutil.which for the first name in
    CHROME_CANDIDATES that is found, or None when none of them is on PATH.
    """
    located = (shutil.which(candidate) for candidate in CHROME_CANDIDATES)
    return next((hit for hit in located if hit), None)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# ---------------------------------------------------------------------------
|
|
65
|
+
# Static scrape via curl (no extra deps)
|
|
66
|
+
# ---------------------------------------------------------------------------
|
|
67
|
+
|
|
68
|
+
def scrape_static(url, timeout):
    # type: (str, int) -> Dict
    """Fetch *url* with curl and reduce the HTML to plain text.

    Args:
        url: http(s) URL to fetch (expected to be validated by the caller).
        timeout: curl --max-time in seconds; the subprocess itself is given
            five extra seconds before it is abandoned.

    Returns:
        A dict with "status" ("ok" or "error"), "mode" ("static") and "url".
        On success it also carries "title", "text" and "length"; on failure
        it carries "error".
    """
    curl = shutil.which("curl")
    if not curl:
        return {"status": "error", "mode": "static", "url": url,
                "error": "curl not found on PATH"}

    try:
        result = subprocess.run(
            # -sS: no progress meter, but still print the failure reason so the
            # error message below is not empty.
            [curl, "-sSL", "--max-time", str(timeout),
             # Redirects are followed (-L); keep them on http/https only.
             "--proto", "=http,https", "--proto-redir", "=http,https",
             "-H", "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/137.0",
             "-H", "Accept: text/html,*/*;q=0.8",
             url],
            capture_output=True,
            # Decode explicitly and tolerantly: pages in other encodings must
            # not abort the scrape with a UnicodeDecodeError.
            encoding="utf-8", errors="replace",
            timeout=timeout + 5,
        )
    except subprocess.TimeoutExpired:
        return {"status": "error", "mode": "static", "url": url,
                "error": "curl timed out after %ds" % timeout}

    if result.returncode != 0:
        return {"status": "error", "mode": "static", "url": url,
                "error": "curl failed (code %d): %s" % (result.returncode, result.stderr[:300])}

    html = result.stdout
    text = _extract_text_from_html(html)
    title = _extract_title_from_html(html)

    return {
        "status": "ok",
        "mode": "static",
        "url": url,
        "title": title,
        "text": text,
        "length": len(text),
    }
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _extract_title_from_html(html):
|
|
107
|
+
# type: (str) -> str
|
|
108
|
+
"""Extract <title> content without bs4."""
|
|
109
|
+
import re
|
|
110
|
+
m = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
|
|
111
|
+
return m.group(1).strip() if m else ""
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _extract_text_from_html(html):
|
|
115
|
+
# type: (str) -> str
|
|
116
|
+
"""Rough HTML-to-text without bs4: strip tags, collapse whitespace."""
|
|
117
|
+
import re
|
|
118
|
+
text = re.sub(r"<script[^>]*>.*?</script>", "", html, flags=re.DOTALL | re.IGNORECASE)
|
|
119
|
+
text = re.sub(r"<style[^>]*>.*?</style>", "", text, flags=re.DOTALL | re.IGNORECASE)
|
|
120
|
+
text = re.sub(r"<[^>]+>", "\n", text)
|
|
121
|
+
text = re.sub(r" ", " ", text)
|
|
122
|
+
text = re.sub(r"&", "&", text)
|
|
123
|
+
text = re.sub(r"<", "<", text)
|
|
124
|
+
text = re.sub(r">", ">", text)
|
|
125
|
+
text = re.sub(r"&#\d+;", "", text)
|
|
126
|
+
text = re.sub(r"&\w+;", "", text)
|
|
127
|
+
lines = [line.strip() for line in text.split("\n") if line.strip()]
|
|
128
|
+
return "\n".join(lines)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# ---------------------------------------------------------------------------
|
|
132
|
+
# Dynamic scrape: Chrome --print-to-pdf + PyMuPDF
|
|
133
|
+
# ---------------------------------------------------------------------------
|
|
134
|
+
|
|
135
|
+
def scrape_dynamic(url, timeout):
    # type: (str, int) -> Dict
    """Render *url* in headless Chrome, print it to PDF and extract the text.

    Args:
        url: http(s) URL to render (expected to be validated by the caller).
        timeout: seconds used for Chrome's virtual time budget (minimum 5 s);
            the Chrome process itself is allowed ``timeout + 30`` seconds.

    Returns:
        A dict with "status" ("ok" or "error"), "mode" ("dynamic") and "url".
        On success it also carries "title", "text" and "length"; on failure
        it carries "error".
    """
    chrome = find_chrome()
    if not chrome:
        return {
            "status": "error", "mode": "dynamic", "url": url,
            "error": "No Chrome/Chromium found. Searched: %s. Use --mode static instead."
                     % ", ".join(CHROME_CANDIDATES),
        }

    tmpdir = tempfile.mkdtemp(prefix="web-scraper-")
    pdf_path = os.path.join(tmpdir, "page.pdf")
    # Wall-clock limit for the Chrome process; larger than the JS budget so
    # start-up and PDF writing are not counted against the page.
    hard_limit = timeout + 30

    try:
        js_budget_ms = max(5000, timeout * 1000)
        cmd = [
            chrome,
            "--headless=new",
            "--print-to-pdf=" + pdf_path,
            # NOTE(review): --no-sandbox disables Chrome's sandbox while
            # rendering untrusted pages; presumably needed to run as root in
            # the container -- confirm and drop it if the runtime allows.
            "--no-sandbox",
            "--disable-gpu",
            "--disable-software-rasterizer",
            "--disable-dev-shm-usage",
            "--no-pdf-header-footer",
            "--virtual-time-budget=%d" % js_budget_ms,
            url,
        ]

        try:
            proc = subprocess.run(
                cmd, capture_output=True,
                encoding="utf-8", errors="replace",
                timeout=hard_limit,
            )
        except subprocess.TimeoutExpired:
            # Report the limit that actually expired, not the JS budget.
            return {"status": "error", "mode": "dynamic", "url": url,
                    "error": "Chrome timed out after %ds" % hard_limit}

        if not os.path.exists(pdf_path) or os.path.getsize(pdf_path) == 0:
            message = "Chrome did not produce a PDF file (exit code %d)" % proc.returncode
            # The tail of stderr usually holds the reason for the failure.
            detail = (proc.stderr or "").strip()[-300:]
            if detail:
                message += ": " + detail
            return {"status": "error", "mode": "dynamic", "url": url,
                    "error": message}

        text = _extract_text_from_pdf(pdf_path)
        title = _guess_title(text)

        return {
            "status": "ok",
            "mode": "dynamic",
            "url": url,
            "title": title,
            "text": text,
            "length": len(text),
        }
    finally:
        # Best-effort cleanup; ignore_errors already suppresses failures.
        shutil.rmtree(tmpdir, ignore_errors=True)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _extract_text_from_pdf(pdf_path):
    # type: (str) -> str
    """Extract text from a PDF using PyMuPDF.

    Falls back to _extract_text_from_pdf_fallback when PyMuPDF (``fitz``)
    cannot be imported. Returns the non-blank lines of all pages, each
    stripped, joined with "\n".
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        return _extract_text_from_pdf_fallback(pdf_path)

    doc = fitz.open(pdf_path)
    try:
        pages = [page.get_text() for page in doc]
    finally:
        # Release the file handle even if a page fails to extract.
        doc.close()
    text = "\n".join(pages).strip()
    lines = [line.strip() for line in text.split("\n") if line.strip()]
    return "\n".join(lines)
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _extract_text_from_pdf_fallback(pdf_path):
    # type: (str) -> str
    """Fallback PDF text extraction via pdfminer.

    Returns the stripped text of the PDF.

    Raises:
        RuntimeError: when pdfminer is not installed either. Raising (rather
            than returning an "[error] ..." string) keeps the message from
            being reported as successfully scraped page text.
    """
    try:
        from pdfminer.high_level import extract_text
    except ImportError:
        raise RuntimeError(
            "Neither PyMuPDF nor pdfminer available. "
            "Install with: pip3 install --break-system-packages PyMuPDF"
        )
    return extract_text(pdf_path).strip()
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def _guess_title(text):
|
|
223
|
+
# type: (str) -> str
|
|
224
|
+
"""Use the first non-empty line as title."""
|
|
225
|
+
for line in text.split("\n"):
|
|
226
|
+
stripped = line.strip()
|
|
227
|
+
if stripped and len(stripped) < 200:
|
|
228
|
+
return stripped
|
|
229
|
+
return ""
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
# ---------------------------------------------------------------------------
|
|
233
|
+
# Main
|
|
234
|
+
# ---------------------------------------------------------------------------
|
|
235
|
+
|
|
236
|
+
def _bounded_int(minimum):
    """Build an argparse ``type`` callable accepting integers >= *minimum*."""
    def parse(raw):
        try:
            value = int(raw)
        except ValueError:
            raise argparse.ArgumentTypeError("expected an integer, got %r" % raw)
        if value < minimum:
            raise argparse.ArgumentTypeError("must be >= %d, got %d" % (minimum, value))
        return value
    return parse


def main():
    """Command-line entry point: scrape one URL and print a JSON result.

    Invalid arguments are rejected by argparse (exit status 2). A URL that
    fails validation prints an error object and exits with status 1. Any
    exception raised while scraping is converted into an error object
    instead of a traceback.
    """
    parser = argparse.ArgumentParser(description="Web scraper for OpenClaw agents")
    parser.add_argument("url", help="URL to scrape (http/https only)")
    parser.add_argument("--mode", choices=["static", "dynamic"], default="static",
                        help="static: curl + text extract; dynamic: Chrome PDF + PyMuPDF (default: static)")
    # A zero or negative timeout would disable curl's limit or expire at once.
    parser.add_argument("--timeout", type=_bounded_int(1), default=15,
                        help="Request timeout in seconds (default: 15)")
    # A negative limit would slice from the end of the text instead of capping it.
    parser.add_argument("--max-chars", type=_bounded_int(0), default=50000,
                        help="Max output characters (default: 50000)")
    args = parser.parse_args()

    err = validate_url(args.url)
    if err:
        print(json.dumps({"status": "error", "url": args.url, "mode": args.mode, "error": err},
                         ensure_ascii=False))
        sys.exit(1)

    try:
        if args.mode == "static":
            result = scrape_static(args.url, args.timeout)
        else:
            result = scrape_dynamic(args.url, args.timeout)
    except Exception as e:
        result = {"status": "error", "url": args.url, "mode": args.mode,
                  "error": "%s: %s" % (type(e).__name__, e)}

    if result.get("status") == "ok" and len(result.get("text", "")) > args.max_chars:
        result["text"] = result["text"][:args.max_chars] + "\n\n... [truncated]"
        result["truncated"] = True
        # Length of the text as emitted, i.e. including the truncation marker.
        result["length"] = len(result["text"])

    print(json.dumps(result, ensure_ascii=False, indent=2))
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: website-builder
|
|
3
|
+
description: >
|
|
4
|
+
一键建站助手:根据用户需求生成完整网站(HTML/CSS/JS),自动部署到 moltbot 内置静态托管,
|
|
5
|
+
立即给出可公开访问的链接(http://<host>:18789/canvas/<site-name>/)。
|
|
6
|
+
支持:上传截图/UI图进行像素级复刻、多轮自然语言修改、响应式设计、多页面网站。
|
|
7
|
+
触发场景:用户说"帮我做一个网站"、"做一个landing page"、"做一个作品集"、"复刻这个界面"、
|
|
8
|
+
"帮我建一个xxx网站"等。
|
|
9
|
+
metadata: { "openclaw": { "emoji": "🌐", "requires": { "bins": ["node"] } } }
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## 加载时说明
|
|
13
|
+
|
|
14
|
+
当用户启用此 skill 时,发送以下欢迎语:
|
|
15
|
+
|
|
16
|
+
> 🌐 **建站助手已就绪!**
|
|
17
|
+
>
|
|
18
|
+
> 我可以帮你:
|
|
19
|
+
>
|
|
20
|
+
> - 用一句话描述需求,自动生成并部署网站
|
|
21
|
+
> - 上传截图 / UI 图,像素级复刻
|
|
22
|
+
> - 多轮对话实时修改,所见即所得
|
|
23
|
+
>
|
|
24
|
+
> 生成后你会收到一个可直接访问的链接,无需额外配置。
|
|
25
|
+
>
|
|
26
|
+
> 你想做什么样的网站?
|
|
27
|
+
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
# 建站助手 (Website Builder)
|
|
31
|
+
|
|
32
|
+
根据需求生成完整网站,部署到 moltbot gateway 的 `/canvas/` 静态路由,立即可访问。
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## 一、访问路径说明
|
|
37
|
+
|
|
38
|
+
网站文件存放在:
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
<workspace>/canvas/<site-name>/index.html
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
访问地址:
|
|
45
|
+
|
|
46
|
+
```
|
|
47
|
+
http://<moltbot-host>:18789/canvas/<site-name>/
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
其中 `<moltbot-host>` 是运行 moltbot 的服务器 IP 或域名(如 `192.168.1.100`)。
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## 二、工作流程
|
|
55
|
+
|
|
56
|
+
### Step 1:理解需求,输出确认卡片
|
|
57
|
+
|
|
58
|
+
收到建站需求后,先展示理解摘要让用户确认,**不要直接开始写代码**:
|
|
59
|
+
|
|
60
|
+
```
|
|
61
|
+
🌐 建站方案确认
|
|
62
|
+
|
|
63
|
+
- 网站类型:个人作品集
|
|
64
|
+
- 风格:极简黑白
|
|
65
|
+
- 页面:首页、作品、关于、联系
|
|
66
|
+
- 技术栈:纯 HTML + CSS + JS(无需构建)
|
|
67
|
+
- 站点名称(URL路径):portfolio
|
|
68
|
+
- 访问地址:http://<your-ip>:18789/canvas/portfolio/
|
|
69
|
+
|
|
70
|
+
确认开始生成?
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Step 2:生成网站文件
|
|
74
|
+
|
|
75
|
+
用户确认后,调用部署脚本:
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
bash /app/skills/website-builder/scripts/deploy_site.sh \
|
|
79
|
+
--name "<site-name>" \
|
|
80
|
+
--workspace "<workspace-dir>"
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
部署流程(脚本只负责第 1、3 步,第 2 步由 Agent 完成):
|
|
84
|
+
|
|
85
|
+
1. 创建 `<workspace>/canvas/<site-name>/` 目录
|
|
86
|
+
2. Agent 将生成的 HTML/CSS/JS 文件写入该目录
|
|
87
|
+
3. 输出站点目录(`SITE_DIR`)和 URL 路径(`SITE_URL_PATH`),供 Agent 拼出访问链接
|
|
88
|
+
|
|
89
|
+
#### 生成代码的要求
|
|
90
|
+
|
|
91
|
+
- **单文件优先**:能写在一个 `index.html` 里就不拆分(内联 CSS + JS),更易修改
|
|
92
|
+
- **多页面**:每个页面一个 HTML 文件(`about.html`、`works.html` 等),共享一个 `style.css`
|
|
93
|
+
- **响应式**:必须适配手机和桌面
|
|
94
|
+
- **无外部依赖**:不引用 CDN,所有样式和脚本内联或本地,确保离线可用
|
|
95
|
+
- **现代设计**:使用 CSS Grid / Flexbox,避免过时的表格布局
|
|
96
|
+
|
|
97
|
+
### Step 3:写入文件
|
|
98
|
+
|
|
99
|
+
使用 exec 工具直接写文件到 canvas 目录:
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
# 写入 index.html
|
|
103
|
+
with open('<workspace>/canvas/<site-name>/index.html', 'w') as f:
|
|
104
|
+
f.write('''...生成的HTML内容...''')
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### Step 4:输出访问链接
|
|
108
|
+
|
|
109
|
+
```
|
|
110
|
+
✅ 网站已部署!
|
|
111
|
+
|
|
112
|
+
🔗 访问链接:http://<your-ip>:18789/canvas/<site-name>/
|
|
113
|
+
|
|
114
|
+
📁 文件位置:<workspace>/canvas/<site-name>/
|
|
115
|
+
├── index.html
|
|
116
|
+
├── about.html(如有)
|
|
117
|
+
└── style.css(如有)
|
|
118
|
+
|
|
119
|
+
需要修改?直接告诉我(如"把导航栏改成深色"、"加一个联系表单")。
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## 三、获取 moltbot 访问地址
|
|
125
|
+
|
|
126
|
+
Agent 需要告知用户正确的访问地址。通过以下方式获取 host:
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
# 获取容器内可用的网络地址
|
|
130
|
+
hostname -I 2>/dev/null | awk '{print $1}'
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
或直接提示用户:
|
|
134
|
+
|
|
135
|
+
> 请将 `<your-ip>` 替换为你访问 moltbot 的 IP 地址(就是你打开 moltbot 界面时用的那个 IP)。
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## 四、部署脚本
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
bash /app/skills/website-builder/scripts/deploy_site.sh --name <site-name> --workspace <workspace-dir>
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
脚本仅负责创建目录并输出 `SITE_DIR` / `SITE_URL_PATH`,网站文件由 Agent 用 exec 写入。
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## 五、多轮修改流程
|
|
150
|
+
|
|
151
|
+
用户发出修改指令后:
|
|
152
|
+
|
|
153
|
+
1. 读取现有文件:`cat <workspace>/canvas/<site-name>/index.html`
|
|
154
|
+
2. 按需修改代码
|
|
155
|
+
3. 重新写入文件
|
|
156
|
+
4. 告知用户刷新浏览器即可看到效果(支持 live reload 如已开启)
|
|
157
|
+
|
|
158
|
+
**常见修改示例:**
|
|
159
|
+
|
|
160
|
+
| 用户说 | Agent 动作 |
|
|
161
|
+
| ---------------------- | ---------------------------------- |
|
|
162
|
+
| "把导航栏改成深色背景" | 修改 CSS `.navbar` 背景色 |
|
|
163
|
+
| "按钮换成蓝色渐变" | 修改 button 样式 |
|
|
164
|
+
| "加一个联系我表单" | 新增 form 区块 + 样式 |
|
|
165
|
+
| "首页加个轮播图" | 添加 JS 轮播逻辑 |
|
|
166
|
+
| "字体换成更现代的" | 换用更现代的系统字体栈(遵循"无外部依赖",不引入 Google Fonts 等 CDN) |
|
|
167
|
+
| "把这个部分删掉" | 删除对应 HTML 区块 |
|
|
168
|
+
|
|
169
|
+
---
|
|
170
|
+
|
|
171
|
+
## 六、上传截图复刻流程
|
|
172
|
+
|
|
173
|
+
用户上传网页截图或 UI 图时:
|
|
174
|
+
|
|
175
|
+
1. **读取图片描述**:系统会在消息上下文中提供图片的文字描述(`[Image] Description: ...`)
|
|
176
|
+
2. **提取设计元素**:从描述中理解布局、配色、字体、组件结构
|
|
177
|
+
3. **生成复刻代码**:按理解的设计还原 HTML/CSS
|
|
178
|
+
4. **展示确认**:告知用户已识别的设计要素,确认后部署
|
|
179
|
+
|
|
180
|
+
若需要更精确的 OCR 识别,可配合 `sophnet-image-ocr` skill 提取图片中的文字内容。
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## 七、站点命名规则
|
|
185
|
+
|
|
186
|
+
站点名称(`<site-name>`)用于 URL 路径,规则:
|
|
187
|
+
|
|
188
|
+
- 只含小写字母、数字、连字符
|
|
189
|
+
- 不含空格和特殊字符
|
|
190
|
+
- 示例:`portfolio`、`my-blog`、`landing-2026`
|
|
191
|
+
|
|
192
|
+
若用户未指定,Agent 根据需求自动生成一个合适的名称并告知用户。
|
|
193
|
+
|
|
194
|
+
---
|
|
195
|
+
|
|
196
|
+
## 八、管理已部署站点
|
|
197
|
+
|
|
198
|
+
```bash
|
|
199
|
+
# 查看所有已部署站点
|
|
200
|
+
ls <workspace>/canvas/
|
|
201
|
+
|
|
202
|
+
# 删除某个站点
|
|
203
|
+
rm -rf <workspace>/canvas/<site-name>/
|
|
204
|
+
|
|
205
|
+
# 更新站点(直接覆盖文件即可)
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
---
|
|
209
|
+
|
|
210
|
+
## 九、快速建站示例
|
|
211
|
+
|
|
212
|
+
### 示例 A:个人作品集
|
|
213
|
+
|
|
214
|
+
**用户**:帮我做一个极简风格的个人作品集网站,包含首页、作品、关于我三个页面
|
|
215
|
+
|
|
216
|
+
**Bot**:
|
|
217
|
+
|
|
218
|
+
```
|
|
219
|
+
🌐 建站方案确认
|
|
220
|
+
|
|
221
|
+
- 类型:个人作品集
|
|
222
|
+
- 风格:极简(白底、细线、大量留白)
|
|
223
|
+
- 页面:首页(index.html)、作品(works.html)、关于(about.html)
|
|
224
|
+
- 站点名称:portfolio
|
|
225
|
+
- 访问地址:http://<your-ip>:18789/canvas/portfolio/
|
|
226
|
+
|
|
227
|
+
确认开始?
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
### 示例 B:产品 Landing Page
|
|
231
|
+
|
|
232
|
+
**用户**:帮我做一个卖运动鞋的 landing page,苹果官网风格
|
|
233
|
+
|
|
234
|
+
**Bot**:
|
|
235
|
+
|
|
236
|
+
```
|
|
237
|
+
🌐 建站方案确认
|
|
238
|
+
|
|
239
|
+
- 类型:产品 Landing Page
|
|
240
|
+
- 风格:极简(深色背景、全屏大图、苹果式排版)
|
|
241
|
+
- 内容:Hero区、产品特性、图片展示、购买CTA
|
|
242
|
+
- 站点名称:sneaker-landing
|
|
243
|
+
- 访问地址:http://<your-ip>:18789/canvas/sneaker-landing/
|
|
244
|
+
|
|
245
|
+
确认开始?
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
### 示例 C:复刻截图
|
|
249
|
+
|
|
250
|
+
**用户**:(上传网页截图)帮我复刻这个界面
|
|
251
|
+
|
|
252
|
+
**Bot**:
|
|
253
|
+
|
|
254
|
+
```
|
|
255
|
+
📸 识别到以下设计要素:
|
|
256
|
+
|
|
257
|
+
- 布局:顶部导航 + Hero大图 + 三栏特性介绍 + 底部
|
|
258
|
+
- 配色:深蓝主色 (#1a237e),白色文字,橙色强调色
|
|
259
|
+
- 字体:无衬线,标题大号加粗
|
|
260
|
+
- 组件:固定导航栏、全屏背景图、卡片组件
|
|
261
|
+
|
|
262
|
+
站点名称:cloned-site
|
|
263
|
+
访问地址:http://<your-ip>:18789/canvas/cloned-site/
|
|
264
|
+
|
|
265
|
+
开始复刻?
|
|
266
|
+
```
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
#!/usr/bin/env bash
#
# deploy_site.sh - Create canvas directory for a website
#
# Usage:
#   bash deploy_site.sh --name <site-name> [--workspace <workspace-dir>]
#
# When --workspace is omitted (or empty) the workspace defaults to
# ${HOME}/.openclaw/workspace.
#
# Outputs (stdout, one KEY=VALUE per line):
#   SITE_DIR=<absolute path to site directory>
#   SITE_URL_PATH=/canvas/<site-name>/
#
# Exit status:
#   0 on success, non-zero on bad arguments, an unusable site name, or a
#   directory that cannot be created.
#

set -euo pipefail

#######################################
# Print a message to stderr and abort.
# Arguments: $* - message
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Reduce a requested name to lowercase letters, digits and single hyphens.
# Arguments: $1 - raw site name
# Outputs:   sanitized name on stdout (may be empty)
#######################################
sanitize_site_name() {
  # LC_ALL=C pins a-z to ASCII so the result does not depend on the caller's
  # locale; printf (not echo) keeps names such as "-n" from being eaten.
  printf '%s\n' "$1" \
    | LC_ALL=C tr '[:upper:]' '[:lower:]' \
    | LC_ALL=C sed -e 's/[^a-z0-9-]/-/g' -e 's/--*/-/g' -e 's/^-//' -e 's/-$//'
}

#######################################
# Parse arguments, create the site directory and report where it is.
# Arguments: "$@" - command-line options (see Usage above)
# Outputs:   SITE_DIR=... and SITE_URL_PATH=... on stdout
#######################################
main() {
  local site_name="" workspace_dir=""
  local raw_name canvas_root site_dir
  local -r valid_name='^[a-z0-9]+(-[a-z0-9]+)*$'

  while [[ $# -gt 0 ]]; do
    case "$1" in
      --name)
        # Without this check "set -u" aborts with an opaque "unbound variable".
        [[ $# -ge 2 ]] || die "Error: --name requires a value"
        site_name="$2"
        shift 2
        ;;
      --workspace)
        [[ $# -ge 2 ]] || die "Error: --workspace requires a value"
        workspace_dir="$2"
        shift 2
        ;;
      *)
        die "Unknown option: $1"
        ;;
    esac
  done

  [[ -n "$site_name" ]] || die "Error: --name is required"

  # Sanitize site name
  raw_name="$site_name"
  site_name="$(sanitize_site_name "$raw_name")"
  # A name made only of unsupported characters sanitizes to nothing; refusing
  # it avoids "deploying" into the canvas root and printing "/canvas//".
  if [[ ! "$site_name" =~ $valid_name ]]; then
    die "Error: --name '${raw_name}' has no usable characters (allowed: a-z, 0-9, '-')"
  fi

  if [[ -z "$workspace_dir" ]]; then
    # Try to infer from environment
    workspace_dir="${HOME:?HOME is not set; pass --workspace}/.openclaw/workspace"
  fi

  canvas_root="${workspace_dir}/canvas"
  site_dir="${canvas_root}/${site_name}"

  mkdir -p -- "$site_dir"

  # Report an absolute path even when --workspace was given as a relative one.
  site_dir="$(CDPATH='' cd -- "$site_dir" && pwd)"

  printf 'SITE_DIR=%s\n' "$site_dir"
  printf 'SITE_URL_PATH=/canvas/%s/\n' "$site_name"
}

main "$@"
|