smart-web-mcp 0.5.19 → 0.5.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,258 @@
1
+ #!/usr/bin/env python3
2
+ import json
3
+ import re
4
+ import subprocess
5
+ import sys
6
+ import time
7
+ from typing import Any
8
+
9
+
10
def fail(code: str, message: str, category: str = "unavailable"):
    """Print a JSON error envelope on stdout and terminate the script.

    Args:
        code: Machine-readable error code placed in ``error.code``.
        message: Human-readable description placed in ``error.message``.
        category: Coarse error class placed in ``error.category``.

    Raises:
        SystemExit: Always, with exit status 0. The failure is reported
            through the printed JSON (``"ok": false``), not the exit status.
    """
    error_details = {
        "category": category,
        "code": code,
        "message": message,
    }
    envelope = {
        "ok": False,
        "method": "undetected_chromedriver",
        "error": error_details,
    }
    print(json.dumps(envelope))
    sys.exit(0)
25
+
26
+
27
# The request arrives as a single JSON document in argv[1]; a missing
# argument is treated as an empty request.
payload = {}
try:
    payload = json.loads(sys.argv[1]) if len(sys.argv) > 1 else {}
except Exception as exc:  # noqa: BLE001
    fail("uc_invalid_payload", str(exc), "parse_error")

# json.loads accepts any JSON value (list, string, number, null). Everything
# below reads options with payload.get(...), so reject non-objects here with
# the usual JSON error envelope instead of crashing with an AttributeError.
if not isinstance(payload, dict):
    fail(
        "uc_invalid_payload",
        f"payload must be a JSON object, got {type(payload).__name__}",
        "parse_error",
    )
32
+
33
# Bind the name (typed as Any) before attempting the import so it always
# exists at module level. When the import fails, fail() prints the JSON error
# envelope and exits before the placeholder could be used.
uc: Any = None
try:
    import undetected_chromedriver as uc  # type: ignore[import-not-found]
except Exception as import_error:  # noqa: BLE001
    fail("uc_import_failed", str(import_error))
38
+
39
+
40
def run_js(driver, script, *args):
    """Run ``script`` through ``driver.execute_script`` on a best-effort basis.

    Args:
        driver: Object exposing ``execute_script(script, *args)``.
        script: JavaScript source to execute.
        *args: Positional arguments forwarded to the script.

    Returns:
        Whatever ``execute_script`` returns, or ``None`` if it raised any
        ``Exception``. Callers cannot tell a script that returned ``None``
        apart from one that failed.
    """
    try:
        result = driver.execute_script(script, *args)
    except Exception:  # noqa: BLE001
        result = None
    return result
45
+
46
+
47
def _int_option(key: str, default: int) -> int:
    """Read ``payload[key]`` as an int, falling back to ``default`` when falsy.

    A value that cannot be converted (e.g. ``"abc"``, a list, or an infinite
    float) is reported through ``fail()`` as a ``parse_error`` instead of
    escaping as an uncaught exception, so the script still prints JSON.
    """
    raw = payload.get(key) or default
    try:
        return int(raw)
    except (TypeError, ValueError, OverflowError) as exc:
        # fail() raises SystemExit, so nothing is returned on this path.
        fail("uc_invalid_payload", f"{key} must be an integer: {exc}", "parse_error")


# Request options. Falsy or missing values fall back to the defaults below.
url = str(payload.get("url") or "").strip()
timeout_ms = _int_option("timeoutMs", 25000)
# Numeric knobs are clamped to fixed ranges: 0..12, 0..10 and 50..2000.
scroll_steps = max(0, min(12, _int_option("scrollSteps", 0)))
max_load_more_clicks = max(0, min(10, _int_option("maxLoadMoreClicks", 0)))
max_anchors = max(50, min(2000, _int_option("maxAnchors", 400)))
wait_until = str(payload.get("waitUntil") or "networkidle")
user_agent = str(payload.get("userAgent") or "Mozilla/5.0")
chrome_path = str(payload.get("chromePath") or "").strip()
# Anything other than a dict is ignored rather than rejected.
extra_headers = (
    payload.get("extraHeaders") if isinstance(payload.get("extraHeaders"), dict) else {}
)

if not url:
    fail("uc_missing_url", "URL is required", "parse_error")
61
+
62
+
63
def chrome_major_version(binary_path: str):
    """Return the major version reported by a Chrome/Chromium binary.

    Each candidate is run with ``--version`` (5 second timeout) and the first
    four-part version number (``A.B.C.D``) found in its stdout or stderr
    decides the result.

    Args:
        binary_path: Executable to probe. When empty, the names
            ``google-chrome``, ``google-chrome-stable``, ``chromium`` and
            ``chromium-browser`` are tried in that order.

    Returns:
        The leading version component as an ``int``, or ``None`` when no
        candidate could be run or none printed a four-part version.
    """
    candidates = (
        [binary_path]
        if binary_path
        else ["google-chrome", "google-chrome-stable", "chromium", "chromium-browser"]
    )
    for candidate in candidates:
        try:
            completed = subprocess.run(
                [candidate, "--version"],
                capture_output=True,
                text=True,
                timeout=5,
                check=False,
            )
        except Exception:  # noqa: BLE001
            # Missing binary, permission error, timeout, undecodable output:
            # move on to the next candidate.
            continue
        # Search both streams, whichever one the binary printed its banner on.
        banner = f"{completed.stdout} {completed.stderr}"
        match = re.search(r"(\d+)\.\d+\.\d+\.\d+", banner)
        if match:
            return int(match.group(1))
    return None
87
+
88
+
89
# Chrome launch configuration. The flags are applied in the order listed.
options = uc.ChromeOptions()
for _chrome_flag in (
    "--headless=new",
    "--disable-blink-features=AutomationControlled",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--window-size=1440,2400",
    "--lang=ko-KR",
    f"--user-agent={user_agent}",
):
    options.add_argument(_chrome_flag)

# Only override the browser binary when the request named one.
if chrome_path:
    options.binary_location = chrome_path
99
+
100
# Main flow: launch the browser, load the page, optionally click "load more"
# controls and scroll, extract the page data with one JS call, and print one
# JSON document on stdout. Any exception is reported through fail(); the
# driver is always shut down in the finally clause.
driver = None

try:
    # The detected major version (or None) is forwarded to uc.Chrome.
    # NOTE(review): presumably used to select a matching driver build - confirm
    # against the undetected_chromedriver documentation.
    version_main = chrome_major_version(chrome_path)
    driver = uc.Chrome(
        options=options,
        headless=True,
        browser_executable_path=chrome_path or None,
        use_subprocess=True,
        version_main=version_main,
    )
    if extra_headers:
        driver.execute_cdp_cmd("Network.enable", {})
        driver.execute_cdp_cmd(
            "Network.setExtraHTTPHeaders",
            {"headers": {str(key): str(value) for key, value in extra_headers.items()}},
        )
    # Page-load timeout is in whole seconds, never below 5.
    driver.set_page_load_timeout(max(5, timeout_ms // 1000))
    driver.get(url)
    # Fixed settle delay; wait_until only selects between the two durations.
    time.sleep(1.0 if wait_until == "domcontentloaded" else 2.0)

    # Raw strings: the JavaScript contains regex escapes such as \s, which are
    # invalid escape sequences in a normal Python string literal.
    # Clicks the first visible control whose label exactly matches one of the
    # "load more" phrases; anchors that navigate to a real href are skipped.
    load_more_script = r"""
    const pattern = /^(load more|show more|more posts|more results|view more|see more|더보기|더 보기|더 불러오기)$/i;
    const nodes = Array.from(document.querySelectorAll('button, [role="button"], input[type="button"], input[type="submit"], a'));
    for (const node of nodes) {
      const text = (node.innerText || node.getAttribute('aria-label') || node.getAttribute('title') || '').replace(/\s+/g, ' ').trim();
      if (!text || !pattern.test(text)) continue;
      if (node.tagName === 'A') {
        const href = node.getAttribute('href') || '';
        if (href && href !== '#' && !href.toLowerCase().startsWith('javascript:')) continue;
      }
      const rect = node.getBoundingClientRect();
      if (rect.width <= 0 || rect.height <= 0) continue;
      node.click();
      return true;
    }
    return false;
    """

    # Single definition of the page-height expression used for both measuring
    # and scrolling.
    scroll_height_js = (
        "document.body?.scrollHeight || document.documentElement?.scrollHeight || 0"
    )
    measure_height_script = f"return {scroll_height_js}"
    scroll_to_bottom_script = f"window.scrollTo(0, {scroll_height_js})"

    previous_height = run_js(driver, measure_height_script) or 0
    stagnant = 0
    load_more_clicks = 0
    completed_scrolls = 0
    for _ in range(scroll_steps):
        clicked = False
        if load_more_clicks < max_load_more_clicks:
            clicked = bool(run_js(driver, load_more_script))
            if clicked:
                load_more_clicks += 1
                time.sleep(0.7)
        run_js(driver, scroll_to_bottom_script)
        time.sleep(0.8)
        completed_scrolls += 1
        current_height = run_js(driver, measure_height_script) or previous_height
        # A step is stagnant when nothing was clicked and the page did not grow.
        if not clicked and current_height <= previous_height:
            stagnant += 1
        else:
            stagnant = 0
        previous_height = current_height
        # Stop after two consecutive stagnant steps.
        if stagnant >= 2:
            break

    # Collects up to maxItems anchors with surrounding context, picks the
    # longest-text node among the content selectors (falling back to <body>),
    # strips chrome/navigation elements from a clone of it, and returns the
    # page title, full HTML/text, cleaned content and headings.
    extract_script = r"""
    const maxItems = arguments[0];
    const selectors = ['main','article','[role="main"]','.theme-doc-markdown','.theme-doc-markdown.markdown','.markdown','.md-content','.VPDoc','.vp-doc','.prose','.documentation','.docs-content','.content'];
    const nodes = Array.from(document.querySelectorAll('a[href]')).slice(0, maxItems).map((element) => {
      const closest = element.closest('article, li, tr, section, main, .topic-list-item, .topic-body, .topic-post, .athing, .titleline, [role="article"], .feed-item, .timeline-item, .post, .storylink');
      const text = (element.innerText || element.getAttribute('aria-label') || element.getAttribute('title') || '').replace(/\s+/g, ' ').trim();
      const contextSource = (closest && closest.innerText) || element.innerText || element.textContent || '';
      return {
        href: element.href || '',
        text,
        context: contextSource.replace(/\s+/g, ' ').trim().slice(0, 500),
        rel: element.rel || '',
        class_name: typeof element.className === 'string' ? element.className : '',
        aria_label: element.getAttribute('aria-label') || '',
      };
    });
    const candidates = selectors.flatMap((selector) => Array.from(document.querySelectorAll(selector)));
    const unique = Array.from(new Set(candidates));
    const best = unique.map((node) => ({ node, score: (node.innerText || '').trim().length })).sort((a, b) => b.score - a.score)[0]?.node || document.body;
    const clone = best instanceof HTMLElement ? best.cloneNode(true) : document.body.cloneNode(true);
    if (clone instanceof HTMLElement) {
      clone.querySelectorAll('script, style, noscript, svg, canvas, iframe, form, button, input, textarea, select, nav, header, footer, aside').forEach((node) => node.remove());
      clone.querySelectorAll('.sidebar, .toc, .table-of-contents, .breadcrumb, .breadcrumbs, .edit-this-page, .theme-edit-this-page, .theme-doc-toc-desktop, .theme-doc-toc-mobile, .pagination-nav').forEach((node) => node.remove());
    }
    const headings = clone instanceof HTMLElement ? Array.from(clone.querySelectorAll('h1, h2, h3, h4, h5, h6')).map((node) => ({ level: Number(node.tagName.slice(1)), text: (node.textContent || '').replace(/\s+/g, ' ').trim(), id: node.id || '' })).filter((item) => item.text) : [];
    return {
      title: document.title || '',
      html: document.documentElement ? document.documentElement.outerHTML : '',
      text: document.body ? document.body.innerText || '' : '',
      final_url: window.location.href,
      anchors: nodes,
      content_html: clone instanceof HTMLElement ? clone.innerHTML : '',
      content_text: clone instanceof HTMLElement ? (clone.innerText || '').replace(/\s+/g, ' ').trim() : '',
      headings,
    };
    """

    # run_js returns None when the script raised. Normalise once so every
    # lookup below can rely on a dict, rather than guarding some lookups with
    # isinstance and others with `or {}`.
    extracted = run_js(driver, extract_script, max_anchors)
    if not isinstance(extracted, dict):
        extracted = {}
    anchors = extracted.get("anchors")
    if not isinstance(anchors, list):
        anchors = []

    # De-duplicated hrefs in first-seen order.
    links = []
    seen = set()
    for anchor in anchors:
        if not isinstance(anchor, dict):
            continue
        href = str(anchor.get("href") or "").strip()
        if href and href not in seen:
            seen.add(href)
            links.append(href)

    print(
        json.dumps(
            {
                "ok": True,
                "method": "undetected_chromedriver",
                "url": url,
                "final_url": str(
                    extracted.get("final_url") or driver.current_url or url
                ),
                "title": str(extracted.get("title") or ""),
                "html": str(extracted.get("html") or ""),
                "text": str(extracted.get("text") or ""),
                "content_html": str(extracted.get("content_html") or ""),
                "content_text": str(extracted.get("content_text") or ""),
                "headings": extracted.get("headings") or [],
                "links": links,
                "anchors": anchors,
                "scroll_steps": completed_scrolls,
                "load_more_clicks": load_more_clicks,
            }
        )
    )
except Exception as exc:  # noqa: BLE001
    message = str(exc)
    # Errors whose text mentions "timeout" are categorised as timeouts.
    fail(
        "uc_fetch_failed",
        message,
        "timeout" if "timeout" in message.lower() else "unavailable",
    )
finally:
    # Runs on success and on the SystemExit raised by fail() alike.
    if driver is not None:
        try:
            driver.quit()
        except Exception:  # noqa: BLE001
            # Best-effort shutdown: the result (or error) was already printed.
            pass