smart-web-mcp 0.5.20 → 0.5.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +8 -0
- package/README.md +7 -0
- package/dist/browser-session.d.ts +4 -2
- package/dist/browser-session.js +125 -15
- package/dist/browser-session.js.map +1 -1
- package/dist/linkedin.d.ts +1 -0
- package/dist/linkedin.js +7 -0
- package/dist/linkedin.js.map +1 -1
- package/dist/settings.d.ts +2 -0
- package/dist/settings.js +5 -1
- package/dist/settings.js.map +1 -1
- package/dist/smartcrawl.d.ts +11 -2
- package/dist/smartcrawl.js +93 -4
- package/dist/smartcrawl.js.map +1 -1
- package/dist/smartfetch/providers/linkedin.js +39 -4
- package/dist/smartfetch/providers/linkedin.js.map +1 -1
- package/dist/smartfetch/providers/x.js +44 -17
- package/dist/smartfetch/providers/x.js.map +1 -1
- package/dist/smartfetch.js +6 -1
- package/dist/smartfetch.js.map +1 -1
- package/package.json +3 -1
- package/python/requirements-undetected-chromedriver.txt +3 -0
- package/scripts/undetected_chromedriver_fetch.py +258 -0
|
@@ -0,0 +1,258 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import json
|
|
3
|
+
import re
|
|
4
|
+
import subprocess
|
|
5
|
+
import sys
|
|
6
|
+
import time
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def fail(code: str, message: str, category: str = "unavailable"):
    """Print a JSON failure report on stdout and terminate the script.

    The printed object has ``ok`` set to ``False``, ``method`` set to
    ``"undetected_chromedriver"`` and an ``error`` object carrying
    ``category``, ``code`` and ``message``.

    Args:
        code: Error identifier placed in ``error.code``.
        message: Description placed in ``error.message``.
        category: Value placed in ``error.category``.

    Raises:
        SystemExit: Always, with exit status 0.
    """
    error = {"category": category, "code": code, "message": message}
    report = {"ok": False, "method": "undetected_chromedriver", "error": error}
    print(json.dumps(report))
    raise SystemExit(0)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# Read the request from the first command-line argument as JSON; when no
# argument is given an empty request is used.
payload = {}
try:
    payload = json.loads(sys.argv[1]) if len(sys.argv) > 1 else {}
except Exception as exc:  # noqa: BLE001
    fail("uc_invalid_payload", str(exc), "parse_error")

# Valid JSON that is not an object (list, string, number, null) has no
# ``.get``; reject it here so the caller receives the JSON error report
# instead of an uncaught AttributeError when the options are read.
if not isinstance(payload, dict):
    fail("uc_invalid_payload", "Payload must be a JSON object", "parse_error")

# Import the browser driver lazily so that a missing or broken installation
# is reported through the same JSON error channel.
uc: Any = None
try:
    import undetected_chromedriver as uc  # type: ignore[import-not-found]
except Exception as exc:  # noqa: BLE001
    fail("uc_import_failed", str(exc))
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def run_js(driver, script, *args):
    """Run ``script`` through ``driver.execute_script`` on a best-effort basis.

    Args:
        driver: Object exposing ``execute_script(script, *args)``.
        script: JavaScript source handed to the driver.
        *args: Extra arguments forwarded unchanged to ``execute_script``.

    Returns:
        Whatever ``execute_script`` returns, or ``None`` if it raised any
        ``Exception``.
    """
    outcome = None
    try:
        outcome = driver.execute_script(script, *args)
    except Exception:  # noqa: BLE001
        # Deliberately swallowed: callers treat None as "no result".
        pass
    return outcome
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Request options. Missing or falsy values fall back to the defaults shown.
url = str(payload.get("url") or "").strip()
try:
    timeout_ms = int(payload.get("timeoutMs") or 25000)
    # Numeric limits are clamped: scroll steps to 0..12, load-more clicks to
    # 0..10 and collected anchors to 50..2000.
    scroll_steps = max(0, min(12, int(payload.get("scrollSteps") or 0)))
    max_load_more_clicks = max(0, min(10, int(payload.get("maxLoadMoreClicks") or 0)))
    max_anchors = max(50, min(2000, int(payload.get("maxAnchors") or 400)))
except (TypeError, ValueError, OverflowError) as exc:
    # A non-numeric option would otherwise abort with a traceback and no JSON
    # report on stdout; surface it as a parse error instead.
    fail("uc_invalid_payload", f"Invalid numeric option: {exc}", "parse_error")
wait_until = str(payload.get("waitUntil") or "networkidle")
user_agent = str(payload.get("userAgent") or "Mozilla/5.0")
chrome_path = str(payload.get("chromePath") or "").strip()
# Anything other than a JSON object for extraHeaders is ignored.
extra_headers = (
    payload.get("extraHeaders") if isinstance(payload.get("extraHeaders"), dict) else {}
)

if not url:
    fail("uc_missing_url", "URL is required", "parse_error")
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def chrome_major_version(binary_path: str):
    """Return the major version reported by a Chrome/Chromium executable.

    When ``binary_path`` is non-empty only that executable is queried;
    otherwise the command names ``google-chrome``, ``google-chrome-stable``,
    ``chromium`` and ``chromium-browser`` are tried in that order. Each
    candidate is run with ``--version`` (5 second timeout) and its combined
    stdout/stderr is searched for a four-part dotted version number.

    Args:
        binary_path: Path of the browser executable, or an empty string.

    Returns:
        The first component of the first matching version as an ``int``, or
        ``None`` when no candidate produced a matching version.
    """
    if binary_path:
        candidates = [binary_path]
    else:
        candidates = ["google-chrome", "google-chrome-stable", "chromium", "chromium-browser"]

    for candidate in candidates:
        if not candidate:
            continue
        try:
            result = subprocess.run(
                [candidate, "--version"],
                capture_output=True,
                text=True,
                timeout=5,
                check=False,
            )
        except Exception:
            # Missing binary, timeout, etc.: move on to the next candidate.
            continue
        found = re.search(r"(\d+)\.\d+\.\d+\.\d+", f"{result.stdout} {result.stderr}")
        if found:
            return int(found.group(1))
    return None
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
# Assemble the Chrome launch options; the flags are added in this order.
options = uc.ChromeOptions()
for flag in (
    "--headless=new",
    "--disable-blink-features=AutomationControlled",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "--window-size=1440,2400",
    "--lang=ko-KR",
    f"--user-agent={user_agent}",
):
    options.add_argument(flag)
# Only override the browser binary when the request supplied a path.
if chrome_path:
    options.binary_location = chrome_path
|
|
99
|
+
|
|
100
|
+
driver = None

# JavaScript snippets are raw strings: they contain the regex escape ``\s``,
# which is an invalid escape sequence in a normal Python string literal.

# Reads the current document height; used before and during scrolling.
scroll_height_script = (
    r"return document.body?.scrollHeight || document.documentElement?.scrollHeight || 0"
)

# Clicks the first visible "load more"-style control and returns true, or
# returns false when none is found. Anchors with a real href are skipped.
load_more_script = r"""
const pattern = /^(load more|show more|more posts|more results|view more|see more|더보기|더 보기|더 불러오기)$/i;
const nodes = Array.from(document.querySelectorAll('button, [role="button"], input[type="button"], input[type="submit"], a'));
for (const node of nodes) {
  const text = (node.innerText || node.getAttribute('aria-label') || node.getAttribute('title') || '').replace(/\s+/g, ' ').trim();
  if (!text || !pattern.test(text)) continue;
  if (node.tagName === 'A') {
    const href = node.getAttribute('href') || '';
    if (href && href !== '#' && !href.toLowerCase().startsWith('javascript:')) continue;
  }
  const rect = node.getBoundingClientRect();
  if (rect.width <= 0 || rect.height <= 0) continue;
  node.click();
  return true;
}
return false;
"""

# Collects up to arguments[0] anchors plus page title, full HTML/text, the
# largest matching content container (with chrome-like elements removed) and
# its headings.
extract_script = r"""
const maxItems = arguments[0];
const selectors = ['main','article','[role="main"]','.theme-doc-markdown','.theme-doc-markdown.markdown','.markdown','.md-content','.VPDoc','.vp-doc','.prose','.documentation','.docs-content','.content'];
const nodes = Array.from(document.querySelectorAll('a[href]')).slice(0, maxItems).map((element) => {
  const closest = element.closest('article, li, tr, section, main, .topic-list-item, .topic-body, .topic-post, .athing, .titleline, [role="article"], .feed-item, .timeline-item, .post, .storylink');
  const text = (element.innerText || element.getAttribute('aria-label') || element.getAttribute('title') || '').replace(/\s+/g, ' ').trim();
  const contextSource = (closest && closest.innerText) || element.innerText || element.textContent || '';
  return {
    href: element.href || '',
    text,
    context: contextSource.replace(/\s+/g, ' ').trim().slice(0, 500),
    rel: element.rel || '',
    class_name: typeof element.className === 'string' ? element.className : '',
    aria_label: element.getAttribute('aria-label') || '',
  };
});
const candidates = selectors.flatMap((selector) => Array.from(document.querySelectorAll(selector)));
const unique = Array.from(new Set(candidates));
const best = unique.map((node) => ({ node, score: (node.innerText || '').trim().length })).sort((a, b) => b.score - a.score)[0]?.node || document.body;
const clone = best instanceof HTMLElement ? best.cloneNode(true) : document.body.cloneNode(true);
if (clone instanceof HTMLElement) {
  clone.querySelectorAll('script, style, noscript, svg, canvas, iframe, form, button, input, textarea, select, nav, header, footer, aside').forEach((node) => node.remove());
  clone.querySelectorAll('.sidebar, .toc, .table-of-contents, .breadcrumb, .breadcrumbs, .edit-this-page, .theme-edit-this-page, .theme-doc-toc-desktop, .theme-doc-toc-mobile, .pagination-nav').forEach((node) => node.remove());
}
const headings = clone instanceof HTMLElement ? Array.from(clone.querySelectorAll('h1, h2, h3, h4, h5, h6')).map((node) => ({ level: Number(node.tagName.slice(1)), text: (node.textContent || '').replace(/\s+/g, ' ').trim(), id: node.id || '' })).filter((item) => item.text) : [];
return {
  title: document.title || '',
  html: document.documentElement ? document.documentElement.outerHTML : '',
  text: document.body ? document.body.innerText || '' : '',
  final_url: window.location.href,
  anchors: nodes,
  content_html: clone instanceof HTMLElement ? clone.innerHTML : '',
  content_text: clone instanceof HTMLElement ? (clone.innerText || '').replace(/\s+/g, ' ').trim() : '',
  headings,
};
"""

try:
    # The detected major version (or None) is handed to uc.Chrome as
    # version_main. NOTE(review): presumably this selects a matching driver
    # build; confirm against the undetected_chromedriver documentation.
    version_main = chrome_major_version(chrome_path)
    driver = uc.Chrome(
        options=options,
        headless=True,
        browser_executable_path=chrome_path or None,
        use_subprocess=True,
        version_main=version_main,
    )
    if extra_headers:
        driver.execute_cdp_cmd("Network.enable", {})
        driver.execute_cdp_cmd(
            "Network.setExtraHTTPHeaders",
            {"headers": {str(key): str(value) for key, value in extra_headers.items()}},
        )
    # Page-load timeout is in whole seconds and never below 5.
    driver.set_page_load_timeout(max(5, timeout_ms // 1000))
    driver.get(url)
    # Fixed settle delay after navigation; shorter for "domcontentloaded".
    time.sleep(1.0 if wait_until == "domcontentloaded" else 2.0)

    previous_height = run_js(driver, scroll_height_script) or 0
    stagnant = 0
    load_more_clicks = 0
    completed_scrolls = 0
    for _ in range(scroll_steps):
        clicked = False
        if load_more_clicks < max_load_more_clicks:
            clicked = bool(run_js(driver, load_more_script))
            if clicked:
                load_more_clicks += 1
                time.sleep(0.7)
        run_js(
            driver,
            "window.scrollTo(0, document.body?.scrollHeight || document.documentElement?.scrollHeight || 0)",
        )
        time.sleep(0.8)
        completed_scrolls += 1
        current_height = run_js(driver, scroll_height_script) or previous_height
        # A step is "stagnant" when nothing was clicked and the page did not
        # grow; two stagnant steps in a row end the scrolling early.
        if not clicked and current_height <= previous_height:
            stagnant += 1
        else:
            stagnant = 0
        previous_height = current_height
        if stagnant >= 2:
            break

    # Normalise the script result once so every lookup below is safe even if
    # the script failed (None) or returned something other than an object.
    extracted = run_js(driver, extract_script, max_anchors)
    if not isinstance(extracted, dict):
        extracted = {}
    anchors = extracted.get("anchors")
    if not isinstance(anchors, list):
        anchors = []

    # Unique, non-empty hrefs in first-seen order.
    links = []
    seen = set()
    for anchor in anchors:
        if not isinstance(anchor, dict):
            continue
        href = str(anchor.get("href") or "").strip()
        if href and href not in seen:
            seen.add(href)
            links.append(href)

    print(
        json.dumps(
            {
                "ok": True,
                "method": "undetected_chromedriver",
                "url": url,
                "final_url": str(extracted.get("final_url") or driver.current_url or url),
                "title": str(extracted.get("title") or ""),
                "html": str(extracted.get("html") or ""),
                "text": str(extracted.get("text") or ""),
                "content_html": str(extracted.get("content_html") or ""),
                "content_text": str(extracted.get("content_text") or ""),
                "headings": extracted.get("headings") or [],
                "links": links,
                "anchors": anchors,
                "scroll_steps": completed_scrolls,
                "load_more_clicks": load_more_clicks,
            }
        )
    )
except Exception as exc:  # noqa: BLE001
    message = str(exc)
    # Any failure is reported as JSON; it is classified as a timeout when the
    # exception text mentions one.
    fail(
        "uc_fetch_failed",
        message,
        "timeout" if "timeout" in message.lower() else "unavailable",
    )
finally:
    if driver is not None:
        try:
            driver.quit()
        except Exception:
            # Best-effort cleanup: a failing quit must not mask the result
            # or error already reported above.
            pass
|