0agent 1.0.9 → 1.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/daemon.mjs +61 -0
- package/package.json +1 -1
package/dist/daemon.mjs
CHANGED
|
@@ -1736,6 +1736,20 @@ var AGENT_TOOLS = [
|
|
|
1736
1736
|
path: { type: "string", description: 'Directory path relative to working directory (default: ".")' }
|
|
1737
1737
|
}
|
|
1738
1738
|
}
|
|
1739
|
+
},
|
|
1740
|
+
{
|
|
1741
|
+
name: "scrape_url",
|
|
1742
|
+
description: "Scrape a URL and return clean structured content. Handles JavaScript-rendered pages, auto-adapts to page structure, returns text/links/metadata. Better than shell curl for web pages.",
|
|
1743
|
+
input_schema: {
|
|
1744
|
+
type: "object",
|
|
1745
|
+
properties: {
|
|
1746
|
+
url: { type: "string", description: "URL to scrape" },
|
|
1747
|
+
mode: { type: "string", description: 'What to extract: "text" (default), "links", "tables", "full", "markdown"' },
|
|
1748
|
+
selector: { type: "string", description: "Optional CSS selector to target specific element" },
|
|
1749
|
+
wait_ms: { type: "number", description: "Wait N ms after page load (for JS-heavy pages, default 0)" }
|
|
1750
|
+
},
|
|
1751
|
+
required: ["url"]
|
|
1752
|
+
}
|
|
1739
1753
|
}
|
|
1740
1754
|
];
|
|
1741
1755
|
var LLMExecutor = class {
|
|
@@ -2135,6 +2149,13 @@ var AgentExecutor = class {
|
|
|
2135
2149
|
return this.readFile(String(input.path ?? ""));
|
|
2136
2150
|
case "list_dir":
|
|
2137
2151
|
return this.listDir(input.path ? String(input.path) : void 0);
|
|
2152
|
+
case "scrape_url":
|
|
2153
|
+
return this.scrapeUrl(
|
|
2154
|
+
String(input.url ?? ""),
|
|
2155
|
+
String(input.mode ?? "text"),
|
|
2156
|
+
input.selector ? String(input.selector) : void 0,
|
|
2157
|
+
Number(input.wait_ms ?? 0)
|
|
2158
|
+
);
|
|
2138
2159
|
default:
|
|
2139
2160
|
return `Unknown tool: ${name}`;
|
|
2140
2161
|
}
|
|
@@ -2174,6 +2195,45 @@ var AgentExecutor = class {
|
|
|
2174
2195
|
return content.length > 8e3 ? content.slice(0, 8e3) + `
|
|
2175
2196
|
\u2026[truncated, ${content.length} total bytes]` : content;
|
|
2176
2197
|
}
|
|
2198
|
+
async scrapeUrl(url, mode, selector, waitMs) {
|
|
2199
|
+
if (!url.startsWith("http")) return "Error: URL must start with http:// or https://";
|
|
2200
|
+
const selectorLine = selector ? `element = page.find('${selector}')
|
|
2201
|
+
content = element.text if element else page.get_all_text()` : `content = page.get_all_text()`;
|
|
2202
|
+
const modeLine = mode === "links" ? `result = [a.attrib.get('href','') for a in page.find_all('a') if a.attrib.get('href','').startswith('http')]` : mode === "tables" ? `result = [str(t) for t in page.find_all('table')]` : mode === "markdown" ? `result = page.get_all_text()` : `result = page.get_all_text()`;
|
|
2203
|
+
const script = [
|
|
2204
|
+
`import sys`,
|
|
2205
|
+
`try:`,
|
|
2206
|
+
` from scrapling import Fetcher`,
|
|
2207
|
+
`except ImportError:`,
|
|
2208
|
+
` import subprocess, sys`,
|
|
2209
|
+
` subprocess.run([sys.executable, '-m', 'pip', 'install', 'scrapling', '-q'], check=True)`,
|
|
2210
|
+
` from scrapling import Fetcher`,
|
|
2211
|
+
`try:`,
|
|
2212
|
+
` fetcher = Fetcher(auto_match=False)`,
|
|
2213
|
+
` page = fetcher.get('${url}', timeout=20)`,
|
|
2214
|
+
` ${modeLine}`,
|
|
2215
|
+
` if isinstance(result, list):`,
|
|
2216
|
+
` print('\\n'.join(str(r) for r in result[:50]))`,
|
|
2217
|
+
` else:`,
|
|
2218
|
+
` text = str(result).strip()`,
|
|
2219
|
+
` print(text[:6000] + ('...[truncated]' if len(text)>6000 else ''))`,
|
|
2220
|
+
`except Exception as e:`,
|
|
2221
|
+
` # Fallback to simple fetch if scrapling fails`,
|
|
2222
|
+
` import urllib.request`,
|
|
2223
|
+
` try:`,
|
|
2224
|
+
` req = urllib.request.Request('${url}', headers={'User-Agent': 'Mozilla/5.0'})`,
|
|
2225
|
+
` with urllib.request.urlopen(req, timeout=15) as resp:`,
|
|
2226
|
+
` body = resp.read().decode('utf-8', errors='ignore')`,
|
|
2227
|
+
` # Strip tags simply`,
|
|
2228
|
+
` import re`,
|
|
2229
|
+
` text = re.sub(r'<[^>]+>', ' ', body)`,
|
|
2230
|
+
` text = re.sub(r'\\s+', ' ', text).strip()`,
|
|
2231
|
+
` print(text[:5000])`,
|
|
2232
|
+
` except Exception as e2:`,
|
|
2233
|
+
` print(f'Scrape failed: {e} / {e2}', file=sys.stderr)`
|
|
2234
|
+
].join("\n");
|
|
2235
|
+
return this.shellExec(`python3 -c "${script.replace(/"/g, '\\"').replace(/\n/g, ";")}"`, 3e4);
|
|
2236
|
+
}
|
|
2177
2237
|
listDir(dirPath) {
|
|
2178
2238
|
const safe = this.safePath(dirPath ?? ".");
|
|
2179
2239
|
if (!safe) return "Error: path outside working directory";
|
|
@@ -2212,6 +2272,7 @@ var AgentExecutor = class {
|
|
|
2212
2272
|
if (toolName === "write_file") return `"${input.path}"`;
|
|
2213
2273
|
if (toolName === "read_file") return `"${input.path}"`;
|
|
2214
2274
|
if (toolName === "list_dir") return `"${input.path ?? "."}"`;
|
|
2275
|
+
if (toolName === "scrape_url") return `"${String(input.url ?? "").slice(0, 60)}" mode=${input.mode ?? "text"}`;
|
|
2215
2276
|
return JSON.stringify(input).slice(0, 60);
|
|
2216
2277
|
}
|
|
2217
2278
|
};
|