openclaw-smart-fetch 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,7 +11,7 @@
11
11
  - 🔁 **Client-side `<meta>` redirects** — follows sane meta refresh redirects with loop limits
12
12
  - 🔗 **Alternate content fallback** — when extraction produces no/thin content, follows qualified `<link rel="alternate" type="...">` entries in `<head>` that match the requested output format
13
13
  - ⚡ **Batch fetch** — fetch many URLs with bounded concurrency
14
- - 📝 **Multiple output formats** — `markdown`, `html`, `text`, `json`
14
+ - 📝 **Multiple output formats** — `markdown`, `html`, `text`, `json`, `raw`
15
15
  - 🔄 **Built-in `web_fetch` fallback** — automatically improves the core web_fetch tool
16
16
  - 📖 **Bundled skill** — agents get usage guidance injected into their system prompt
17
17
 
@@ -139,6 +139,7 @@ Skills are declared in the manifest (`openclaw.plugin.json`) under `"skills":
139
139
  | `html` | Cleaned HTML output |
140
140
  | `text` | Plain text with markdown stripped |
141
141
  | `json` | Structured JSON for metadata-heavy workflows |
142
+ | `raw` | Full raw server response without extraction or truncation — for further parsing |
142
143
 
143
144
  ## Plugin config
144
145
 
package/dist/index.js CHANGED
@@ -9958,6 +9958,7 @@ var DEFAULT_TIMEOUT_MS = 15e3;
9958
9958
  var DEFAULT_BATCH_CONCURRENCY = 8;
9959
9959
  var DEFAULT_INCLUDE_REPLIES = "extractors";
9960
9960
  var DEFAULT_ACCEPT_HEADER = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
9961
+ var DEFAULT_RAW_ACCEPT_HEADER = "text/html,application/xhtml+xml,application/json,application/xml;q=0.9,text/markdown;q=0.8,text/plain;q=0.8,*/*;q=0.7";
9961
9962
  var DEFAULT_JSON_ACCEPT_HEADER = "application/json,text/json,application/ld+json;q=0.9,text/plain;q=0.8,*/*;q=0.7";
9962
9963
  var DEFAULT_ACCEPT_LANGUAGE_HEADER = "en-US,en;q=0.9";
9963
9964
 
@@ -10175,7 +10176,8 @@ function buildCompactMetadataHeader(result) {
10175
10176
  ["URL", result.finalUrl],
10176
10177
  ["Title", result.title],
10177
10178
  ["Author", result.author],
10178
- ["Published", result.published]
10179
+ ["Published", result.published],
10180
+ ["Content-Type", result.contentType]
10179
10181
  ]);
10180
10182
  }
10181
10183
  function buildMetadataHeader(result) {
@@ -10193,6 +10195,7 @@ function buildMetadataHeader(result) {
10193
10195
  ["Title", result.title],
10194
10196
  ["Author", result.author],
10195
10197
  ["Published", result.published],
10198
+ ["Content-Type", result.contentType],
10196
10199
  ["Site", result.site],
10197
10200
  ["Language", result.language],
10198
10201
  ["Words", result.wordCount],
@@ -10775,7 +10778,9 @@ function mapRequestEventToProgress(event) {
10775
10778
  }
10776
10779
  }
10777
10780
  function resolveAcceptHeader(format) {
10778
- return format === "json" ? DEFAULT_JSON_ACCEPT_HEADER : DEFAULT_ACCEPT_HEADER;
10781
+ if (format === "json") return DEFAULT_JSON_ACCEPT_HEADER;
10782
+ if (format === "raw") return DEFAULT_RAW_ACCEPT_HEADER;
10783
+ return DEFAULT_ACCEPT_HEADER;
10779
10784
  }
10780
10785
  function isJsonContentType(contentType) {
10781
10786
  const normalized = contentType.split(";")[0]?.trim().toLowerCase() ?? "";
@@ -10796,7 +10801,8 @@ function extractQualifiedAlternateLinks(document, baseUrl, format) {
10796
10801
  markdown: ["text/markdown", "text/x-markdown"],
10797
10802
  text: ["text/plain", "text/markdown", "text/x-markdown"],
10798
10803
  html: ["text/html", "application/xhtml+xml"],
10799
- json: ["application/json", "text/json"]
10804
+ json: ["application/json", "text/json"],
10805
+ raw: []
10800
10806
  };
10801
10807
  const accepted = acceptedTypes[format];
10802
10808
  const head = document.head;
@@ -11074,6 +11080,81 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
11074
11080
  );
11075
11081
  }
11076
11082
  const jsonResponse = isJsonResponse(contentType, rawBody);
11083
+ if (format === "raw") {
11084
+ const isXUrl2 = /^https?:\/\/(www\.)?(x\.com|twitter\.com)\//i.test(
11085
+ opts.url
11086
+ );
11087
+ if (isXUrl2) {
11088
+ let extractedContent2;
11089
+ const suppressedErrors2 = [];
11090
+ const origConsoleError = console.error;
11091
+ console.error = (...args) => {
11092
+ suppressedErrors2.push(args);
11093
+ };
11094
+ try {
11095
+ const extractionDocument2 = parseLinkedomHTML(rawBody, finalUrl);
11096
+ const extracted2 = await dependencies.defuddle(
11097
+ extractionDocument2,
11098
+ finalUrl,
11099
+ {
11100
+ markdown: true,
11101
+ removeImages,
11102
+ includeReplies
11103
+ }
11104
+ );
11105
+ extractedContent2 = extracted2.content;
11106
+ } finally {
11107
+ console.error = origConsoleError;
11108
+ }
11109
+ const hasOembed404 = suppressedErrors2.some(
11110
+ (args) => args.some(
11111
+ (arg) => typeof arg === "string" && arg.includes("oEmbed request failed: 404")
11112
+ )
11113
+ );
11114
+ const hasJsDisabledShell = isTwitterJsDisabledPage(
11115
+ parseLinkedomHTML(rawBody, finalUrl),
11116
+ opts.url
11117
+ );
11118
+ if ((hasOembed404 || hasJsDisabledShell) && !extractedContent2) {
11119
+ return {
11120
+ error: `Server returned HTTP 404 Not Found for ${opts.url}.`,
11121
+ code: "http_error",
11122
+ phase: "loading",
11123
+ retryable: false,
11124
+ timeoutMs,
11125
+ url: opts.url,
11126
+ finalUrl,
11127
+ statusCode: 404,
11128
+ statusText: "Not Found",
11129
+ mimeType: normalizeContentType(contentType) || void 0,
11130
+ contentLength: errorContext.contentLength
11131
+ };
11132
+ }
11133
+ }
11134
+ const effectiveContent = opts.maxChars !== void 0 ? truncateContent(rawBody, maxChars) : rawBody;
11135
+ const result2 = {
11136
+ kind: "content",
11137
+ url: opts.url,
11138
+ finalUrl,
11139
+ title: "",
11140
+ author: "",
11141
+ published: "",
11142
+ site: new URL(finalUrl).hostname,
11143
+ language: "",
11144
+ wordCount: 0,
11145
+ content: effectiveContent,
11146
+ browser,
11147
+ os,
11148
+ contentType: normalizeContentType(contentType) || void 0
11149
+ };
11150
+ emitStatus(hooks, "done");
11151
+ emitProgress(hooks, {
11152
+ status: "done",
11153
+ progress: 1,
11154
+ phase: "raw_done"
11155
+ });
11156
+ return result2;
11157
+ }
11077
11158
  if (format === "json") {
11078
11159
  if (!jsonResponse) {
11079
11160
  if (HTML_CONTENT_TYPES.some((value) => contentType.includes(value))) {
@@ -11379,10 +11460,11 @@ function createBaseFetchToolParameterProperties(defaults) {
11379
11460
  Type.Literal("markdown"),
11380
11461
  Type.Literal("html"),
11381
11462
  Type.Literal("text"),
11382
- Type.Literal("json")
11463
+ Type.Literal("json"),
11464
+ Type.Literal("raw")
11383
11465
  ],
11384
11466
  {
11385
- description: 'Output format. "markdown" (default), "html" (cleaned HTML), "text" (plain text, no formatting), or "json" (pretty-printed JSON)'
11467
+ description: 'Output format. "markdown" (default), "html" (cleaned HTML), "text" (plain text, no formatting), "json" (pretty-printed JSON), or "raw" (full raw server response without extraction or truncation, for further parsing)'
11386
11468
  }
11387
11469
  )
11388
11470
  ),