pi-smart-fetch 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -13,7 +13,7 @@
13
13
  - 🔁 **Client-side `<meta>` redirects** — follows sane meta refresh redirects with loop limits
14
14
  - 🔗 **Alternate content fallback** — when extraction produces no/thin content, follows qualified `<link rel="alternate" type="...">` entries in `<head>` that match the requested output format
15
15
  - ⚡ **Batch fetch** — fetch many URLs with bounded concurrency
16
- - 📝 **Multiple output formats** — `markdown`, `html`, `text`, `json`
16
+ - 📝 **Multiple output formats** — `markdown`, `html`, `text`, `json`, `raw`
17
17
 
18
18
  ## Site optimisations
19
19
 
@@ -70,6 +70,7 @@ For `batch_web_fetch`, each item in `requests` accepts the same parameters as `w
70
70
  | `html` | Cleaned HTML output |
71
71
  | `text` | Plain text with markdown stripped |
72
72
  | `json` | Structured JSON for metadata-heavy workflows |
73
+ | `raw` | Full raw server response without extraction; truncated only when `maxChars` is explicitly set — for further parsing |
73
74
 
74
75
  ## Global defaults
75
76
 
package/dist/index.js CHANGED
@@ -9960,6 +9960,7 @@ var DEFAULT_TIMEOUT_MS = 15e3;
9960
9960
  var DEFAULT_BATCH_CONCURRENCY = 8;
9961
9961
  var DEFAULT_INCLUDE_REPLIES = "extractors";
9962
9962
  var DEFAULT_ACCEPT_HEADER = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
9963
+ var DEFAULT_RAW_ACCEPT_HEADER = "text/html,application/xhtml+xml,application/json,application/xml;q=0.9,text/markdown;q=0.8,text/plain;q=0.8,*/*;q=0.7";
9963
9964
  var DEFAULT_JSON_ACCEPT_HEADER = "application/json,text/json,application/ld+json;q=0.9,text/plain;q=0.8,*/*;q=0.7";
9964
9965
  var DEFAULT_ACCEPT_LANGUAGE_HEADER = "en-US,en;q=0.9";
9965
9966
 
@@ -10177,7 +10178,8 @@ function buildCompactMetadataHeader(result) {
10177
10178
  ["URL", result.finalUrl],
10178
10179
  ["Title", result.title],
10179
10180
  ["Author", result.author],
10180
- ["Published", result.published]
10181
+ ["Published", result.published],
10182
+ ["Content-Type", result.contentType]
10181
10183
  ]);
10182
10184
  }
10183
10185
  function buildMetadataHeader(result) {
@@ -10195,6 +10197,7 @@ function buildMetadataHeader(result) {
10195
10197
  ["Title", result.title],
10196
10198
  ["Author", result.author],
10197
10199
  ["Published", result.published],
10200
+ ["Content-Type", result.contentType],
10198
10201
  ["Site", result.site],
10199
10202
  ["Language", result.language],
10200
10203
  ["Words", result.wordCount],
@@ -10777,7 +10780,9 @@ function mapRequestEventToProgress(event) {
10777
10780
  }
10778
10781
  }
10779
10782
  function resolveAcceptHeader(format) {
10780
- return format === "json" ? DEFAULT_JSON_ACCEPT_HEADER : DEFAULT_ACCEPT_HEADER;
10783
+ if (format === "json") return DEFAULT_JSON_ACCEPT_HEADER;
10784
+ if (format === "raw") return DEFAULT_RAW_ACCEPT_HEADER;
10785
+ return DEFAULT_ACCEPT_HEADER;
10781
10786
  }
10782
10787
  function isJsonContentType(contentType) {
10783
10788
  const normalized = contentType.split(";")[0]?.trim().toLowerCase() ?? "";
@@ -10798,7 +10803,8 @@ function extractQualifiedAlternateLinks(document, baseUrl, format) {
10798
10803
  markdown: ["text/markdown", "text/x-markdown"],
10799
10804
  text: ["text/plain", "text/markdown", "text/x-markdown"],
10800
10805
  html: ["text/html", "application/xhtml+xml"],
10801
- json: ["application/json", "text/json"]
10806
+ json: ["application/json", "text/json"],
10807
+ raw: []
10802
10808
  };
10803
10809
  const accepted = acceptedTypes[format];
10804
10810
  const head = document.head;
@@ -11076,6 +11082,81 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
11076
11082
  );
11077
11083
  }
11078
11084
  const jsonResponse = isJsonResponse(contentType, rawBody);
11085
+ if (format === "raw") {
11086
+ const isXUrl2 = /^https?:\/\/(www\.)?(x\.com|twitter\.com)\//i.test(
11087
+ opts.url
11088
+ );
11089
+ if (isXUrl2) {
11090
+ let extractedContent2;
11091
+ const suppressedErrors2 = [];
11092
+ const origConsoleError = console.error;
11093
+ console.error = (...args) => {
11094
+ suppressedErrors2.push(args);
11095
+ };
11096
+ try {
11097
+ const extractionDocument2 = parseLinkedomHTML(rawBody, finalUrl);
11098
+ const extracted2 = await dependencies.defuddle(
11099
+ extractionDocument2,
11100
+ finalUrl,
11101
+ {
11102
+ markdown: true,
11103
+ removeImages,
11104
+ includeReplies
11105
+ }
11106
+ );
11107
+ extractedContent2 = extracted2.content;
11108
+ } finally {
11109
+ console.error = origConsoleError;
11110
+ }
11111
+ const hasOembed404 = suppressedErrors2.some(
11112
+ (args) => args.some(
11113
+ (arg) => typeof arg === "string" && arg.includes("oEmbed request failed: 404")
11114
+ )
11115
+ );
11116
+ const hasJsDisabledShell = isTwitterJsDisabledPage(
11117
+ parseLinkedomHTML(rawBody, finalUrl),
11118
+ opts.url
11119
+ );
11120
+ if ((hasOembed404 || hasJsDisabledShell) && !extractedContent2) {
11121
+ return {
11122
+ error: `Server returned HTTP 404 Not Found for ${opts.url}.`,
11123
+ code: "http_error",
11124
+ phase: "loading",
11125
+ retryable: false,
11126
+ timeoutMs,
11127
+ url: opts.url,
11128
+ finalUrl,
11129
+ statusCode: 404,
11130
+ statusText: "Not Found",
11131
+ mimeType: normalizeContentType(contentType) || void 0,
11132
+ contentLength: errorContext.contentLength
11133
+ };
11134
+ }
11135
+ }
11136
+ const effectiveContent = opts.maxChars !== void 0 ? truncateContent(rawBody, maxChars) : rawBody;
11137
+ const result2 = {
11138
+ kind: "content",
11139
+ url: opts.url,
11140
+ finalUrl,
11141
+ title: "",
11142
+ author: "",
11143
+ published: "",
11144
+ site: new URL(finalUrl).hostname,
11145
+ language: "",
11146
+ wordCount: 0,
11147
+ content: effectiveContent,
11148
+ browser,
11149
+ os,
11150
+ contentType: normalizeContentType(contentType) || void 0
11151
+ };
11152
+ emitStatus(hooks, "done");
11153
+ emitProgress(hooks, {
11154
+ status: "done",
11155
+ progress: 1,
11156
+ phase: "raw_done"
11157
+ });
11158
+ return result2;
11159
+ }
11079
11160
  if (format === "json") {
11080
11161
  if (!jsonResponse) {
11081
11162
  if (HTML_CONTENT_TYPES.some((value) => contentType.includes(value))) {
@@ -11381,10 +11462,11 @@ function createBaseFetchToolParameterProperties(defaults) {
11381
11462
  Type.Literal("markdown"),
11382
11463
  Type.Literal("html"),
11383
11464
  Type.Literal("text"),
11384
- Type.Literal("json")
11465
+ Type.Literal("json"),
11466
+ Type.Literal("raw")
11385
11467
  ],
11386
11468
  {
11387
- description: 'Output format. "markdown" (default), "html" (cleaned HTML), "text" (plain text, no formatting), or "json" (pretty-printed JSON)'
11469
+ description: 'Output format. "markdown" (default), "html" (cleaned HTML), "text" (plain text, no formatting), "json" (pretty-printed JSON), or "raw" (full raw server response without extraction or truncation, for further parsing)'
11388
11470
  }
11389
11471
  )
11390
11472
  ),