pi-smart-fetch 0.3.2 → 0.3.4
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/README.md +2 -1
- package/dist/index.js +87 -5
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -13,7 +13,7 @@
|
|
|
13
13
|
- 🔁 **Client-side `<meta>` redirects** — follows sane meta refresh redirects with loop limits
|
|
14
14
|
- 🔗 **Alternate content fallback** — when extraction produces no/thin content, follows qualified `<link rel="alternate" type="...">` entries in `<head>` that match the requested output format
|
|
15
15
|
- ⚡ **Batch fetch** — fetch many URLs with bounded concurrency
|
|
16
|
-
- 📝 **Multiple output formats** — `markdown`, `html`, `text`, `json`
|
|
16
|
+
- 📝 **Multiple output formats** — `markdown`, `html`, `text`, `json`, `raw`
|
|
17
17
|
|
|
18
18
|
## Site optimisations
|
|
19
19
|
|
|
@@ -70,6 +70,7 @@ For `batch_web_fetch`, each item in `requests` accepts the same parameters as `w
|
|
|
70
70
|
| `html` | Cleaned HTML output |
|
|
71
71
|
| `text` | Plain text with markdown stripped |
|
|
72
72
|
| `json` | Structured JSON for metadata-heavy workflows |
|
|
73
|
+
| `raw` | Full raw server response without extraction, and without truncation unless `maxChars` is explicitly set — for further parsing |
|
|
73
74
|
|
|
74
75
|
## Global defaults
|
|
75
76
|
|
package/dist/index.js
CHANGED
|
@@ -9960,6 +9960,7 @@ var DEFAULT_TIMEOUT_MS = 15e3;
|
|
|
9960
9960
|
var DEFAULT_BATCH_CONCURRENCY = 8;
|
|
9961
9961
|
var DEFAULT_INCLUDE_REPLIES = "extractors";
|
|
9962
9962
|
var DEFAULT_ACCEPT_HEADER = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
|
|
9963
|
+
var DEFAULT_RAW_ACCEPT_HEADER = "text/html,application/xhtml+xml,application/json,application/xml;q=0.9,text/markdown;q=0.8,text/plain;q=0.8,*/*;q=0.7";
|
|
9963
9964
|
var DEFAULT_JSON_ACCEPT_HEADER = "application/json,text/json,application/ld+json;q=0.9,text/plain;q=0.8,*/*;q=0.7";
|
|
9964
9965
|
var DEFAULT_ACCEPT_LANGUAGE_HEADER = "en-US,en;q=0.9";
|
|
9965
9966
|
|
|
@@ -10177,7 +10178,8 @@ function buildCompactMetadataHeader(result) {
|
|
|
10177
10178
|
["URL", result.finalUrl],
|
|
10178
10179
|
["Title", result.title],
|
|
10179
10180
|
["Author", result.author],
|
|
10180
|
-
["Published", result.published]
|
|
10181
|
+
["Published", result.published],
|
|
10182
|
+
["Content-Type", result.contentType]
|
|
10181
10183
|
]);
|
|
10182
10184
|
}
|
|
10183
10185
|
function buildMetadataHeader(result) {
|
|
@@ -10195,6 +10197,7 @@ function buildMetadataHeader(result) {
|
|
|
10195
10197
|
["Title", result.title],
|
|
10196
10198
|
["Author", result.author],
|
|
10197
10199
|
["Published", result.published],
|
|
10200
|
+
["Content-Type", result.contentType],
|
|
10198
10201
|
["Site", result.site],
|
|
10199
10202
|
["Language", result.language],
|
|
10200
10203
|
["Words", result.wordCount],
|
|
@@ -10777,7 +10780,9 @@ function mapRequestEventToProgress(event) {
|
|
|
10777
10780
|
}
|
|
10778
10781
|
}
|
|
10779
10782
|
function resolveAcceptHeader(format) {
|
|
10780
|
-
|
|
10783
|
+
if (format === "json") return DEFAULT_JSON_ACCEPT_HEADER;
|
|
10784
|
+
if (format === "raw") return DEFAULT_RAW_ACCEPT_HEADER;
|
|
10785
|
+
return DEFAULT_ACCEPT_HEADER;
|
|
10781
10786
|
}
|
|
10782
10787
|
function isJsonContentType(contentType) {
|
|
10783
10788
|
const normalized = contentType.split(";")[0]?.trim().toLowerCase() ?? "";
|
|
@@ -10798,7 +10803,8 @@ function extractQualifiedAlternateLinks(document, baseUrl, format) {
|
|
|
10798
10803
|
markdown: ["text/markdown", "text/x-markdown"],
|
|
10799
10804
|
text: ["text/plain", "text/markdown", "text/x-markdown"],
|
|
10800
10805
|
html: ["text/html", "application/xhtml+xml"],
|
|
10801
|
-
json: ["application/json", "text/json"]
|
|
10806
|
+
json: ["application/json", "text/json"],
|
|
10807
|
+
raw: []
|
|
10802
10808
|
};
|
|
10803
10809
|
const accepted = acceptedTypes[format];
|
|
10804
10810
|
const head = document.head;
|
|
@@ -11076,6 +11082,81 @@ function createDefuddleFetch(dependencies = runtimeDependencies) {
|
|
|
11076
11082
|
);
|
|
11077
11083
|
}
|
|
11078
11084
|
const jsonResponse = isJsonResponse(contentType, rawBody);
|
|
11085
|
+
if (format === "raw") {
|
|
11086
|
+
const isXUrl2 = /^https?:\/\/(www\.)?(x\.com|twitter\.com)\//i.test(
|
|
11087
|
+
opts.url
|
|
11088
|
+
);
|
|
11089
|
+
if (isXUrl2) {
|
|
11090
|
+
let extractedContent2;
|
|
11091
|
+
const suppressedErrors2 = [];
|
|
11092
|
+
const origConsoleError = console.error;
|
|
11093
|
+
console.error = (...args) => {
|
|
11094
|
+
suppressedErrors2.push(args);
|
|
11095
|
+
};
|
|
11096
|
+
try {
|
|
11097
|
+
const extractionDocument2 = parseLinkedomHTML(rawBody, finalUrl);
|
|
11098
|
+
const extracted2 = await dependencies.defuddle(
|
|
11099
|
+
extractionDocument2,
|
|
11100
|
+
finalUrl,
|
|
11101
|
+
{
|
|
11102
|
+
markdown: true,
|
|
11103
|
+
removeImages,
|
|
11104
|
+
includeReplies
|
|
11105
|
+
}
|
|
11106
|
+
);
|
|
11107
|
+
extractedContent2 = extracted2.content;
|
|
11108
|
+
} finally {
|
|
11109
|
+
console.error = origConsoleError;
|
|
11110
|
+
}
|
|
11111
|
+
const hasOembed404 = suppressedErrors2.some(
|
|
11112
|
+
(args) => args.some(
|
|
11113
|
+
(arg) => typeof arg === "string" && arg.includes("oEmbed request failed: 404")
|
|
11114
|
+
)
|
|
11115
|
+
);
|
|
11116
|
+
const hasJsDisabledShell = isTwitterJsDisabledPage(
|
|
11117
|
+
parseLinkedomHTML(rawBody, finalUrl),
|
|
11118
|
+
opts.url
|
|
11119
|
+
);
|
|
11120
|
+
if ((hasOembed404 || hasJsDisabledShell) && !extractedContent2) {
|
|
11121
|
+
return {
|
|
11122
|
+
error: `Server returned HTTP 404 Not Found for ${opts.url}.`,
|
|
11123
|
+
code: "http_error",
|
|
11124
|
+
phase: "loading",
|
|
11125
|
+
retryable: false,
|
|
11126
|
+
timeoutMs,
|
|
11127
|
+
url: opts.url,
|
|
11128
|
+
finalUrl,
|
|
11129
|
+
statusCode: 404,
|
|
11130
|
+
statusText: "Not Found",
|
|
11131
|
+
mimeType: normalizeContentType(contentType) || void 0,
|
|
11132
|
+
contentLength: errorContext.contentLength
|
|
11133
|
+
};
|
|
11134
|
+
}
|
|
11135
|
+
}
|
|
11136
|
+
const effectiveContent = opts.maxChars !== void 0 ? truncateContent(rawBody, maxChars) : rawBody;
|
|
11137
|
+
const result2 = {
|
|
11138
|
+
kind: "content",
|
|
11139
|
+
url: opts.url,
|
|
11140
|
+
finalUrl,
|
|
11141
|
+
title: "",
|
|
11142
|
+
author: "",
|
|
11143
|
+
published: "",
|
|
11144
|
+
site: new URL(finalUrl).hostname,
|
|
11145
|
+
language: "",
|
|
11146
|
+
wordCount: 0,
|
|
11147
|
+
content: effectiveContent,
|
|
11148
|
+
browser,
|
|
11149
|
+
os,
|
|
11150
|
+
contentType: normalizeContentType(contentType) || void 0
|
|
11151
|
+
};
|
|
11152
|
+
emitStatus(hooks, "done");
|
|
11153
|
+
emitProgress(hooks, {
|
|
11154
|
+
status: "done",
|
|
11155
|
+
progress: 1,
|
|
11156
|
+
phase: "raw_done"
|
|
11157
|
+
});
|
|
11158
|
+
return result2;
|
|
11159
|
+
}
|
|
11079
11160
|
if (format === "json") {
|
|
11080
11161
|
if (!jsonResponse) {
|
|
11081
11162
|
if (HTML_CONTENT_TYPES.some((value) => contentType.includes(value))) {
|
|
@@ -11381,10 +11462,11 @@ function createBaseFetchToolParameterProperties(defaults) {
|
|
|
11381
11462
|
Type.Literal("markdown"),
|
|
11382
11463
|
Type.Literal("html"),
|
|
11383
11464
|
Type.Literal("text"),
|
|
11384
|
-
Type.Literal("json")
|
|
11465
|
+
Type.Literal("json"),
|
|
11466
|
+
Type.Literal("raw")
|
|
11385
11467
|
],
|
|
11386
11468
|
{
|
|
11387
|
-
description: 'Output format. "markdown" (default), "html" (cleaned HTML), "text" (plain text, no formatting),
|
|
11469
|
+
description: 'Output format. "markdown" (default), "html" (cleaned HTML), "text" (plain text, no formatting), "json" (pretty-printed JSON), or "raw" (full raw server response without extraction or truncation, for further parsing)'
|
|
11388
11470
|
}
|
|
11389
11471
|
)
|
|
11390
11472
|
),
|