smart-web-mcp 0.4.3 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +31 -0
- package/README.md +43 -23
- package/dist/browser-session.d.ts +41 -0
- package/dist/browser-session.js +206 -0
- package/dist/browser-session.js.map +1 -0
- package/dist/index.js +56 -1
- package/dist/index.js.map +1 -1
- package/dist/shared.d.ts +54 -1
- package/dist/shared.js +159 -5
- package/dist/shared.js.map +1 -1
- package/dist/smartcrawl.d.ts +67 -0
- package/dist/smartcrawl.js +393 -0
- package/dist/smartcrawl.js.map +1 -0
- package/dist/smartfetch/archive-fallback.js +2 -2
- package/dist/smartfetch/archive-fallback.js.map +1 -1
- package/dist/smartfetch/assets.d.ts +2 -0
- package/dist/smartfetch/assets.js +85 -0
- package/dist/smartfetch/assets.js.map +1 -0
- package/dist/smartfetch/provider-policy.js +39 -4
- package/dist/smartfetch/provider-policy.js.map +1 -1
- package/dist/smartfetch/providers/dcinside.js +1 -1
- package/dist/smartfetch/providers/dcinside.js.map +1 -1
- package/dist/smartfetch/providers/naver-blog.js +1 -1
- package/dist/smartfetch/providers/naver-blog.js.map +1 -1
- package/dist/smartfetch/providers/x.js +3 -3
- package/dist/smartfetch/providers/x.js.map +1 -1
- package/dist/smartfetch/providers/youtube.js +24 -4
- package/dist/smartfetch/providers/youtube.js.map +1 -1
- package/dist/smartfetch.d.ts +11 -1
- package/dist/smartfetch.js +142 -114
- package/dist/smartfetch.js.map +1 -1
- package/dist/smartsearch.d.ts +12 -1
- package/dist/smartsearch.js +144 -24
- package/dist/smartsearch.js.map +1 -1
- package/package.json +11 -5
|
@@ -4,7 +4,7 @@ export const dcinsideProvider = {
|
|
|
4
4
|
matches: (_url, target) => target === "dcinside_post",
|
|
5
5
|
normalize(context) {
|
|
6
6
|
const html = context.active.content;
|
|
7
|
-
const bodyHtml = capture(html, /<div class=["']write_div["'][^>]*>([\s\S]*?)<div class=["']btn_recommend_box["']/i);
|
|
7
|
+
const bodyHtml = capture(html, /<div class=["']write_div["'][^>]*>([\s\S]*?)<div class=["'][^"']*btn_recommend_box[^"']*["']/i);
|
|
8
8
|
const title = stripTags(capture(html, /<span class=["']title_subject["']>([\s\S]*?)<\/span>/i));
|
|
9
9
|
const author = stripTags(capture(html, /<span class=["']nickname[^>]*><em>([\s\S]*?)<\/em><\/span>/i));
|
|
10
10
|
const body = stripTags(bodyHtml);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"dcinside.js","sourceRoot":"","sources":["../../../src/smartfetch/providers/dcinside.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,UAAU,EAAE,kBAAkB,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAA;AAGjG,MAAM,CAAC,MAAM,gBAAgB,GAAuB;IAClD,EAAE,EAAE,UAAU;IACd,OAAO,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,CAAC,MAAM,KAAK,eAAe;IACrD,SAAS,CAAC,OAA0B;QAClC,MAAM,IAAI,GAAG,OAAO,CAAC,MAAM,CAAC,OAAO,CAAA;QACnC,MAAM,QAAQ,GAAG,OAAO,CAAC,IAAI,EAAE,
|
|
1
|
+
{"version":3,"file":"dcinside.js","sourceRoot":"","sources":["../../../src/smartfetch/providers/dcinside.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,UAAU,EAAE,kBAAkB,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAA;AAGjG,MAAM,CAAC,MAAM,gBAAgB,GAAuB;IAClD,EAAE,EAAE,UAAU;IACd,OAAO,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,CAAC,MAAM,KAAK,eAAe;IACrD,SAAS,CAAC,OAA0B;QAClC,MAAM,IAAI,GAAG,OAAO,CAAC,MAAM,CAAC,OAAO,CAAA;QACnC,MAAM,QAAQ,GAAG,OAAO,CAAC,IAAI,EAAE,+FAA+F,CAAC,CAAA;QAC/H,MAAM,KAAK,GAAG,SAAS,CAAC,OAAO,CAAC,IAAI,EAAE,uDAAuD,CAAC,CAAC,CAAA;QAC/F,MAAM,MAAM,GAAG,SAAS,CAAC,OAAO,CAAC,IAAI,EAAE,6DAA6D,CAAC,CAAC,CAAA;QACtG,MAAM,IAAI,GAAG,SAAS,CAAC,QAAQ,CAAC,CAAA;QAChC,MAAM,EAAE,GAAG,OAAO,CAAC,KAAK,IAAI,IAAI,CAAC,CAAA;QACjC,OAAO;YACL,IAAI,EAAE,EAAE,GAAG,EAAE,OAAO,CAAC,GAAG,EAAE,KAAK,EAAE,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,MAAM,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,wBAAwB,EAAE;YACnG,MAAM,EAAE,EAAE;YACV,QAAQ,EAAE,EAAE;YACZ,cAAc,EAAE,UAAU,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,EAAE,GAAG,WAAW,CAAC,QAAQ,CAAC,EAAE,GAAG,kBAAkB,CAAC,QAAQ,CAAC,CAAC,CAAC;YAChH,OAAO,EAAE,CAAC,EAAE;YACZ,MAAM,EAAE,EAAE;gBACR,CAAC,CAAC,OAAO,CAAC,MAAM;gBAChB,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,uBAAuB,EAAE,OAAO,EAAE,4BAA4B,EAAE,CAAC;YAC1H,MAAM,EAAE,OAAO,CAAC,MAAM,CAAC,MAAM;SAC9B,CAAA;IACH,CAAC;CACF,CAAA"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"naver-blog.js","sourceRoot":"","sources":["../../../src/smartfetch/providers/naver-blog.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,sBAAsB,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAA;AACvI,OAAO,EAAE,iBAAiB,EAAE,MAAM,uBAAuB,CAAA;AAGzD,MAAM,UAAU,GAAG,iHAAiH,CAAA;AAEpI,SAAS,sBAAsB,CAAC,GAAW,EAAE,IAAY;IACvD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAA;QAC3B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,4DAA4D,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;QAC3F,IAAI,KAAK;YAAE,OAAO,IAAI,GAAG,CAAC,KAAK,EAAE,GAAG,MAAM,CAAC,QAAQ,KAAK,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAA;QACjF,IAAI,MAAM,CAAC,QAAQ,KAAK,gBAAgB,EAAE,CAAC;YACzC,OAAO,2BAA2B,MAAM,CAAC,QAAQ,GAAG,MAAM,CAAC,MAAM,EAAE,CAAA;QACrE,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,
|
|
1
|
+
{"version":3,"file":"naver-blog.js","sourceRoot":"","sources":["../../../src/smartfetch/providers/naver-blog.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,sBAAsB,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAA;AACvI,OAAO,EAAE,iBAAiB,EAAE,MAAM,uBAAuB,CAAA;AAGzD,MAAM,UAAU,GAAG,iHAAiH,CAAA;AAEpI,SAAS,sBAAsB,CAAC,GAAW,EAAE,IAAY;IACvD,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAA;QAC3B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,4DAA4D,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;QAC3F,IAAI,KAAK;YAAE,OAAO,IAAI,GAAG,CAAC,KAAK,EAAE,GAAG,MAAM,CAAC,QAAQ,KAAK,MAAM,CAAC,IAAI,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAA;QACjF,IAAI,MAAM,CAAC,QAAQ,KAAK,gBAAgB,EAAE,CAAC;YACzC,OAAO,2BAA2B,MAAM,CAAC,QAAQ,GAAG,MAAM,CAAC,MAAM,EAAE,CAAA;QACrE,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,oEAAoE;IACtE,CAAC;IACD,OAAO,GAAG,CAAA;AACZ,CAAC;AAED,SAAS,gBAAgB,CAAC,IAAY;IACpC,MAAM,QAAQ,GAAG;QACf,8EAA8E;QAC9E,oEAAoE;QACpE,0DAA0D;QAC1D,sEAAsE;KACvE,CAAA;IACD,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAA;QACjC,IAAI,KAAK,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YACf,MAAM,IAAI,GAAG,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAA;YAChC,IAAI,IAAI;gBAAE,OAAO,IAAI,CAAA;QACvB,CAAC;IACH,CAAC;IACD,OAAO,SAAS,CAAC,IAAI,CAAC,CAAA;AACxB,CAAC;AAED,MAAM,CAAC,MAAM,iBAAiB,GAAuB;IACnD,EAAE,EAAE,YAAY;IAChB,OAAO,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,CAAC,MAAM,KAAK,YAAY;IAClD,KAAK,CAAC,SAAS,CAAC,OAA0B;QACxC,MAAM,UAAU,GAAG,sBAAsB,CAAC,OAAO,CAAC,GAAG,EAAE,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;QAC9E,IAAI,IAAI,GAAG,OAAO,CAAC,MAAM,CAAC,OAAO,CAAA;QACjC,IAAI,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,MAAM,CAAA;QAElC,IAAI,UAAU,KAAK,OAAO,CAAC,GAAG,EAAE,CAAC;YAC/B,MAAM,MAAM,GAAG,MAAM,iBAAiB,CAAC,UAAU,EAAE,OAAO,CAAC,SAAS,EAAE;gBACpE,OAAO,EAAE;oBACP,MAAM,EAAE,0BAA0B;oBAClC,YAAY,EAAE,UAAU;oBACxB,OAAO,EAAE,OAAO,CAAC,GAAG;iBACrB;aACF,EAAE;gBACD,IAAI,EAAE,aAAa;gBACnB,SAAS,EAAE,OAAO,CAAC,GAAG;aACvB,CAAC,CAAA;YACF,IAAI,MAAM,CAAC,EAAE,EAAE,CAAC;gBACd,IAAI,GAAG,MAAM,CAAC,IAAI,CAAA;gBAClB,MAAM,GAAG,kBAAkB,CAAA;YAC7B,CAAC;iBAAM,CAAC;gBACN,OAAO
,GAAG;oBACR,GAAG,OAAO;oBACV,MAAM,EAAE,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,MAAM,CAAC,KAAK,CAAC;iBAC1C,CAAA;YACH,CAAC;QACH,CAAC;QAED,MAAM,KAAK,GAAG,mBAAmB,CAAC,IAAI,EAAE,UAAU,CAAC,IAAI,oBAAoB,CAAC,IAAI,CAAC,CAAA;QACjF,MAAM,WAAW,GAAG,mBAAmB,CAAC,IAAI,EAAE,gBAAgB,CAAC,IAAI,sBAAsB,CAAC,IAAI,CAAC,CAAA;QAC/F,MAAM,IAAI,GAAG,gBAAgB,CAAC,IAAI,CAAC,CAAA;QAEnC,OAAO;YACL,IAAI,EAAE;gBACJ,GAAG,EAAE,OAAO,CAAC,GAAG;gBAChB,aAAa,EAAE,UAAU;gBACzB,KAAK;gBACL,WAAW;gBACX,IAAI,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC;gBAC1B,MAAM,EAAE,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,wBAAwB;aAC/C;YACD,MAAM,EAAE,EAAE;YACV,QAAQ,EAAE,EAAE;YACZ,cAAc,EAAE,UAAU,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,EAAE,GAAG,WAAW,CAAC,IAAI,CAAC,CAAC,CAAC;YAC3E,OAAO,EAAE,CAAC,IAAI;YACd,MAAM,EAAE,OAAO,CAAC,MAAM;YACtB,MAAM;SACP,CAAA;IACH,CAAC;CACF,CAAA"}
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import { asNumber, asString, dedupeUrls, envEnabled,
|
|
1
|
+
import { asNumber, asString, dedupeUrls, envEnabled, extractUrls, stripTags } from "../../shared.js";
|
|
2
2
|
import { fetchProviderJson } from "../provider-policy.js";
|
|
3
3
|
const USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36";
|
|
4
4
|
function localOnlyMode() {
|
|
5
|
-
return
|
|
5
|
+
return envEnabled("SMART_WEB_LOCAL_ONLY", false, { private: true });
|
|
6
6
|
}
|
|
7
7
|
function allowFxTwitterRelay() {
|
|
8
8
|
if (localOnlyMode())
|
|
@@ -26,7 +26,7 @@ function extractXStatus(url) {
|
|
|
26
26
|
}
|
|
27
27
|
}
|
|
28
28
|
catch {
|
|
29
|
-
//
|
|
29
|
+
// Fall back to the generic path when the status URL cannot be parsed.
|
|
30
30
|
}
|
|
31
31
|
return null;
|
|
32
32
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"x.js","sourceRoot":"","sources":["../../../src/smartfetch/providers/x.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,UAAU,EAAE,UAAU,EAAE,
|
|
1
|
+
{"version":3,"file":"x.js","sourceRoot":"","sources":["../../../src/smartfetch/providers/x.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,UAAU,EAAE,UAAU,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAA;AACpG,OAAO,EAAE,iBAAiB,EAAE,MAAM,uBAAuB,CAAA;AAGzD,MAAM,UAAU,GAAG,iHAAiH,CAAA;AAEpI,SAAS,aAAa;IACpB,OAAO,UAAU,CAAC,sBAAsB,EAAE,KAAK,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,CAAC,CAAA;AACrE,CAAC;AAED,SAAS,mBAAmB;IAC1B,IAAI,aAAa,EAAE;QAAE,OAAO,KAAK,CAAA;IACjC,OAAO,UAAU,CAAC,6BAA6B,EAAE,KAAK,CAAC,CAAA;AACzD,CAAC;AAED,SAAS,YAAY;IACnB,MAAM,YAAY,GAAG,CAAC,aAAa,EAAE,CAAA;IACrC,OAAO,UAAU,CAAC,4BAA4B,EAAE,YAAY,CAAC,CAAA;AAC/D,CAAC;AAED,SAAS,cAAc,CAAC,GAAW;IACjC,IAAI,CAAC;QACH,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,GAAG,CAAC,CAAA;QAC3B,MAAM,KAAK,GAAG,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;QACxD,MAAM,WAAW,GAAG,KAAK,CAAC,SAAS,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,KAAK,QAAQ,CAAC,CAAA;QAChE,IAAI,WAAW,IAAI,CAAC,IAAI,KAAK,CAAC,WAAW,GAAG,CAAC,CAAC,EAAE,CAAC;YAC/C,OAAO;gBACL,UAAU,EAAE,KAAK,CAAC,WAAW,GAAG,CAAC,CAAC;gBAClC,EAAE,EAAE,KAAK,CAAC,WAAW,GAAG,CAAC,CAAC;aAC3B,CAAA;QACH,CAAC;IACH,CAAC;IAAC,MAAM,CAAC;QACP,sEAAsE;IACxE,CAAC;IACD,OAAO,IAAI,CAAA;AACb,CAAC;AAED,KAAK,UAAU,oBAAoB,CAAC,GAAW,EAAE,SAAiB;IAChE,IAAI,CAAC,mBAAmB,EAAE;QAAE,OAAO,IAAI,CAAA;IACvC,MAAM,MAAM,GAAG,cAAc,CAAC,GAAG,CAAC,CAAA;IAClC,IAAI,CAAC,MAAM;QAAE,OAAO,IAAI,CAAA;IACxB,MAAM,MAAM,GAAG,MAAM,iBAAiB,CACpC,6BAA6B,kBAAkB,CAAC,MAAM,CAAC,UAAU,IAAI,EAAE,CAAC,WAAW,kBAAkB,CAAC,MAAM,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,EACxH,SAAS,EACT;QACE,OAAO,EAAE;YACP,MAAM,EAAE,iCAAiC;YACzC,YAAY,EAAE,UAAU;SACzB;KACF,EACD;QACE,IAAI,EAAE,OAAO;QACb,SAAS,EAAE,GAAG;KACf,CACF,CAAA;IACD,IAAI,CAAC,MAAM,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,IAAI,IAAI,OAAO,MAAM,CAAC,IAAI,KAAK,QAAQ;QAAE,OAAO,IAAI,CAAA;IAC9E,OAAO,MAAM,CAAC,IAA+B,CAAA;AAC/C,CAAC;AAED,SAAS,mBAAmB,CAAC,KAAU;IACrC,MAAM,KAAK,GAAG,KAAK,CAAC,OAAO,CAAC,KAAK,EAAE,KAAK,EAAE,GAAG,CAAC;QAC5C,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG;QACjB,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,EAAE,KAAK,CAAC;YAC3B,CAAC,CAAC,
KAAK,CAAC,KAAK;YACb,CAAC,CAAC,EAAE,CAAA;IAER,OAAO,KAAK;SACT,GAAG,CAAC,CAAC,IAAS,EAAE,EAAE,CAAC,MAAM,CAAC,IAAI,EAAE,GAAG,IAAI,IAAI,EAAE,eAAe,IAAI,IAAI,EAAE,aAAa,IAAI,IAAI,EAAE,mBAAmB,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;SAC/H,MAAM,CAAC,OAAO,CAAC;SACf,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAA;AACjB,CAAC;AAED,MAAM,CAAC,MAAM,SAAS,GAAuB;IAC3C,EAAE,EAAE,GAAG;IACP,OAAO,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,CAAC,MAAM,KAAK,QAAQ;IAC9C,KAAK,CAAC,SAAS,CAAC,OAA0B;QACxC,MAAM,EAAE,GAAG,MAAM,oBAAoB,CAAC,OAAO,CAAC,GAAG,EAAE,OAAO,CAAC,SAAS,CAAC,CAAA;QACrE,IAAK,EAAU,EAAE,KAAK,EAAE,CAAC;YACvB,MAAM,KAAK,GAAI,EAAU,CAAC,KAAK,IAAI,EAAE,CAAA;YACrC,MAAM,MAAM,GAAG,KAAK,CAAC,MAAM,IAAI,EAAE,CAAA;YACjC,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,IAAI,EAAE,CAAA;YAC/B,MAAM,KAAK,GAAG,mBAAmB,CAAC,KAAK,CAAC,CAAA;YACxC,OAAO;gBACL,IAAI,EAAE;oBACJ,GAAG,EAAE,QAAQ,CAAC,KAAK,CAAC,GAAG,IAAI,OAAO,CAAC,GAAG,CAAC;oBACvC,IAAI,EAAE,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC;oBAC1B,UAAU,EAAE,QAAQ,CAAC,KAAK,CAAC,UAAU,CAAC;oBACtC,WAAW,EAAE,QAAQ,CAAC,MAAM,CAAC,IAAI,CAAC;oBAClC,kBAAkB,EAAE,QAAQ,CAAC,MAAM,CAAC,WAAW,CAAC;oBAChD,KAAK,EAAE,QAAQ,CAAC,KAAK,CAAC,KAAK,CAAC;oBAC5B,QAAQ,EAAE,QAAQ,CAAC,KAAK,CAAC,QAAQ,CAAC;oBAClC,OAAO,EAAE,QAAQ,CAAC,KAAK,CAAC,OAAO,CAAC;oBAChC,KAAK,EAAE,QAAQ,CAAC,KAAK,CAAC,KAAK,CAAC;oBAC5B,KAAK,EAAE,KAAK,EAAE,IAAI;wBAChB,CAAC,CAAC;4BACE,IAAI,EAAE,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC;4BAC1B,kBAAkB,EAAE,QAAQ,CAAC,KAAK,EAAE,MAAM,EAAE,WAAW,CAAC;yBACzD;wBACH,CAAC,CAAC,IAAI;oBACR,KAAK;iBACN;gBACD,MAAM,EAAE,EAAE;gBACV,QAAQ,EAAE,EAAE;gBACZ,cAAc,EAAE,UAAU,CAAC;oBACzB,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK;oBACvB,GAAG,KAAK;oBACR,GAAG,WAAW,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC;oBACxC,GAAG,WAAW,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC;iBACzC,CAAC;gBACF,OAAO,EAAE,KAAK;gBACd,MAAM,EAAE,OAAO,CAAC,MAAM;gBACtB,MAAM,EAAE,oBAAoB;aAC7B,CAAA;QACH,CAAC;QAED,MAAM,IAAI,GAAG,SAAS,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;QAC9C,IAAI,IAAI,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,sBAAsB,CAAC,EAAE,CAAC;YACjE,OAAO;gBACL,IAAI,EAAE,EAAE,GAAG,EAAE,OAAO,CAAC,GAA
G,EAAE,IAAI,EAAE,MAAM,EAAE,mBAAmB,EAAE;gBAC7D,MAAM,EAAE,EAAE;gBACV,QAAQ,EAAE,EAAE;gBACZ,cAAc,EAAE,UAAU,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,EAAE,GAAG,WAAW,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC;gBAC7F,OAAO,EAAE,IAAI;gBACb,MAAM,EAAE;oBACN,GAAG,OAAO,CAAC,MAAM;oBACjB,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,sBAAsB,EAAE,OAAO,EAAE,mCAAmC,EAAE;iBACxG;gBACD,MAAM,EAAE,OAAO,CAAC,MAAM,CAAC,MAAM;aAC9B,CAAA;QACH,CAAC;QAED,IAAI,CAAC,YAAY,EAAE,EAAE,CAAC;YACpB,OAAO;gBACL,IAAI,EAAE,EAAE,GAAG,EAAE,OAAO,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,EAAE,MAAM,EAAE,wBAAwB,EAAE;gBACtE,MAAM,EAAE,EAAE;gBACV,QAAQ,EAAE,EAAE;gBACZ,cAAc,EAAE,UAAU,CAAC,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC;gBAChD,OAAO,EAAE,IAAI;gBACb,MAAM,EAAE;oBACN,GAAG,OAAO,CAAC,MAAM;oBACjB,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,mBAAmB,EAAE,OAAO,EAAE,6CAA6C,EAAE;iBAC/G;gBACD,MAAM,EAAE,OAAO,CAAC,MAAM,CAAC,MAAM;aAC9B,CAAA;QACH,CAAC;QAED,MAAM,QAAQ,GAAG,MAAM,iBAAiB,CAAC,0CAA0C,kBAAkB,CAAC,OAAO,CAAC,GAAG,CAAC,gBAAgB,EAAE,OAAO,CAAC,SAAS,EAAE;YACrJ,OAAO,EAAE;gBACP,MAAM,EAAE,iCAAiC;gBACzC,YAAY,EAAE,UAAU;aACzB;SACF,EAAE;YACD,IAAI,EAAE,OAAO;YACb,SAAS,EAAE,OAAO,CAAC,GAAG;SACvB,CAAC,CAAA;QACF,MAAM,IAAI,GAAG,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAE,QAAQ,CAAC,IAAY,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,EAAE,CAAA;QACtE,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,0BAA0B,CAAC,CAAA;QACpD,MAAM,UAAU,GAAG,KAAK,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAA;QACxD,MAAM,aAAa,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAA;QAC1D,OAAO;YACL,IAAI,EAAE,EAAE,GAAG,EAAE,OAAO,CAAC,GAAG,EAAE,IAAI,EAAE,UAAU,EAAE,MAAM,EAAE,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,wBAAwB,EAAE;YAClG,MAAM,EAAE,EAAE;YACV,QAAQ,EAAE,EAAE;YACZ,cAAc,EAAE,UAAU,CAAC,CAAC,GAAG,WAAW,CAAC,IAAI,CAAC,EAAE,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;YAC3E,OAAO,EAAE,IAAI;YACb,MAAM,EAAE,UAAU;gBAChB,CAAC,CAAC,OAAO,CAAC,MAAM;gBAChB,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,EAAE,EAAE,QAAQ,EAAE,aAAa,EAAE,QAAQ,IAAI,aAAa,EAAE,IAAI,EAAE,aAAa,EAAE,IAAI,IAAI,oBAAoB,EAAE,OAAO,EAAE
,aAAa,EAAE,OAAO,IAAI,UAAU,QAAQ,CAAC,MAAM,EAAE,EAAE,CAAC;YAClM,MAAM,EAAE,UAAU,CAAC,CAAC,CAAC,iBAAiB,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM;SAC/D,CAAA;IACH,CAAC;CACF,CAAA"}
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { execFileSync } from "node:child_process";
|
|
1
2
|
import { dedupeUrls, envEnabled, extractMetaDescription, extractMetaProperty, extractTitleFromHtml, extractUrls, stripTags } from "../../shared.js";
|
|
2
3
|
import { fetchProviderJson } from "../provider-policy.js";
|
|
3
4
|
function allowYoutubeTranscript() {
|
|
@@ -16,9 +17,24 @@ async function fetchYoutubeMetadata(url, timeoutMs) {
|
|
|
16
17
|
return null;
|
|
17
18
|
return result.data;
|
|
18
19
|
}
|
|
19
|
-
async function fetchYoutubeTranscript(url) {
|
|
20
|
-
|
|
21
|
-
|
|
20
|
+
async function fetchYoutubeTranscript(url, timeoutMs) {
|
|
21
|
+
try {
|
|
22
|
+
const script = `
|
|
23
|
+
const mod = await import("youtube-transcript/dist/youtube-transcript.esm.js")
|
|
24
|
+
const transcript = await mod.fetchTranscript(${JSON.stringify(url)})
|
|
25
|
+
console.log(JSON.stringify(transcript))
|
|
26
|
+
`;
|
|
27
|
+
const output = execFileSync(process.execPath, ["--input-type=module", "-e", script], {
|
|
28
|
+
timeout: Math.max(1000, timeoutMs),
|
|
29
|
+
encoding: "utf-8",
|
|
30
|
+
maxBuffer: 5 * 1024 * 1024,
|
|
31
|
+
}).trim();
|
|
32
|
+
return (output ? JSON.parse(output) : []);
|
|
33
|
+
}
|
|
34
|
+
catch (error) {
|
|
35
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
36
|
+
throw new Error(message.toLowerCase().includes("timed out") ? `Transcript fetch timed out after ${timeoutMs}ms` : message);
|
|
37
|
+
}
|
|
22
38
|
}
|
|
23
39
|
export const youtubeProvider = {
|
|
24
40
|
id: "youtube",
|
|
@@ -29,6 +45,7 @@ export const youtubeProvider = {
|
|
|
29
45
|
const description = extractMetaProperty(context.active.content, "og:description") || extractMetaDescription(context.active.content);
|
|
30
46
|
const fallbackText = stripTags(context.active.content);
|
|
31
47
|
const author = String(oembed?.author_name || "");
|
|
48
|
+
const thumbnail = String(oembed?.thumbnail_url || "");
|
|
32
49
|
if (!allowYoutubeTranscript()) {
|
|
33
50
|
return {
|
|
34
51
|
post: {
|
|
@@ -36,6 +53,7 @@ export const youtubeProvider = {
|
|
|
36
53
|
title,
|
|
37
54
|
description,
|
|
38
55
|
...(author ? { author } : {}),
|
|
56
|
+
...(thumbnail ? { thumbnail } : {}),
|
|
39
57
|
text: fallbackText.slice(0, 30000),
|
|
40
58
|
transcript_available: false,
|
|
41
59
|
status: fallbackText ? "partial_text_only" : "blocked_or_unavailable",
|
|
@@ -52,7 +70,7 @@ export const youtubeProvider = {
|
|
|
52
70
|
};
|
|
53
71
|
}
|
|
54
72
|
try {
|
|
55
|
-
const transcript = await fetchYoutubeTranscript(context.url);
|
|
73
|
+
const transcript = await fetchYoutubeTranscript(context.url, context.timeoutMs);
|
|
56
74
|
const transcriptText = transcript.map((item) => String(item.text || "").replace(/\s+/g, " ").trim()).filter(Boolean).join(" ");
|
|
57
75
|
return {
|
|
58
76
|
post: {
|
|
@@ -60,6 +78,7 @@ export const youtubeProvider = {
|
|
|
60
78
|
title,
|
|
61
79
|
description,
|
|
62
80
|
...(author ? { author } : {}),
|
|
81
|
+
...(thumbnail ? { thumbnail } : {}),
|
|
63
82
|
text: transcriptText.slice(0, 50000),
|
|
64
83
|
transcript_available: Boolean(transcriptText),
|
|
65
84
|
transcript_segment_count: transcript.length,
|
|
@@ -81,6 +100,7 @@ export const youtubeProvider = {
|
|
|
81
100
|
title,
|
|
82
101
|
description,
|
|
83
102
|
...(author ? { author } : {}),
|
|
103
|
+
...(thumbnail ? { thumbnail } : {}),
|
|
84
104
|
text: fallbackText.slice(0, 30000),
|
|
85
105
|
transcript_available: false,
|
|
86
106
|
status: fallbackText ? "partial_text_only" : "blocked_or_unavailable",
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"youtube.js","sourceRoot":"","sources":["../../../src/smartfetch/providers/youtube.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,UAAU,EAAE,sBAAsB,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAA;AACnJ,OAAO,EAAE,iBAAiB,EAAE,MAAM,uBAAuB,CAAA;AAGzD,SAAS,sBAAsB;IAC7B,OAAO,UAAU,CAAC,sCAAsC,EAAE,IAAI,CAAC,CAAA;AACjE,CAAC;AAED,KAAK,UAAU,oBAAoB,CAAC,GAAW,EAAE,SAAiB;IAChE,MAAM,MAAM,GAAG,MAAM,iBAAiB,CACpC,sCAAsC,kBAAkB,CAAC,GAAG,CAAC,cAAc,EAC3E,SAAS,EACT;QACE,OAAO,EAAE;YACP,MAAM,EAAE,iCAAiC;SAC1C;KACF,EACD;QACE,IAAI,EAAE,OAAO;QACb,SAAS,EAAE,GAAG;KACf,CACF,CAAA;IACD,IAAI,CAAC,MAAM,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,IAAI,IAAI,OAAO,MAAM,CAAC,IAAI,KAAK,QAAQ;QAAE,OAAO,IAAI,CAAA;IAC9E,OAAO,MAAM,CAAC,IAA+B,CAAA;AAC/C,CAAC;AAED,KAAK,UAAU,sBAAsB,CAAC,GAAW;
|
|
1
|
+
{"version":3,"file":"youtube.js","sourceRoot":"","sources":["../../../src/smartfetch/providers/youtube.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAA;AAEjD,OAAO,EAAE,UAAU,EAAE,UAAU,EAAE,sBAAsB,EAAE,mBAAmB,EAAE,oBAAoB,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,iBAAiB,CAAA;AACnJ,OAAO,EAAE,iBAAiB,EAAE,MAAM,uBAAuB,CAAA;AAGzD,SAAS,sBAAsB;IAC7B,OAAO,UAAU,CAAC,sCAAsC,EAAE,IAAI,CAAC,CAAA;AACjE,CAAC;AAED,KAAK,UAAU,oBAAoB,CAAC,GAAW,EAAE,SAAiB;IAChE,MAAM,MAAM,GAAG,MAAM,iBAAiB,CACpC,sCAAsC,kBAAkB,CAAC,GAAG,CAAC,cAAc,EAC3E,SAAS,EACT;QACE,OAAO,EAAE;YACP,MAAM,EAAE,iCAAiC;SAC1C;KACF,EACD;QACE,IAAI,EAAE,OAAO;QACb,SAAS,EAAE,GAAG;KACf,CACF,CAAA;IACD,IAAI,CAAC,MAAM,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,IAAI,IAAI,OAAO,MAAM,CAAC,IAAI,KAAK,QAAQ;QAAE,OAAO,IAAI,CAAA;IAC9E,OAAO,MAAM,CAAC,IAA+B,CAAA;AAC/C,CAAC;AAED,KAAK,UAAU,sBAAsB,CAAC,GAAW,EAAE,SAAiB;IAClE,IAAI,CAAC;QACH,MAAM,MAAM,GAAG;;qDAEkC,IAAI,CAAC,SAAS,CAAC,GAAG,CAAC;;KAEnE,CAAA;QACD,MAAM,MAAM,GAAG,YAAY,CAAC,OAAO,CAAC,QAAQ,EAAE,CAAC,qBAAqB,EAAE,IAAI,EAAE,MAAM,CAAC,EAAE;YACnF,OAAO,EAAE,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,SAAS,CAAC;YAClC,QAAQ,EAAE,OAAO;YACjB,SAAS,EAAE,CAAC,GAAG,IAAI,GAAG,IAAI;SAC3B,CAAC,CAAC,IAAI,EAAE,CAAA;QACT,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,EAAE,CAAsB,CAAA;IAChE,CAAC;IAAC,OAAO,KAAc,EAAE,CAAC;QACxB,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;QACtE,MAAM,IAAI,KAAK,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,oCAAoC,SAAS,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,CAAA;IAC5H,CAAC;AACH,CAAC;AAED,MAAM,CAAC,MAAM,eAAe,GAAuB;IACjD,EAAE,EAAE,SAAS;IACb,OAAO,EAAE,CAAC,IAAI,EAAE,MAAM,EAAE,EAAE,CAAC,MAAM,KAAK,eAAe;IACrD,KAAK,CAAC,SAAS,CAAC,OAA0B;QACxC,MAAM,MAAM,GAAG,MAAM,oBAAoB,CAAC,OAAO,CAAC,GAAG,EAAE,OAAO,CAAC,SAAS,CAAC,CAAA;QACzE,MAAM,KAAK,GAAG,mBAAmB,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,EAAE,UAAU,CAAC,IAAI,MAAM,CAAC,MAAM,EAAE,KAAK,IAAI,EAAE,CAAC,IAAI,oBAAoB,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;QACpJ,MAAM,WAAW,GAAG,mBAAmB,CAAC,OAAO,C
AAC,MAAM,CAAC,OAAO,EAAE,gBAAgB,CAAC,IAAI,sBAAsB,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;QACnI,MAAM,YAAY,GAAG,SAAS,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,CAAA;QACtD,MAAM,MAAM,GAAG,MAAM,CAAC,MAAM,EAAE,WAAW,IAAI,EAAE,CAAC,CAAA;QAChD,MAAM,SAAS,GAAG,MAAM,CAAC,MAAM,EAAE,aAAa,IAAI,EAAE,CAAC,CAAA;QAErD,IAAI,CAAC,sBAAsB,EAAE,EAAE,CAAC;YAC9B,OAAO;gBACL,IAAI,EAAE;oBACJ,GAAG,EAAE,OAAO,CAAC,GAAG;oBAChB,KAAK;oBACL,WAAW;oBACX,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;oBAC7B,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;oBACnC,IAAI,EAAE,YAAY,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC;oBAClC,oBAAoB,EAAE,KAAK;oBAC3B,MAAM,EAAE,YAAY,CAAC,CAAC,CAAC,mBAAmB,CAAC,CAAC,CAAC,wBAAwB;iBACtE;gBACD,MAAM,EAAE,EAAE;gBACV,QAAQ,EAAE,EAAE;gBACZ,cAAc,EAAE,UAAU,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,EAAE,GAAG,WAAW,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC;gBAC7F,OAAO,EAAE,IAAI;gBACb,MAAM,EAAE;oBACN,GAAG,OAAO,CAAC,MAAM;oBACjB,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,6BAA6B,EAAE,OAAO,EAAE,yDAAyD,EAAE;iBACrI;gBACD,MAAM,EAAE,OAAO,CAAC,MAAM,CAAC,MAAM;aAC9B,CAAA;QACH,CAAC;QAED,IAAI,CAAC;YACH,MAAM,UAAU,GAAG,MAAM,sBAAsB,CAAC,OAAO,CAAC,GAAG,EAAE,OAAO,CAAC,SAAS,CAAC,CAAA;YAC/E,MAAM,cAAc,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC,IAAqB,EAAE,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,IAAI,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;YAC/I,OAAO;gBACL,IAAI,EAAE;oBACJ,GAAG,EAAE,OAAO,CAAC,GAAG;oBAChB,KAAK;oBACL,WAAW;oBACX,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;oBAC7B,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;oBACnC,IAAI,EAAE,cAAc,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC;oBACpC,oBAAoB,EAAE,OAAO,CAAC,cAAc,CAAC;oBAC7C,wBAAwB,EAAE,UAAU,CAAC,MAAM;oBAC3C,MAAM,EAAE,cAAc,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,wBAAwB;iBACzD;gBACD,MAAM,EAAE,EAAE;gBACV,QAAQ,EAAE,EAAE;gBACZ,cAAc,EAAE,UAAU,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,EAAE,GAAG,WAAW,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC;gBAC
7F,OAAO,EAAE,CAAC,cAAc;gBACxB,MAAM,EAAE,OAAO,CAAC,MAAM;gBACtB,MAAM,EAAE,oBAAoB;aAC7B,CAAA;QACH,CAAC;QAAC,OAAO,KAAc,EAAE,CAAC;YACxB,MAAM,OAAO,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAA;YACtE,OAAO;gBACL,IAAI,EAAE;oBACJ,GAAG,EAAE,OAAO,CAAC,GAAG;oBAChB,KAAK;oBACL,WAAW;oBACX,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;oBAC7B,GAAG,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;oBACnC,IAAI,EAAE,YAAY,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC;oBAClC,oBAAoB,EAAE,KAAK;oBAC3B,MAAM,EAAE,YAAY,CAAC,CAAC,CAAC,mBAAmB,CAAC,CAAC,CAAC,wBAAwB;iBACtE;gBACD,MAAM,EAAE,EAAE;gBACV,QAAQ,EAAE,EAAE;gBACZ,cAAc,EAAE,UAAU,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC,KAAK,EAAE,GAAG,WAAW,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC;gBAC7F,OAAO,EAAE,IAAI;gBACb,MAAM,EAAE;oBACN,GAAG,OAAO,CAAC,MAAM;oBACjB,EAAE,QAAQ,EAAE,aAAa,EAAE,IAAI,EAAE,gCAAgC,EAAE,OAAO,EAAE;iBAC7E;gBACD,MAAM,EAAE,OAAO,CAAC,MAAM,CAAC,MAAM;aAC9B,CAAA;QACH,CAAC;IACH,CAAC;CACF,CAAA"}
|
package/dist/smartfetch.d.ts
CHANGED
|
@@ -1,4 +1,11 @@
|
|
|
1
1
|
import { SmartfetchOutput, SmartfetchRenderFormat, Target } from "./shared.js";
|
|
2
|
+
import { runBrowserSession } from "./browser-session.js";
|
|
3
|
+
export declare function choosePlaywrightWaitUntil(target: Target): "domcontentloaded" | "networkidle";
|
|
4
|
+
export declare function choosePlaywrightContent(page: {
|
|
5
|
+
html?: string;
|
|
6
|
+
text?: string;
|
|
7
|
+
}): string;
|
|
8
|
+
export declare function shouldUseProviderSeed(target: Target, forceDynamic?: boolean): boolean;
|
|
2
9
|
export type SmartfetchOptions = {
|
|
3
10
|
url: string;
|
|
4
11
|
target?: Target;
|
|
@@ -6,6 +13,9 @@ export type SmartfetchOptions = {
|
|
|
6
13
|
forceDynamic?: boolean;
|
|
7
14
|
strictSchema?: boolean;
|
|
8
15
|
};
|
|
9
|
-
export
|
|
16
|
+
export type SmartfetchRuntime = {
|
|
17
|
+
runBrowserSession?: typeof runBrowserSession;
|
|
18
|
+
};
|
|
19
|
+
export declare function runSmartfetch(options: SmartfetchOptions, runtime?: SmartfetchRuntime): Promise<SmartfetchOutput>;
|
|
10
20
|
export declare function renderSmartfetch(output: SmartfetchOutput, format: SmartfetchRenderFormat): string;
|
|
11
21
|
export declare function renderSmartfetchUrl(url: string, format: SmartfetchRenderFormat, timeoutMs: number): Promise<string>;
|
package/dist/smartfetch.js
CHANGED
|
@@ -1,10 +1,8 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
4
|
-
import { asString, cleanLinks, dedupeUrls, envFlag, extractAnchorHrefs, extractUrls, inferTarget, isDcinsideUrl, needsDynamicCrawl, validateOutboundUrl, } from "./shared.js";
|
|
1
|
+
import { asString, cleanLinks, dedupeUrls, envFlag, extractAnchorHrefs, extractUrls, inferTarget, isDcinsideUrl, needsDynamicCrawl, resolveValidatedUrl, validateOutboundUrl, } from "./shared.js";
|
|
2
|
+
import { runBrowserSession } from "./browser-session.js";
|
|
3
|
+
import { discoverAssets } from "./smartfetch/assets.js";
|
|
5
4
|
import { maybeUseArchiveFallback } from "./smartfetch/archive-fallback.js";
|
|
6
5
|
import { resolveSmartfetchProvider } from "./smartfetch/providers/index.js";
|
|
7
|
-
const thisDir = typeof __dirname !== "undefined" ? __dirname : dirname(fileURLToPath(import.meta.url));
|
|
8
6
|
const DEFAULT_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36";
|
|
9
7
|
let impitCache = undefined;
|
|
10
8
|
function allowPrivateHosts() {
|
|
@@ -22,18 +20,36 @@ async function getImpit() {
|
|
|
22
20
|
return impitCache;
|
|
23
21
|
}
|
|
24
22
|
async function runNativeFetch(url, timeoutMs) {
|
|
23
|
+
const headers = {
|
|
24
|
+
accept: "application/json,text/html,text/plain,*/*",
|
|
25
|
+
"user-agent": DEFAULT_USER_AGENT,
|
|
26
|
+
};
|
|
27
|
+
if (isDcinsideUrl(url)) {
|
|
28
|
+
headers["accept-language"] = "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7";
|
|
29
|
+
headers.referer = "https://gall.dcinside.com/";
|
|
30
|
+
}
|
|
31
|
+
const resolved = await resolveValidatedUrl(url, {
|
|
32
|
+
allowPrivateHosts: allowPrivateHosts(),
|
|
33
|
+
timeoutMs,
|
|
34
|
+
headers,
|
|
35
|
+
});
|
|
36
|
+
if (!resolved.ok) {
|
|
37
|
+
return {
|
|
38
|
+
ok: false,
|
|
39
|
+
method: "plain_fetch",
|
|
40
|
+
content: "",
|
|
41
|
+
links: [],
|
|
42
|
+
error: {
|
|
43
|
+
category: "block",
|
|
44
|
+
code: resolved.reason,
|
|
45
|
+
message: resolved.message,
|
|
46
|
+
},
|
|
47
|
+
};
|
|
48
|
+
}
|
|
25
49
|
const controller = new AbortController();
|
|
26
50
|
const timer = setTimeout(() => controller.abort(), timeoutMs);
|
|
27
51
|
try {
|
|
28
|
-
const
|
|
29
|
-
accept: "application/json,text/html,text/plain,*/*",
|
|
30
|
-
"user-agent": DEFAULT_USER_AGENT,
|
|
31
|
-
};
|
|
32
|
-
if (isDcinsideUrl(url)) {
|
|
33
|
-
headers["accept-language"] = "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7";
|
|
34
|
-
headers.referer = "https://gall.dcinside.com/";
|
|
35
|
-
}
|
|
36
|
-
const response = await fetch(url, { signal: controller.signal, headers });
|
|
52
|
+
const response = await fetch(resolved.url, { signal: controller.signal, headers, redirect: "error" });
|
|
37
53
|
const body = await response.text();
|
|
38
54
|
if (!response.ok) {
|
|
39
55
|
return {
|
|
@@ -73,14 +89,32 @@ async function runImpitFetch(url, timeoutMs) {
|
|
|
73
89
|
const Impit = await getImpit();
|
|
74
90
|
if (!Impit)
|
|
75
91
|
return runNativeFetch(url, timeoutMs);
|
|
92
|
+
const headers = {};
|
|
93
|
+
if (isDcinsideUrl(url)) {
|
|
94
|
+
headers.Referer = "https://gall.dcinside.com/";
|
|
95
|
+
headers["Accept-Language"] = "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7";
|
|
96
|
+
}
|
|
97
|
+
const resolved = await resolveValidatedUrl(url, {
|
|
98
|
+
allowPrivateHosts: allowPrivateHosts(),
|
|
99
|
+
timeoutMs,
|
|
100
|
+
headers,
|
|
101
|
+
});
|
|
102
|
+
if (!resolved.ok) {
|
|
103
|
+
return {
|
|
104
|
+
ok: false,
|
|
105
|
+
method: "impit_fetch",
|
|
106
|
+
content: "",
|
|
107
|
+
links: [],
|
|
108
|
+
error: {
|
|
109
|
+
category: "block",
|
|
110
|
+
code: resolved.reason,
|
|
111
|
+
message: resolved.message,
|
|
112
|
+
},
|
|
113
|
+
};
|
|
114
|
+
}
|
|
76
115
|
try {
|
|
77
116
|
const client = new Impit({ browser: "chrome", timeout: timeoutMs });
|
|
78
|
-
const
|
|
79
|
-
if (isDcinsideUrl(url)) {
|
|
80
|
-
headers.Referer = "https://gall.dcinside.com/";
|
|
81
|
-
headers["Accept-Language"] = "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7";
|
|
82
|
-
}
|
|
83
|
-
const response = await client.fetch(url, { headers });
|
|
117
|
+
const response = await client.fetch(resolved.url, { headers, redirect: "error" });
|
|
84
118
|
const body = await response.text();
|
|
85
119
|
if (!response.ok) {
|
|
86
120
|
return {
|
|
@@ -113,103 +147,44 @@ async function runImpitFetch(url, timeoutMs) {
|
|
|
113
147
|
};
|
|
114
148
|
}
|
|
115
149
|
}
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
let lastError = null
|
|
131
|
-
for (const options of attempts) {
|
|
132
|
-
if (options.executablePath === undefined) delete options.executablePath
|
|
133
|
-
try {
|
|
134
|
-
return await chromium.launch(options)
|
|
135
|
-
} catch (error) {
|
|
136
|
-
lastError = error
|
|
137
|
-
}
|
|
138
|
-
}
|
|
139
|
-
throw lastError
|
|
140
|
-
}
|
|
141
|
-
|
|
142
|
-
const browser = await launchBrowser()
|
|
143
|
-
const context = await browser.newContext({
|
|
144
|
-
userAgent: ${JSON.stringify(DEFAULT_USER_AGENT)},
|
|
145
|
-
locale: "ko-KR",
|
|
146
|
-
})
|
|
147
|
-
const page = await context.newPage()
|
|
148
|
-
await page.goto(${JSON.stringify(url)}, { waitUntil: "networkidle", timeout: ${timeoutMs} })
|
|
149
|
-
|
|
150
|
-
const html = await page.content()
|
|
151
|
-
const text = await page.evaluate(() => document.body?.innerText || "")
|
|
152
|
-
const links = await page.evaluate(() => Array.from(document.querySelectorAll("a[href]")).map((node) => node.href).filter(Boolean))
|
|
153
|
-
|
|
154
|
-
await browser.close()
|
|
155
|
-
|
|
156
|
-
console.log(JSON.stringify({
|
|
157
|
-
ok: Boolean(text.trim() || html.trim()),
|
|
158
|
-
method: "playwright_stealth",
|
|
159
|
-
content: html,
|
|
160
|
-
text,
|
|
161
|
-
links,
|
|
162
|
-
}))
|
|
163
|
-
} catch (error) {
|
|
164
|
-
console.log(JSON.stringify({
|
|
165
|
-
ok: false,
|
|
166
|
-
method: "playwright_stealth",
|
|
167
|
-
content: "",
|
|
168
|
-
links: [],
|
|
169
|
-
error: String(error && error.message ? error.message : error),
|
|
170
|
-
}))
|
|
171
|
-
}
|
|
172
|
-
})()
|
|
173
|
-
`;
|
|
174
|
-
try {
|
|
175
|
-
const output = execFileSync("node", ["-e", script], {
|
|
176
|
-
cwd: resolve(thisDir, ".."),
|
|
177
|
-
timeout: Math.max(timeoutMs + 10000, 20000),
|
|
178
|
-
encoding: "utf-8",
|
|
179
|
-
maxBuffer: 10 * 1024 * 1024,
|
|
180
|
-
}).trim();
|
|
181
|
-
const parsed = JSON.parse(output);
|
|
182
|
-
if (!parsed.ok) {
|
|
183
|
-
return {
|
|
184
|
-
ok: false,
|
|
185
|
-
method: "playwright_stealth",
|
|
186
|
-
content: "",
|
|
187
|
-
links: [],
|
|
188
|
-
error: {
|
|
189
|
-
category: String(parsed.error || "").toLowerCase().includes("timeout") ? "timeout" : "unavailable",
|
|
190
|
-
code: "playwright_fetch_failed",
|
|
191
|
-
message: String(parsed.error || "unknown playwright failure"),
|
|
192
|
-
},
|
|
193
|
-
};
|
|
194
|
-
}
|
|
195
|
-
const content = String(parsed.text || parsed.content || "");
|
|
196
|
-
const links = Array.isArray(parsed.links) ? parsed.links.map((item) => asString(item)) : [];
|
|
197
|
-
return { ok: true, method: "playwright_stealth", content, links: dedupeUrls(links) };
|
|
150
|
+
/**
 * Pick the Playwright `waitUntil` navigation condition for a fetch target.
 *
 * @param {string} target - Inferred target kind (e.g. "dcinside_post").
 * @returns {"domcontentloaded" | "networkidle"} "domcontentloaded" for
 *   dcinside posts, "networkidle" for every other target.
 */
export function choosePlaywrightWaitUntil(target) {
    if (target === "dcinside_post") {
        return "domcontentloaded";
    }
    return "networkidle";
}
|
|
153
|
+
/**
 * Select the content string to keep from a browser-session result.
 *
 * Prefers `page.html`, falls back to `page.text`, and finally to the empty
 * string. A missing (`null`/`undefined`) page yields "" instead of throwing.
 *
 * @param {{ html?: unknown, text?: unknown } | null | undefined} page
 * @returns {string} The chosen content, coerced with `String()`.
 */
export function choosePlaywrightContent(page) {
    return String(page?.html || page?.text || "");
}
|
|
156
|
+
/**
 * Decide whether a target is served from a provider seed.
 *
 * @param {string} target - Inferred target kind.
 * @param {unknown} forceDynamic - Truthy when the caller forces a dynamic fetch.
 * @returns {boolean} true only when `forceDynamic` is falsy and the target is
 *   "youtube_video" or "x_post".
 */
export function shouldUseProviderSeed(target, forceDynamic) {
    if (forceDynamic) {
        return false;
    }
    const seededTargets = ["youtube_video", "x_post"];
    return seededTargets.includes(target);
}
|
|
159
|
+
/**
 * Load `url` through `runBrowserSession` and adapt the outcome to the fetch
 * result shape used by this module.
 *
 * @param {string} url - Page to load.
 * @param {number} timeoutMs - Timeout forwarded to the browser session.
 * @param {string} target - Inferred target kind; selects the `waitUntil` mode.
 * @returns {Promise<{ok: boolean, method: string, content: string, links: string[], error?: unknown}>}
 *   On failure: empty content/links plus the session's `error`. On success:
 *   the chosen content and the de-duplicated link list.
 */
async function runPlaywrightFetch(url, timeoutMs, target) {
    const extraHeaders = { "user-agent": DEFAULT_USER_AGENT };
    // dcinside URLs additionally get a Korean accept-language and a gallery referer.
    if (isDcinsideUrl(url)) {
        extraHeaders["accept-language"] = "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7";
        extraHeaders.referer = "https://gall.dcinside.com/";
    }
    const result = await runBrowserSession({
        url,
        timeoutMs,
        allowPrivateHosts: allowPrivateHosts(),
        waitUntil: choosePlaywrightWaitUntil(target),
        extraHeaders,
        maxAnchors: 400,
    });
    if (!result.ok) {
        return {
            ok: false,
            method: result.method,
            content: "",
            links: [],
            error: result.error,
        };
    }
    // A successful result without a links array is treated as "no links"
    // rather than crashing on `.map` of a non-array.
    const links = Array.isArray(result.links)
        ? result.links.map((item) => asString(item))
        : [];
    return {
        ok: true,
        method: result.method,
        content: choosePlaywrightContent(result),
        links: dedupeUrls(links),
    };
}
|
|
214
189
|
function baseOutput(url, target) {
|
|
215
190
|
return {
|
|
@@ -223,6 +198,7 @@ function baseOutput(url, target) {
|
|
|
223
198
|
post: null,
|
|
224
199
|
thread: [],
|
|
225
200
|
comments: [],
|
|
201
|
+
assets: [],
|
|
226
202
|
outbound_links: [],
|
|
227
203
|
evidence: {
|
|
228
204
|
impit_attempted: false,
|
|
@@ -245,6 +221,7 @@ function validateOutput(output) {
|
|
|
245
221
|
"post",
|
|
246
222
|
"thread",
|
|
247
223
|
"comments",
|
|
224
|
+
"assets",
|
|
248
225
|
"outbound_links",
|
|
249
226
|
"evidence",
|
|
250
227
|
];
|
|
@@ -254,7 +231,7 @@ function validateOutput(output) {
|
|
|
254
231
|
}
|
|
255
232
|
return "";
|
|
256
233
|
}
|
|
257
|
-
export async function runSmartfetch(options) {
|
|
234
|
+
export async function runSmartfetch(options, runtime = {}) {
|
|
258
235
|
const target = inferTarget(options.url, options.target || "auto");
|
|
259
236
|
const timeoutMs = Math.max(3000, Math.floor(Number(options.timeoutMs || 25000)));
|
|
260
237
|
const output = baseOutput(options.url, target);
|
|
@@ -274,13 +251,22 @@ export async function runSmartfetch(options) {
|
|
|
274
251
|
}
|
|
275
252
|
let active;
|
|
276
253
|
if (options.forceDynamic) {
|
|
277
|
-
active = await
|
|
254
|
+
active = await runPlaywrightFetchWithRuntime(options.url, timeoutMs, target, runtime);
|
|
278
255
|
output.evidence.playwright_attempted = true;
|
|
279
256
|
output.evidence.playwright_success = active.ok;
|
|
280
257
|
output.retrieval_method.push(active.method);
|
|
281
258
|
if (active.error)
|
|
282
259
|
output.errors.push(active.error);
|
|
283
260
|
}
|
|
261
|
+
else if (shouldUseProviderSeed(target, options.forceDynamic)) {
|
|
262
|
+
active = {
|
|
263
|
+
ok: true,
|
|
264
|
+
method: target === "youtube_video" ? "youtube_metadata_seed" : "x_public_seed",
|
|
265
|
+
content: "",
|
|
266
|
+
links: [],
|
|
267
|
+
};
|
|
268
|
+
output.retrieval_method.push(active.method);
|
|
269
|
+
}
|
|
284
270
|
else {
|
|
285
271
|
const tier1 = await runImpitFetch(options.url, timeoutMs);
|
|
286
272
|
output.evidence.impit_attempted = true;
|
|
@@ -290,7 +276,7 @@ export async function runSmartfetch(options) {
|
|
|
290
276
|
output.errors.push(tier1.error);
|
|
291
277
|
active = tier1;
|
|
292
278
|
if (needsDynamicCrawl(active)) {
|
|
293
|
-
const tier2 = await
|
|
279
|
+
const tier2 = await runPlaywrightFetchWithRuntime(options.url, timeoutMs, target, runtime);
|
|
294
280
|
output.evidence.playwright_attempted = true;
|
|
295
281
|
output.evidence.playwright_success = tier2.ok;
|
|
296
282
|
output.retrieval_method.push(tier2.method);
|
|
@@ -323,8 +309,10 @@ export async function runSmartfetch(options) {
|
|
|
323
309
|
output.outbound_links = normalized.outbound_links;
|
|
324
310
|
output.partial = normalized.partial;
|
|
325
311
|
output.errors = normalized.errors;
|
|
312
|
+
output.assets = discoverAssets(options.url, active.content, output.post, output.outbound_links);
|
|
326
313
|
output.retrieval_method = output.retrieval_method.filter((value, index, list) => value && list.indexOf(value) === index);
|
|
327
314
|
output.outbound_links = dedupeUrls(cleanLinks(dedupeUrls(output.outbound_links)));
|
|
315
|
+
output.assets = output.assets.filter((asset, index, list) => asset.url && list.findIndex((item) => item.url === asset.url) === index);
|
|
328
316
|
if (options.strictSchema !== false) {
|
|
329
317
|
const error = validateOutput(output);
|
|
330
318
|
if (error) {
|
|
@@ -355,7 +343,10 @@ export function renderSmartfetch(output, format) {
|
|
|
355
343
|
return body ? `${author}: ${body}` : "";
|
|
356
344
|
}).filter(Boolean)
|
|
357
345
|
: [];
|
|
358
|
-
|
|
346
|
+
const assets = Array.isArray(output.assets)
|
|
347
|
+
? output.assets.slice(0, 15).map((asset) => `${asset.kind}: ${asset.download_url}`)
|
|
348
|
+
: [];
|
|
349
|
+
return [title, meta, text, comments.join("\n"), cleanLinks(output.outbound_links || []).join("\n"), assets.join("\n")].filter(Boolean).join("\n\n").trim();
|
|
359
350
|
}
|
|
360
351
|
const parts = [];
|
|
361
352
|
const url = String(post?.url || output.url || "").trim();
|
|
@@ -388,6 +379,11 @@ export function renderSmartfetch(output, format) {
|
|
|
388
379
|
const links = cleanLinks(output.outbound_links || []).slice(0, 20).map((item) => `- ${item}`);
|
|
389
380
|
if (links.length > 0)
|
|
390
381
|
parts.push(`## Links\n\n${links.join("\n")}`);
|
|
382
|
+
const assets = Array.isArray(output.assets)
|
|
383
|
+
? output.assets.slice(0, 20).map((asset) => `- ${asset.kind}: ${asset.download_url}`)
|
|
384
|
+
: [];
|
|
385
|
+
if (assets.length > 0)
|
|
386
|
+
parts.push(`## Assets\n\n${assets.join("\n")}`);
|
|
391
387
|
if (Array.isArray(output.errors) && output.errors.length > 0) {
|
|
392
388
|
const errors = output.errors.map((item) => `- ${item?.code || "error"}: ${item?.message || "unknown"}`);
|
|
393
389
|
parts.push(`## Notes\n\n${errors.join("\n")}`);
|
|
@@ -402,4 +398,36 @@ export async function renderSmartfetchUrl(url, format, timeoutMs) {
|
|
|
402
398
|
});
|
|
403
399
|
return renderSmartfetch(output, format);
|
|
404
400
|
}
|
|
401
|
+
/**
 * Run the Playwright fetch, using `runtime.runBrowserSession` when the caller
 * supplied one and falling back to `runPlaywrightFetch` otherwise.
 *
 * The request options and result shaping mirror `runPlaywrightFetch`.
 *
 * @param {string} url - Page to load.
 * @param {number} timeoutMs - Timeout forwarded to the browser session.
 * @param {string} target - Inferred target kind; selects the `waitUntil` mode.
 * @param {{ runBrowserSession?: Function } | null | undefined} runtime -
 *   Optional overrides; a missing runtime or missing `runBrowserSession`
 *   selects the default implementation.
 * @returns {Promise<{ok: boolean, method: string, content: string, links: string[], error?: unknown}>}
 */
async function runPlaywrightFetchWithRuntime(url, timeoutMs, target, runtime) {
    // `runtime` may be null (a default parameter upstream only covers undefined).
    if (!runtime?.runBrowserSession) {
        return runPlaywrightFetch(url, timeoutMs, target);
    }
    const extraHeaders = { "user-agent": DEFAULT_USER_AGENT };
    // dcinside URLs additionally get a Korean accept-language and a gallery referer.
    if (isDcinsideUrl(url)) {
        extraHeaders["accept-language"] = "ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7";
        extraHeaders.referer = "https://gall.dcinside.com/";
    }
    // Called as a method so an injected session keeps `runtime` as its `this`.
    const result = await runtime.runBrowserSession({
        url,
        timeoutMs,
        allowPrivateHosts: allowPrivateHosts(),
        waitUntil: choosePlaywrightWaitUntil(target),
        extraHeaders,
        maxAnchors: 400,
    });
    if (!result.ok) {
        return {
            ok: false,
            method: result.method,
            content: "",
            links: [],
            error: result.error,
        };
    }
    // An injected session may omit `links`; treat that as "no links"
    // rather than crashing on `.map` of a non-array.
    const links = Array.isArray(result.links)
        ? result.links.map((item) => asString(item))
        : [];
    return {
        ok: true,
        method: result.method,
        content: choosePlaywrightContent(result),
        links: dedupeUrls(links),
    };
}
|
|
405
433
|
//# sourceMappingURL=smartfetch.js.map
|