@isdk/web-searcher 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.cn.md +196 -7
- package/README.md +196 -7
- package/dist/index.d.mts +234 -11
- package/dist/index.d.ts +234 -11
- package/dist/index.js +1 -1
- package/dist/index.mjs +1 -1
- package/docs/README.md +196 -7
- package/docs/classes/GoogleSearcher.md +289 -60
- package/docs/classes/WebSearcher.md +264 -61
- package/docs/functions/extractDate.md +42 -0
- package/docs/functions/extractMetadataFrom.md +40 -0
- package/docs/functions/fetchHeaders.md +34 -0
- package/docs/functions/fetchPartial.md +41 -0
- package/docs/functions/normalizeDate.md +29 -0
- package/docs/functions/parseHeaders.md +28 -0
- package/docs/functions/parseHtml.md +31 -0
- package/docs/functions/testUrlsByLatency.md +42 -0
- package/docs/globals.md +18 -0
- package/docs/interfaces/CustomTimeRange.md +3 -3
- package/docs/interfaces/ExtractOptions.md +54 -0
- package/docs/interfaces/FetchExtractorOptions.md +35 -0
- package/docs/interfaces/FetcherOptions.md +436 -0
- package/docs/interfaces/HtmlData.md +53 -0
- package/docs/interfaces/MetadataResult.md +27 -0
- package/docs/interfaces/PaginationConfig.md +9 -9
- package/docs/interfaces/SearchContext.md +30 -4
- package/docs/interfaces/SearchOptions.md +77 -11
- package/docs/interfaces/StandardSearchResult.md +10 -10
- package/docs/interfaces/VerifiedUrl.md +25 -0
- package/docs/type-aliases/MetadataType.md +13 -0
- package/docs/type-aliases/SafeSearchLevel.md +1 -1
- package/docs/type-aliases/SearchCategory.md +2 -2
- package/docs/type-aliases/SearchTimeRange.md +1 -1
- package/docs/type-aliases/SearchTimeRangePreset.md +1 -1
- package/docs/type-aliases/SearcherConstructor.md +2 -2
- package/package.json +3 -2
package/dist/index.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
/*
 * Minified CommonJS bundle (removed side of the diff).
 * NOTE(review): the hunk header says this file is a single line; the line breaks in
 * this listing fall mid-statement (after `const `), so they look like wrapping
 * artifacts of the listing rather than part of the published file - confirm before copying.
 *
 * Identifier map (from the export table handed to the first helper call):
 *   h -> WebSearcher, f -> GoogleSearcher
 *   n -> require("@isdk/web-fetcher"), o -> require("custom-factory"),
 *   c / u -> require("lodash-es")
 *   l(value, vars) -> template interpolator: replaces ${name} placeholders in strings,
 *     recursing through arrays and plain objects; a placeholder whose variable is
 *     undefined becomes "". Other values are returned unchanged.
 *
 * WebSearcher (h) extends n.FetchSession:
 *   static search(engine, query, options = {})
 *     Builds the engine with this.createObject(engine, options), throws
 *     "Search engine not found: <engine>" when nothing is returned, runs the
 *     instance search and always awaits dispose() afterwards.
 *   get pagination()  -> undefined in the base class.
 *   createContext(options = this.options)
 *     Merges options over this.template with defaultsDeep; options.engine wins
 *     only when the template has no engine or the template engine is "auto".
 *   search(query, options = {})
 *     Collects results page by page. limit = options.limit || 10; the page count
 *     is capped by options.maxPages || pagination.maxPages || 10.
 *     Per page: the template is interpolated with the options, formatOptions()
 *     output, query, page, offset (startValue + pageIndex * increment) and limit.
 *     A "goto" action is prepended on the first page, or on every page for
 *     "url-param" pagination, unless the template actions already navigate to the
 *     same URL; "click-next" pagination prepends a click on nextButtonSelector
 *     plus a waitFor instead.
 *     The loop stops on an empty page, when limit is reached, when no pagination
 *     is configured, or at the page cap. Returns at most `limit` items.
 *     NOTE(review): the expression `p.engine&&this.context.engine!==p.engine&&p.engine;`
 *     discards its value; it only reads this.context.
 *   transform(outputs) -> outputs.results || [];  formatOptions(o) -> shallow copy.
 *   After the class: _isFactory is set to false, addBaseFactoryAbility is applied
 *   and prototype.name is set to "Searcher".
 *
 * GoogleSearcher (f) extends WebSearcher; f.alias is set to ["google"]:
 *   template   -> browser engine, headless false, a google.com/search URL with
 *                 placeholders, and one "extract" action stored as "results".
 *   pagination -> url-param on "start", startValue 0, increment 10.
 *   formatOptions(options)
 *     timeRange preset (hour/day/week/month/year) -> tbs "qdr:h|d|w|m|y";
 *     timeRange {from, to} -> tbs "cdr:1,cd_min:M/D/YYYY,cd_max:M/D/YYYY" (local
 *     date parts, `to` defaults to now, skipped when either date is invalid);
 *     category images/videos/news -> tbm isch/vid/nws; region -> gl; language -> hl;
 *     safeSearch "strict" -> safe "active", "off" -> safe "images".
 *     NOTE(review): confirm "images" is the intended value for safeSearch "off".
 *   transform(outputs)
 *     For results whose url starts with "/url?q=", replaces url with the decoded
 *     `q` parameter (URL parse errors are ignored). Non-array results yield [].
 */
"use strict";var e,t=Object.defineProperty,r=Object.getOwnPropertyDescriptor,s=Object.getOwnPropertyNames,a=Object.prototype.hasOwnProperty,i={};((e,r)=>{for(var s in r)t(e,s,{get:r[s],enumerable:!0})})(i,{GoogleSearcher:()=>f,WebSearcher:()=>h}),module.exports=(e=i,((e,i,n,o)=>{if(i&&"object"==typeof i||"function"==typeof i)for(let c of s(i))a.call(e,c)||c===n||t(e,c,{get:()=>i[c],enumerable:!(o=r(i,c))||o.enumerable});return e})(t({},"__esModule",{value:!0}),e));var n=require("@isdk/web-fetcher"),o=require("custom-factory"),c=require("lodash-es");function l(e,t){if("string"==typeof e)return e.replace(/\$\{(.*?)\}/g,(e,r)=>{const s=t[r.trim()];return void 0!==s?String(s):""});if(Array.isArray(e))return e.map(e=>l(e,t));if((0,c.isPlainObject)(e)){const r={};for(const s in e)Object.prototype.hasOwnProperty.call(e,s)&&(r[s]=l(e[s],t));return r}return e}var u=require("lodash-es"),h=class extends n.FetchSession{static async search(e,t,r={}){const s=this.createObject(e,r);if(!s)throw new Error(`Search engine not found: ${e}`);try{return await s.search(t,r)}finally{await s.dispose()}}get pagination(){}createContext(e=this.options){const t=this.template,r=(0,u.defaultsDeep)({},t,e);return t.engine&&"auto"!==t.engine||!e.engine||(r.engine=e.engine),super.createContext(r)}async search(e,t={}){const r=t.limit||10,s=[];let a=0;const i=this.pagination?.startValue??0,n=this.pagination?.increment??1,o=t.maxPages||this.pagination?.maxPages||10;for(;s.length<r;){const 
c=this.formatOptions(t),h=i+a*n,f={...t,...c,query:e,page:a+i,offset:h,limit:r},m=l(this.template,f),{actions:d,...g}=t,p=(0,u.defaultsDeep)({},m,g),w=[],y=p.actions||[];if(0===a||"url-param"===this.pagination?.type){if(p.url){y.some(e=>"goto"===(e.id??e.name??e.action)&&e.params?.url===p.url)||w.push({id:"goto",params:{url:p.url}})}}else"click-next"===this.pagination?.type&&this.pagination.nextButtonSelector&&(w.push({id:"click",params:{selector:this.pagination.nextButtonSelector}}),w.push({id:"waitFor",params:{networkIdle:!0,ms:500}}));w.push(...y),p.engine&&this.context.engine!==p.engine&&p.engine;const{outputs:b}=await this.executeAll(w),q={query:e,page:a,limit:t.limit};let $=[];if($=await this.transform(b,q),t.transform&&($=await t.transform($,q)),!$||0===$.length)break;if(s.push(...$),s.length>=r||!this.pagination)break;if(a++,a>=o)break}return s.slice(0,r)}async transform(e,t){return e.results||[]}formatOptions(e){return{...e}}};h._isFactory=!1,(0,o.addBaseFactoryAbility)(h),h.prototype.name="Searcher";var f=class extends h{get template(){return{engine:"browser",browser:{headless:!1},url:"https://www.google.com/search?q=${query}&start=${offset}&tbs=${tbs}&tbm=${tbm}&gl=${gl}&hl=${hl}&safe=${safe}",actions:[{id:"extract",storeAs:"results",params:{type:"array",selector:"#main #search",items:{url:{selector:"a:has(h3)",attribute:"href",required:!0},title:{selector:"a:has(h3) h3",required:!0,mode:"innerText"},snippet:{selector:"div[style*='-webkit-line-clamp']",type:"html"}}}}]}}get pagination(){return{type:"url-param",paramName:"start",startValue:0,increment:10}}formatOptions(e){const t={};if(e.timeRange)if("string"==typeof e.timeRange){const r={hour:"qdr:h",day:"qdr:d",week:"qdr:w",month:"qdr:m",year:"qdr:y"};r[e.timeRange]&&(t.tbs=r[e.timeRange])}else{const r=new Date(e.timeRange.from),s=e.timeRange.to?new Date(e.timeRange.to):new Date;if(!isNaN(r.getTime())&&!isNaN(s.getTime())){const 
e=e=>`${e.getMonth()+1}/${e.getDate()}/${e.getFullYear()}`;t.tbs=`cdr:1,cd_min:${e(r)},cd_max:${e(s)}`}}if(e.category){const r={images:"isch",videos:"vid",news:"nws"};r[e.category]&&(t.tbm=r[e.category])}return e.region&&(t.gl=e.region),e.language&&(t.hl=e.language),e.safeSearch&&("strict"===e.safeSearch?t.safe="active":"off"===e.safeSearch&&(t.safe="images")),t}async transform(e){const t=e.results||[];return Array.isArray(t)?t.map(e=>{if(e.url&&e.url.startsWith("/url?q="))try{const t=new URL(e.url,"https://www.google.com").searchParams.get("q");t&&(e.url=t)}catch(e){}return e}):[]}};f.alias=["google"];
|
|
1
|
+
/*
 * Minified CommonJS bundle (added side of the diff).
 * NOTE(review): the hunk header says this file is a single line; the line breaks in
 * this listing fall mid-statement and, in one place, inside the string literal
 * "a:has(h3) h3", so they look like wrapping artifacts of the listing - confirm
 * against the published file before copying.
 *
 * Identifier map (from the export table handed to the helper before module.exports):
 *   a -> fetchHeaders        o -> fetchPartial      c -> parseHeaders
 *   l -> parseHtml           d -> normalizeDate     h -> extractMetadataFrom
 *   p -> extractDate         b -> testUrlsByLatency
 *   q / A -> WebSearcher     O -> GoogleSearcher    y.FetcherOptions -> FetcherOptions
 *   u -> internal regex fallback for JSON-LD date fields
 *   f -> internal HTML attribute-string parser
 *   k -> internal ${name} template interpolator
 *   m / y -> require("@isdk/web-fetcher"), w -> require("custom-factory"),
 *   g / $ -> require("lodash-es")
 *
 * fetchHeaders(url, {timeout = 5000, headers})
 *   HEAD request with a fixed Chrome User-Agent (caller headers are spread after it),
 *   aborted after `timeout` ms. Returns the response headers, or null on any error.
 * fetchPartial(url, maxBytes = 32768, {timeout = 10000, headers})
 *   GET request; returns null when the response is not ok or has no body. Decodes
 *   the stream with the charset found in Content-Type (default "utf-8") and aborts
 *   once the bytes read reach maxBytes. Returns {content, headers}; if an error
 *   occurs after some content was decoded, that partial content is returned.
 * parseHeaders(headers) -> plain object with lower-cased header names.
 * parseHtml(html) -> {meta, jsonLd, time}, extracted with regular expressions:
 *   meta   : content keyed by lower-cased name / property / itemprop;
 *   jsonLd : parsed application/ld+json scripts (on JSON.parse failure, the date
 *            fields recovered by the regex fallback, if any);
 *   time   : {datetime, text} for each <time> element (inner tags stripped).
 * normalizeDate(text)
 *   Trims, strips leading "published/updated/..." style prefixes, cuts the string
 *   at "(", "|", "by " or "- N min", then parses with Date. Returns an ISO string
 *   when the UTC year lies between -10000 and the current year + 20, else null.
 * extractMetadataFrom(htmlData, type)
 *   Only type "date" is handled (anything else returns null). Sources are tried in
 *   order: JSON-LD date fields (including nested arrays and @graph), known meta
 *   keys, <time> elements, then the Last-Modified header.
 * extractDate(url, options = {})
 *   fetchPartial(url, options.maxBytes, options) followed by
 *   extractMetadataFrom(result, "date"); null when the fetch returns null.
 * testUrlsByLatency(urls, {timeout = 5000, limit, testPath = "", proxy})
 *   Fetches every URL (joined with testPath using a single "/") in parallel via
 *   fetchWeb with engine "http"; failed URLs are dropped. Returns
 *   [{url, latency}] sorted by ascending latency, cut to `limit` when it is a
 *   non-zero number.
 *
 * WebSearcher (q, aliased as A) extends y.FetchSession:
 *   static defaultOptions (getter/setter) -> per-class own property _defaultOptions.
 *   static getDefaultOptions()
 *     Walks the prototype chain from the calling class up to the base class and
 *     merges each own _defaultOptions with defaultsDeep, most-derived class first.
 *   static search(engines, query, options = {})
 *     Accepts one engine name or an array and queries the engines sequentially.
 *     Each engine receives limit = remaining count; results are de-duplicated by
 *     url. Stops when the limit is reached or when fillLimit === false after a
 *     successful engine. "Search engine not found" is thrown immediately; any other
 *     engine failure is logged with console.warn and rethrown only when it is the
 *     last engine and nothing has been collected. Each instance is disposed.
 *   get template() -> {};  get pagination() -> undefined;
 *   getTemplate() -> deep clone of this.template.
 *   createContext(options = this.options)
 *     Merges options over the template without its actions; options.engine wins
 *     only when the template has no engine or its engine is "auto".
 *   search(query, options = {})
 *     Option precedence: call options, then this.options, then class defaults.
 *     limit defaults to 10, startPage to 0, page cap to 10.
 *     Base URLs come from options.baseUrls (an array, or a map keyed by
 *     this.constructor.name or the first alias), falling back to the static
 *     defaultBaseUrls. The rotation index is kept in the static
 *     currentInstanceIndex. For every page each base URL is tried at most once;
 *     a failure rotates to the next one, and the last error is thrown when all
 *     fail. Results must pass validateFetchResult and options.validator, otherwise
 *     "Results validation failed ..." is thrown (and treated as a failure).
 *     Results are de-duplicated by url; an empty page ends pagination.
 *     NOTE(review): this.constructor.name is read at runtime while the classes in
 *     this build are bound to short identifiers - verify which name callers must
 *     use as the baseUrls key.
 *     NOTE(review): `v.engine&&this.context.engine!==v.engine&&v.engine;` discards
 *     its value; it only reads this.context.
 *   validateFetchResult() -> true;  transform(outputs) -> outputs.results || [];
 *   formatOptions(o) -> shallow copy.
 *
 * GoogleSearcher (O) extends WebSearcher; O.alias is set to ["google"]:
 *   template, pagination, formatOptions and transform map search options onto the
 *   google.com/search URL parameters (tbs, tbm, gl, hl, safe) and unwrap
 *   "/url?q=" redirect links in the extracted results.
 *   NOTE(review): safeSearch "off" maps to safe "images" - confirm this is intended.
 */
"use strict";var t,e=Object.defineProperty,r=Object.getOwnPropertyDescriptor,n=Object.getOwnPropertyNames,s=Object.prototype.hasOwnProperty,i={};async function a(t,e={}){const{timeout:r=5e3,headers:n}=e,s=new AbortController,i=setTimeout(()=>s.abort(),r);try{return(await fetch(t,{method:"HEAD",signal:s.signal,headers:{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",...n}})).headers}catch(t){return null}finally{clearTimeout(i)}}async function o(t,e=32768,r={}){const{timeout:n=1e4,headers:s}=r,i=new AbortController,a=setTimeout(()=>i.abort(),n);let o="",c=new Headers;try{const r=await fetch(t,{signal:i.signal,headers:{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",...s}});if(c=r.headers,!r.ok||!r.body)return null;const n=r.headers.get("content-type"),a=n?.match(/charset=([\w-]+)/i),l=a?a[1]:"utf-8",u=r.body.getReader(),f=new TextDecoder(l);let d=0;for(;;)try{const{done:t,value:r}=await u.read();if(t)break;if(d+=r.length,o+=f.decode(r,{stream:!0}),d>=e){i.abort();break}}catch(t){if("AbortError"===t.name)break;throw t}return{content:o,headers:c}}catch(t){return o.length>0?{content:o,headers:c}:null}finally{clearTimeout(a)}}function c(t){const e={};return t.forEach((t,r)=>{e[r.toLowerCase()]=t}),e}function l(t){const e={meta:{},jsonLd:[],time:[]},r=/<meta\s+([^>]+?)>/gi;let n;for(;null!==(n=r.exec(t));){const t=f(n[1]),r=t.name||t.property||t.itemprop,s=t.content;r&&s&&(e.meta[r.toLowerCase()]=s)}const s=/<script\s+[^>]*?type\s*=\s*["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;for(;null!==(n=s.exec(t));){const t=n[1];try{const r=JSON.parse(t);e.jsonLd.push(r)}catch(r){const n=u(t);n&&e.jsonLd.push(n)}}const i=/<time([^>]*?)>([\s\S]*?)<\/time>/gi;for(;null!==(n=i.exec(t));){const t=f(n[1]).datetime,r=n[2].replace(/<[^>]*>/g,"").trim();e.time.push({datetime:t,text:r})}return e}function u(t){const 
e=["datePublished","dateModified","pubDate","publishedAt"],r={};let n=!1;for(const s of e){const e=new RegExp(`"${s}"\\s*:\\s*"([^"]+)"`,"i"),i=t.match(e);i&&(r[s]=i[1],n=!0)}return n?r:null}function f(t){const e={},r=/([a-z0-9:._-]+)(?:\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)')|([^>\s]+)))?/gi;let n;for(;null!==(n=r.exec(t));){const t=n[1].toLowerCase(),r=n[2]??n[3]??n[4]??"";e[t]=r}return e}function d(t){if(!t)return null;try{let e=t.trim();if(!e)return null;e=e.replace(/^(?:last|first|posted|originally)\s*(?:published|updated|date|posted|modified)\s*(?:on|at)?[:\s]*/i,""),e=e.replace(/^(?:published|updated|date|posted|modified)\s*(?:on|at)?[:\s]*/i,""),e=e.split(/[\(|\|]|by\s+|[-–—]\s*\d+\s*min/i)[0].trim();const r=new Date(e);if(!isNaN(r.getTime())){const t=r.getUTCFullYear(),e=(new Date).getUTCFullYear();if(t>=-1e4&&t<=e+20)return r.toISOString()}}catch(t){}return null}function h(t,e){const r=l(t.content);return"date"===e?function(t,e){const r=function(t){const e=["datePublished","dateModified","pubDate","publishedAt"],r=t=>{if(!t||"object"!=typeof t)return null;for(const r of e)if("string"==typeof t[r])return t[r];if(Array.isArray(t))for(const e of t){const t=r(e);if(t)return t}else if(t["@graph"]&&Array.isArray(t["@graph"]))return r(t["@graph"]);return null};return r(t)}(t.jsonLd),n=d(r);if(n)return n;const s=function(t){const e=["article:published_time","og:published_time","datepublished","date","pubdate","publishdate","dc.date.issued","bt:pubdate","sailthru.date","article:modified_time","og:updated_time","modifieddate"];for(const r of e)if(t[r])return t[r];return null}(t.meta),i=d(s);if(i)return i;for(const e of t.time){const t=d(e.datetime||e.text);if(t)return t}const a=c(e);return d(a["last-modified"])}(r,t.headers):null}async function p(t,e={}){const r=await o(t,e.maxBytes,e);return r?h(r,"date"):null}((t,r)=>{for(var n in 
r)e(t,n,{get:r[n],enumerable:!0})})(i,{FetcherOptions:()=>y.FetcherOptions,GoogleSearcher:()=>O,WebSearcher:()=>A,extractDate:()=>p,extractMetadataFrom:()=>h,fetchHeaders:()=>a,fetchPartial:()=>o,normalizeDate:()=>d,parseHeaders:()=>c,parseHtml:()=>l,testUrlsByLatency:()=>b}),module.exports=(t=i,((t,i,a,o)=>{if(i&&"object"==typeof i||"function"==typeof i)for(let c of n(i))s.call(t,c)||c===a||e(t,c,{get:()=>i[c],enumerable:!(o=r(i,c))||o.enumerable});return t})(e({},"__esModule",{value:!0}),t));var m=require("@isdk/web-fetcher");async function b(t,e={}){const{timeout:r=5e3,limit:n,testPath:s="",proxy:i}=e;let a=await Promise.all(t.map(async t=>{const e=Date.now();try{const n=s?(t.endsWith("/")?t.slice(0,-1):t)+(s.startsWith("/")?s:"/"+s):t;return await(0,m.fetchWeb)(n,{timeoutMs:r,proxy:i,throwHttpErrors:!0,enableSmart:!1,engine:"http"}),{url:t,latency:Date.now()-e}}catch(t){return}}));return a=a.filter(t=>null!=t).sort((t,e)=>t.latency-e.latency),"number"==typeof n&&n&&(a=a.slice(0,n)),a}var y=require("@isdk/web-fetcher"),w=require("custom-factory"),g=require("lodash-es");function k(t,e){if("string"==typeof t)return t.replace(/\$\{(.*?)\}/g,(t,r)=>{const n=e[r.trim()];return void 0!==n?String(n):""});if(Array.isArray(t))return t.map(t=>k(t,e));if((0,g.isPlainObject)(t)){const r={};for(const n in t)Object.prototype.hasOwnProperty.call(t,n)&&(r[n]=k(t[n],e));return r}return t}var $=require("lodash-es"),q=class t extends y.FetchSession{static get defaultOptions(){return Object.prototype.hasOwnProperty.call(this,"_defaultOptions")||(this._defaultOptions={}),this._defaultOptions}static set defaultOptions(t){this._defaultOptions=t}static getDefaultOptions(){const e=[];let r=this;for(;r&&r!==Object.prototype&&(Object.prototype.hasOwnProperty.call(r,"_defaultOptions")&&r._defaultOptions&&e.push(r._defaultOptions),r!==t);)r=Object.getPrototypeOf(r);return e.length>0?(0,$.defaultsDeep)({},...e):{}}static async search(t,e,r={}){const n=Array.isArray(t)?t:[t],s=[];for(let 
t=0;t<n.length;t++){const i=n[t],a=this.get(i),o=a?a.getDefaultOptions():this.getDefaultOptions(),c=(0,$.defaultsDeep)({},r,o),l=c.limit||10;if(s.length>=l)break;const u=l-s.length,f={...r,limit:u},d=this.createObject(i,f);if(!d)throw new Error(`Search engine not found: ${i}`);try{const t=await d.search(e,f);for(const e of t)e.url&&!s.some(t=>t.url===e.url)&&s.push(e);if(s.length>=l)break;if(!1===c.fillLimit)break}catch(e){if(console.warn(`[WebSearcher] Engine '${i}' failed completely:`,e),t===n.length-1&&0===s.length)throw e}finally{await d.dispose()}}return s}get template(){return{}}get pagination(){}getTemplate(t,e){return(0,$.cloneDeep)(this.template)}createContext(t=this.options){const{actions:e,...r}=this.template,n=(0,$.defaultsDeep)({},r,t);return r.engine&&"auto"!==r.engine||!t.engine||(n.engine=t.engine),super.createContext(n)}async search(t,e={}){const r=this.constructor,n=(e=(0,$.defaultsDeep)({},e,this.options,r.getDefaultOptions())).limit||10,s=[],i=new Set;let a=e.startPage||0;const o=this.pagination?.startValue??0,c=this.pagination?.increment??1,l=e.maxPages||this.pagination?.maxPages||10,u=this.constructor.name;let f;e.baseUrls&&(Array.isArray(e.baseUrls)?f=e.baseUrls:"object"==typeof e.baseUrls&&(f=e.baseUrls[u]||e.baseUrls[this.constructor.alias?.[0]])),f&&0!==f.length||(f=this.constructor.defaultBaseUrls);const d=f&&f.length>0;let h=0;d&&"number"==typeof this.constructor.currentInstanceIndex&&(h=this.constructor.currentInstanceIndex);let p=!1;for(;s.length<n;){let r=!1,m=null;const b=d?f.length:1;let y=0;for(;y<b;){const l=d?f[h]:void 
0,b=this.formatOptions(e),w=o+a*c,g={...e,...b,query:t,page:a+o,offset:w,limit:n,baseUrl:l?.endsWith("/")?l.slice(0,-1):l},q=k(this.getTemplate(g,e),g),{actions:A,...O}=e,v=(0,$.defaultsDeep)({},q,O),x=[],j=v.actions||[];if(a===(e.startPage||0)||"url-param"===this.pagination?.type){if(v.url){j.some(t=>"goto"===(t.id??t.name??t.action)&&t.params?.url===v.url)||x.push({id:"goto",params:{url:v.url}})}}else"click-next"===this.pagination?.type&&this.pagination.nextButtonSelector&&(x.push({id:"click",params:{selector:this.pagination.nextButtonSelector}}),x.push({id:"waitFor",params:{networkIdle:!0,ms:500}}));x.push(...j),v.engine&&this.context.engine!==v.engine&&v.engine;try{const{outputs:n}=await this.executeAll(x,e),o={...e,query:t,page:a,baseUrl:l,engine:u};let c=await this.transform(n,o);e.transform&&(c=await e.transform(c,o));let f=!0;if(this.validateFetchResult&&(f=await this.validateFetchResult(c,o)),f&&e.validator&&(f=await e.validator(c,o)),!f)throw new Error(`Results validation failed for engine: ${u}, url: ${l}`);if(c&&0!==c.length)for(const t of c)t.url&&!i.has(t.url)&&(i.add(t.url),s.push(t));else p=!0;r=!0;break}catch(t){m=t,d&&(h=(h+1)%f.length,this.constructor.currentInstanceIndex=h),y++}}if(!r)throw m||new Error(`All instances failed for engine: ${u}`);if(p)break;if(s.length>=n||!this.pagination)break;if(a++,a>=l)break}return s.slice(0,n)}async validateFetchResult(t,e){return!0}async transform(t,e){return t.results||[]}formatOptions(t){return{...t}}};q._isFactory=!1;var A=q;(0,w.addBaseFactoryAbility)(A),A.prototype.name="Searcher";var O=class extends A{get template(){return{engine:"browser",browser:{headless:!1},url:"https://www.google.com/search?q=${query}&start=${offset}&tbs=${tbs}&tbm=${tbm}&gl=${gl}&hl=${hl}&safe=${safe}",actions:[{id:"extract",storeAs:"results",params:{type:"array",selector:"#main #search",items:{url:{selector:"a:has(h3)",attribute:"href",required:!0},title:{selector:"a:has(h3) 
h3",required:!0,mode:"innerText"},snippet:{selector:"div[style*='-webkit-line-clamp']",type:"html"}}}}]}}get pagination(){return{type:"url-param",paramName:"start",startValue:0,increment:10}}formatOptions(t){const e={};if(t.timeRange)if("string"==typeof t.timeRange){const r={hour:"qdr:h",day:"qdr:d",week:"qdr:w",month:"qdr:m",year:"qdr:y"};r[t.timeRange]&&(e.tbs=r[t.timeRange])}else{const r=new Date(t.timeRange.from),n=t.timeRange.to?new Date(t.timeRange.to):new Date;if(!isNaN(r.getTime())&&!isNaN(n.getTime())){const t=t=>`${t.getMonth()+1}/${t.getDate()}/${t.getFullYear()}`;e.tbs=`cdr:1,cd_min:${t(r)},cd_max:${t(n)}`}}if(t.category){const r={images:"isch",videos:"vid",news:"nws"};r[t.category]&&(e.tbm=r[t.category])}return t.region&&(e.gl=t.region),t.language&&(e.hl=t.language),t.safeSearch&&("strict"===t.safeSearch?e.safe="active":"off"===t.safeSearch&&(e.safe="images")),e}async transform(t){const e=t.results||[];return Array.isArray(e)?e.map(t=>{if(t.url&&t.url.startsWith("/url?q="))try{const e=new URL(t.url,"https://www.google.com").searchParams.get("q");e&&(t.url=e)}catch(t){}return t}):[]}};O.alias=["google"];
|
package/dist/index.mjs
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
/*
 * Minified ES-module bundle (removed side of the diff).
 * NOTE(review): the hunk header says this file is a single line; the line break in
 * this listing falls directly after a `return` keyword, so it looks like a wrapping
 * artifact of the listing - confirm against the published file before copying.
 *
 * Identifier map (from the imports and the export statement at the end):
 *   t -> FetchSession ("@isdk/web-fetcher"), r -> addBaseFactoryAbility ("custom-factory"),
 *   e -> isPlainObject, a -> defaultsDeep ("lodash-es")
 *   s(value, vars) -> template interpolator: replaces ${name} placeholders in strings,
 *     recursing through arrays and plain objects; a placeholder whose variable is
 *     undefined becomes "". Other values are returned unchanged.
 *   i -> WebSearcher, o -> GoogleSearcher
 *
 * WebSearcher (i) extends FetchSession:
 *   static search(engine, query, options = {})
 *     Builds the engine with this.createObject(engine, options), throws
 *     "Search engine not found: <engine>" when nothing is returned, runs the
 *     instance search and always awaits dispose() afterwards.
 *   get pagination()  -> undefined in the base class.
 *   createContext(options = this.options)
 *     Merges options over this.template with defaultsDeep; options.engine wins
 *     only when the template has no engine or the template engine is "auto".
 *   search(query, options = {})
 *     Collects results page by page. limit = options.limit || 10; the page count
 *     is capped by options.maxPages || pagination.maxPages || 10.
 *     A "goto" action is prepended on the first page, or on every page for
 *     "url-param" pagination, unless the template actions already navigate to the
 *     same URL; "click-next" pagination prepends a click on nextButtonSelector
 *     plus a waitFor instead.
 *     The loop stops on an empty page, when limit is reached, when no pagination
 *     is configured, or at the page cap. Returns at most `limit` items.
 *     NOTE(review): the expression `w.engine&&this.context.engine!==w.engine&&w.engine;`
 *     discards its value; it only reads this.context.
 *   transform(outputs) -> outputs.results || [];  formatOptions(o) -> shallow copy.
 *   After the class: _isFactory is set to false, addBaseFactoryAbility is applied
 *   and prototype.name is set to "Searcher".
 *
 * GoogleSearcher (o) extends WebSearcher; o.alias is set to ["google"]:
 *   pagination -> url-param on "start", startValue 0, increment 10.
 *   formatOptions(options)
 *     timeRange preset (hour/day/week/month/year) -> tbs "qdr:h|d|w|m|y";
 *     timeRange {from, to} -> tbs "cdr:1,cd_min:M/D/YYYY,cd_max:M/D/YYYY" (local
 *     date parts, `to` defaults to now, skipped when either date is invalid);
 *     category images/videos/news -> tbm isch/vid/nws; region -> gl; language -> hl;
 *     safeSearch "strict" -> safe "active", "off" -> safe "images".
 *     NOTE(review): confirm "images" is the intended value for safeSearch "off".
 *   transform(outputs)
 *     For results whose url starts with "/url?q=", replaces url with the decoded
 *     `q` parameter (URL parse errors are ignored). Non-array results yield [].
 */
import{FetchSession as t}from"@isdk/web-fetcher";import{addBaseFactoryAbility as r}from"custom-factory";import{isPlainObject as e}from"lodash-es";function s(t,r){if("string"==typeof t)return t.replace(/\$\{(.*?)\}/g,(t,e)=>{const s=r[e.trim()];return void 0!==s?String(s):""});if(Array.isArray(t))return t.map(t=>s(t,r));if(e(t)){const e={};for(const a in t)Object.prototype.hasOwnProperty.call(t,a)&&(e[a]=s(t[a],r));return e}return t}import{defaultsDeep as a}from"lodash-es";var i=class extends t{static async search(t,r,e={}){const s=this.createObject(t,e);if(!s)throw new Error(`Search engine not found: ${t}`);try{return await s.search(r,e)}finally{await s.dispose()}}get pagination(){}createContext(t=this.options){const r=this.template,e=a({},r,t);return r.engine&&"auto"!==r.engine||!t.engine||(e.engine=t.engine),super.createContext(e)}async search(t,r={}){const e=r.limit||10,i=[];let o=0;const n=this.pagination?.startValue??0,c=this.pagination?.increment??1,h=r.maxPages||this.pagination?.maxPages||10;for(;i.length<e;){const l=this.formatOptions(r),m=n+o*c,f={...r,...l,query:t,page:o+n,offset:m,limit:e},u=s(this.template,f),{actions:p,...d}=r,w=a({},u,d),g=[],y=w.actions||[];if(0===o||"url-param"===this.pagination?.type){if(w.url){y.some(t=>"goto"===(t.id??t.name??t.action)&&t.params?.url===w.url)||g.push({id:"goto",params:{url:w.url}})}}else"click-next"===this.pagination?.type&&this.pagination.nextButtonSelector&&(g.push({id:"click",params:{selector:this.pagination.nextButtonSelector}}),g.push({id:"waitFor",params:{networkIdle:!0,ms:500}}));g.push(...y),w.engine&&this.context.engine!==w.engine&&w.engine;const{outputs:$}=await this.executeAll(g),b={query:t,page:o,limit:r.limit};let q=[];if(q=await this.transform($,b),r.transform&&(q=await r.transform(q,b)),!q||0===q.length)break;if(i.push(...q),i.length>=e||!this.pagination)break;if(o++,o>=h)break}return i.slice(0,e)}async transform(t,r){return 
t.results||[]}formatOptions(t){return{...t}}};i._isFactory=!1,r(i),i.prototype.name="Searcher";var o=class extends i{get template(){return{engine:"browser",browser:{headless:!1},url:"https://www.google.com/search?q=${query}&start=${offset}&tbs=${tbs}&tbm=${tbm}&gl=${gl}&hl=${hl}&safe=${safe}",actions:[{id:"extract",storeAs:"results",params:{type:"array",selector:"#main #search",items:{url:{selector:"a:has(h3)",attribute:"href",required:!0},title:{selector:"a:has(h3) h3",required:!0,mode:"innerText"},snippet:{selector:"div[style*='-webkit-line-clamp']",type:"html"}}}}]}}get pagination(){return{type:"url-param",paramName:"start",startValue:0,increment:10}}formatOptions(t){const r={};if(t.timeRange)if("string"==typeof t.timeRange){const e={hour:"qdr:h",day:"qdr:d",week:"qdr:w",month:"qdr:m",year:"qdr:y"};e[t.timeRange]&&(r.tbs=e[t.timeRange])}else{const e=new Date(t.timeRange.from),s=t.timeRange.to?new Date(t.timeRange.to):new Date;if(!isNaN(e.getTime())&&!isNaN(s.getTime())){const t=t=>`${t.getMonth()+1}/${t.getDate()}/${t.getFullYear()}`;r.tbs=`cdr:1,cd_min:${t(e)},cd_max:${t(s)}`}}if(t.category){const e={images:"isch",videos:"vid",news:"nws"};e[t.category]&&(r.tbm=e[t.category])}return t.region&&(r.gl=t.region),t.language&&(r.hl=t.language),t.safeSearch&&("strict"===t.safeSearch?r.safe="active":"off"===t.safeSearch&&(r.safe="images")),r}async transform(t){const r=t.results||[];return Array.isArray(r)?r.map(t=>{if(t.url&&t.url.startsWith("/url?q="))try{const r=new URL(t.url,"https://www.google.com").searchParams.get("q");r&&(t.url=r)}catch(t){}return t}):[]}};o.alias=["google"];export{o as GoogleSearcher,i as WebSearcher};
|
|
1
|
+
/*
 * Minified ES-module bundle (added side of the diff).
 * NOTE(review): the hunk header says this file is a single line; the line breaks in
 * this listing fall mid-statement (one directly after a `return` keyword), so they
 * look like wrapping artifacts of the listing - confirm against the published file
 * before copying.
 *
 * Identifier map (from the imports and the export statement at the end):
 *   t -> fetchHeaders        e -> fetchPartial      r -> parseHeaders
 *   n -> parseHtml           o -> normalizeDate     a -> extractMetadataFrom
 *   c -> extractDate         u -> testUrlsByLatency
 *   w / g -> WebSearcher     k -> GoogleSearcher    f -> FetcherOptions (re-export)
 *   s -> internal regex fallback for JSON-LD date fields
 *   i -> internal HTML attribute-string parser
 *   p -> internal ${name} template interpolator
 *   l -> fetchWeb, d -> FetchSession ("@isdk/web-fetcher"),
 *   h -> addBaseFactoryAbility ("custom-factory"),
 *   m -> isPlainObject, y -> cloneDeep, b -> defaultsDeep ("lodash-es")
 *   The same short names are reused for locals inside function bodies.
 *
 * fetchHeaders(url, {timeout = 5000, headers})
 *   HEAD request with a fixed Chrome User-Agent (caller headers are spread after it),
 *   aborted after `timeout` ms. Returns the response headers, or null on any error.
 * fetchPartial(url, maxBytes = 32768, {timeout = 10000, headers})
 *   GET request; returns null when the response is not ok or has no body. Decodes
 *   the stream with the charset found in Content-Type (default "utf-8") and aborts
 *   once the bytes read reach maxBytes. Returns {content, headers}; if an error
 *   occurs after some content was decoded, that partial content is returned.
 * parseHeaders(headers) -> plain object with lower-cased header names.
 * parseHtml(html) -> {meta, jsonLd, time}, extracted with regular expressions:
 *   meta   : content keyed by lower-cased name / property / itemprop;
 *   jsonLd : parsed application/ld+json scripts (on JSON.parse failure, the date
 *            fields recovered by the regex fallback, if any);
 *   time   : {datetime, text} for each <time> element (inner tags stripped).
 * normalizeDate(text)
 *   Trims, strips leading "published/updated/..." style prefixes, cuts the string
 *   at "(", "|", "by " or "- N min", then parses with Date. Returns an ISO string
 *   when the UTC year lies between -10000 and the current year + 20, else null.
 * extractMetadataFrom(htmlData, type)
 *   Only type "date" is handled (anything else returns null). Sources are tried in
 *   order: JSON-LD date fields (including nested arrays and @graph), known meta
 *   keys, <time> elements, then the Last-Modified header.
 * extractDate(url, options = {})
 *   fetchPartial(url, options.maxBytes, options) followed by
 *   extractMetadataFrom(result, "date"); null when the fetch returns null.
 * testUrlsByLatency(urls, {timeout = 5000, limit, testPath = "", proxy})
 *   Fetches every URL (joined with testPath using a single "/") in parallel via
 *   fetchWeb with engine "http"; failed URLs are dropped. Returns
 *   [{url, latency}] sorted by ascending latency, cut to `limit` when it is a
 *   non-zero number.
 *
 * WebSearcher (w, aliased as g) extends FetchSession:
 *   static defaultOptions (getter/setter) -> per-class own property _defaultOptions.
 *   static getDefaultOptions()
 *     Walks the prototype chain from the calling class up to the base class and
 *     merges each own _defaultOptions with defaultsDeep, most-derived class first.
 *   static search(engines, query, options = {})
 *     Accepts one engine name or an array and queries the engines sequentially.
 *     Each engine receives limit = remaining count; results are de-duplicated by
 *     url. Stops when the limit is reached or when fillLimit === false after a
 *     successful engine. "Search engine not found" is thrown immediately; any other
 *     engine failure is logged with console.warn and rethrown only when it is the
 *     last engine and nothing has been collected. Each instance is disposed.
 *   get template() -> {};  get pagination() -> undefined;
 *   getTemplate() -> deep clone of this.template.
 *   createContext(options = this.options)
 *     Merges options over the template without its actions; options.engine wins
 *     only when the template has no engine or its engine is "auto".
 *   search(query, options = {})
 *     Option precedence: call options, then this.options, then class defaults.
 *     limit defaults to 10, startPage to 0, page cap to 10.
 *     Base URLs come from options.baseUrls (an array, or a map keyed by
 *     this.constructor.name or the first alias), falling back to the static
 *     defaultBaseUrls. The rotation index is kept in the static
 *     currentInstanceIndex. For every page each base URL is tried at most once;
 *     a failure rotates to the next one, and the last error is thrown when all
 *     fail. Results must pass validateFetchResult and options.validator, otherwise
 *     "Results validation failed ..." is thrown (and treated as a failure).
 *     Results are de-duplicated by url; an empty page ends pagination.
 *     NOTE(review): this.constructor.name is read at runtime while the classes in
 *     this build are bound to short identifiers - verify which name callers must
 *     use as the baseUrls key.
 *     NOTE(review): `O.engine&&this.context.engine!==O.engine&&O.engine;` discards
 *     its value; it only reads this.context.
 *   validateFetchResult() -> true;  transform(outputs) -> outputs.results || [];
 *   formatOptions(o) -> shallow copy.
 *
 * GoogleSearcher (k) extends WebSearcher; k.alias is set to ["google"]:
 *   template, pagination, formatOptions and transform map search options onto the
 *   google.com/search URL parameters (tbs, tbm, gl, hl, safe) and unwrap
 *   "/url?q=" redirect links in the extracted results.
 *   NOTE(review): safeSearch "off" maps to safe "images" - confirm this is intended.
 */
async function t(t,e={}){const{timeout:r=5e3,headers:n}=e,s=new AbortController,i=setTimeout(()=>s.abort(),r);try{return(await fetch(t,{method:"HEAD",signal:s.signal,headers:{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",...n}})).headers}catch(t){return null}finally{clearTimeout(i)}}async function e(t,e=32768,r={}){const{timeout:n=1e4,headers:s}=r,i=new AbortController,o=setTimeout(()=>i.abort(),n);let a="",c=new Headers;try{const r=await fetch(t,{signal:i.signal,headers:{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",...s}});if(c=r.headers,!r.ok||!r.body)return null;const n=r.headers.get("content-type"),o=n?.match(/charset=([\w-]+)/i),l=o?o[1]:"utf-8",u=r.body.getReader(),f=new TextDecoder(l);let d=0;for(;;)try{const{done:t,value:r}=await u.read();if(t)break;if(d+=r.length,a+=f.decode(r,{stream:!0}),d>=e){i.abort();break}}catch(t){if("AbortError"===t.name)break;throw t}return{content:a,headers:c}}catch(t){return a.length>0?{content:a,headers:c}:null}finally{clearTimeout(o)}}function r(t){const e={};return t.forEach((t,r)=>{e[r.toLowerCase()]=t}),e}function n(t){const e={meta:{},jsonLd:[],time:[]},r=/<meta\s+([^>]+?)>/gi;let n;for(;null!==(n=r.exec(t));){const t=i(n[1]),r=t.name||t.property||t.itemprop,s=t.content;r&&s&&(e.meta[r.toLowerCase()]=s)}const o=/<script\s+[^>]*?type\s*=\s*["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;for(;null!==(n=o.exec(t));){const t=n[1];try{const r=JSON.parse(t);e.jsonLd.push(r)}catch(r){const n=s(t);n&&e.jsonLd.push(n)}}const a=/<time([^>]*?)>([\s\S]*?)<\/time>/gi;for(;null!==(n=a.exec(t));){const t=i(n[1]).datetime,r=n[2].replace(/<[^>]*>/g,"").trim();e.time.push({datetime:t,text:r})}return e}function s(t){const e=["datePublished","dateModified","pubDate","publishedAt"],r={};let n=!1;for(const s of e){const e=new 
RegExp(`"${s}"\\s*:\\s*"([^"]+)"`,"i"),i=t.match(e);i&&(r[s]=i[1],n=!0)}return n?r:null}function i(t){const e={},r=/([a-z0-9:._-]+)(?:\s*=\s*(?:(?:"([^"]*)")|(?:'([^']*)')|([^>\s]+)))?/gi;let n;for(;null!==(n=r.exec(t));){const t=n[1].toLowerCase(),r=n[2]??n[3]??n[4]??"";e[t]=r}return e}function o(t){if(!t)return null;try{let e=t.trim();if(!e)return null;e=e.replace(/^(?:last|first|posted|originally)\s*(?:published|updated|date|posted|modified)\s*(?:on|at)?[:\s]*/i,""),e=e.replace(/^(?:published|updated|date|posted|modified)\s*(?:on|at)?[:\s]*/i,""),e=e.split(/[\(|\|]|by\s+|[-–—]\s*\d+\s*min/i)[0].trim();const r=new Date(e);if(!isNaN(r.getTime())){const t=r.getUTCFullYear(),e=(new Date).getUTCFullYear();if(t>=-1e4&&t<=e+20)return r.toISOString()}}catch(t){}return null}function a(t,e){const s=n(t.content);return"date"===e?function(t,e){const n=function(t){const e=["datePublished","dateModified","pubDate","publishedAt"],r=t=>{if(!t||"object"!=typeof t)return null;for(const r of e)if("string"==typeof t[r])return t[r];if(Array.isArray(t))for(const e of t){const t=r(e);if(t)return t}else if(t["@graph"]&&Array.isArray(t["@graph"]))return r(t["@graph"]);return null};return r(t)}(t.jsonLd),s=o(n);if(s)return s;const i=function(t){const e=["article:published_time","og:published_time","datepublished","date","pubdate","publishdate","dc.date.issued","bt:pubdate","sailthru.date","article:modified_time","og:updated_time","modifieddate"];for(const r of e)if(t[r])return t[r];return null}(t.meta),a=o(i);if(a)return a;for(const e of t.time){const t=o(e.datetime||e.text);if(t)return t}const c=r(e);return o(c["last-modified"])}(s,t.headers):null}async function c(t,r={}){const n=await e(t,r.maxBytes,r);return n?a(n,"date"):null}import{fetchWeb as l}from"@isdk/web-fetcher";async function u(t,e={}){const{timeout:r=5e3,limit:n,testPath:s="",proxy:i}=e;let o=await Promise.all(t.map(async t=>{const e=Date.now();try{const 
n=s?(t.endsWith("/")?t.slice(0,-1):t)+(s.startsWith("/")?s:"/"+s):t;return await l(n,{timeoutMs:r,proxy:i,throwHttpErrors:!0,enableSmart:!1,engine:"http"}),{url:t,latency:Date.now()-e}}catch(t){return}}));return o=o.filter(t=>null!=t).sort((t,e)=>t.latency-e.latency),"number"==typeof n&&n&&(o=o.slice(0,n)),o}import{FetcherOptions as f,FetchSession as d}from"@isdk/web-fetcher";import{addBaseFactoryAbility as h}from"custom-factory";import{isPlainObject as m}from"lodash-es";function p(t,e){if("string"==typeof t)return t.replace(/\$\{(.*?)\}/g,(t,r)=>{const n=e[r.trim()];return void 0!==n?String(n):""});if(Array.isArray(t))return t.map(t=>p(t,e));if(m(t)){const r={};for(const n in t)Object.prototype.hasOwnProperty.call(t,n)&&(r[n]=p(t[n],e));return r}return t}import{cloneDeep as y,defaultsDeep as b}from"lodash-es";var w=class t extends d{static get defaultOptions(){return Object.prototype.hasOwnProperty.call(this,"_defaultOptions")||(this._defaultOptions={}),this._defaultOptions}static set defaultOptions(t){this._defaultOptions=t}static getDefaultOptions(){const e=[];let r=this;for(;r&&r!==Object.prototype&&(Object.prototype.hasOwnProperty.call(r,"_defaultOptions")&&r._defaultOptions&&e.push(r._defaultOptions),r!==t);)r=Object.getPrototypeOf(r);return e.length>0?b({},...e):{}}static async search(t,e,r={}){const n=Array.isArray(t)?t:[t],s=[];for(let t=0;t<n.length;t++){const i=n[t],o=this.get(i),a=o?o.getDefaultOptions():this.getDefaultOptions(),c=b({},r,a),l=c.limit||10;if(s.length>=l)break;const u=l-s.length,f={...r,limit:u},d=this.createObject(i,f);if(!d)throw new Error(`Search engine not found: ${i}`);try{const t=await d.search(e,f);for(const e of t)e.url&&!s.some(t=>t.url===e.url)&&s.push(e);if(s.length>=l)break;if(!1===c.fillLimit)break}catch(e){if(console.warn(`[WebSearcher] Engine '${i}' failed completely:`,e),t===n.length-1&&0===s.length)throw e}finally{await d.dispose()}}return s}get template(){return{}}get pagination(){}getTemplate(t,e){return 
y(this.template)}createContext(t=this.options){const{actions:e,...r}=this.template,n=b({},r,t);return r.engine&&"auto"!==r.engine||!t.engine||(n.engine=t.engine),super.createContext(n)}async search(t,e={}){const r=this.constructor,n=(e=b({},e,this.options,r.getDefaultOptions())).limit||10,s=[],i=new Set;let o=e.startPage||0;const a=this.pagination?.startValue??0,c=this.pagination?.increment??1,l=e.maxPages||this.pagination?.maxPages||10,u=this.constructor.name;let f;e.baseUrls&&(Array.isArray(e.baseUrls)?f=e.baseUrls:"object"==typeof e.baseUrls&&(f=e.baseUrls[u]||e.baseUrls[this.constructor.alias?.[0]])),f&&0!==f.length||(f=this.constructor.defaultBaseUrls);const d=f&&f.length>0;let h=0;d&&"number"==typeof this.constructor.currentInstanceIndex&&(h=this.constructor.currentInstanceIndex);let m=!1;for(;s.length<n;){let r=!1,y=null;const w=d?f.length:1;let g=0;for(;g<w;){const l=d?f[h]:void 0,w=this.formatOptions(e),k=a+o*c,$={...e,...w,query:t,page:o+a,offset:k,limit:n,baseUrl:l?.endsWith("/")?l.slice(0,-1):l},A=p(this.getTemplate($,e),$),{actions:x,...q}=e,O=b({},A,q),v=[],D=O.actions||[];if(o===(e.startPage||0)||"url-param"===this.pagination?.type){if(O.url){D.some(t=>"goto"===(t.id??t.name??t.action)&&t.params?.url===O.url)||v.push({id:"goto",params:{url:O.url}})}}else"click-next"===this.pagination?.type&&this.pagination.nextButtonSelector&&(v.push({id:"click",params:{selector:this.pagination.nextButtonSelector}}),v.push({id:"waitFor",params:{networkIdle:!0,ms:500}}));v.push(...D),O.engine&&this.context.engine!==O.engine&&O.engine;try{const{outputs:n}=await this.executeAll(v,e),a={...e,query:t,page:o,baseUrl:l,engine:u};let c=await this.transform(n,a);e.transform&&(c=await e.transform(c,a));let f=!0;if(this.validateFetchResult&&(f=await this.validateFetchResult(c,a)),f&&e.validator&&(f=await e.validator(c,a)),!f)throw new Error(`Results validation failed for engine: ${u}, url: ${l}`);if(c&&0!==c.length)for(const t of 
c)t.url&&!i.has(t.url)&&(i.add(t.url),s.push(t));else m=!0;r=!0;break}catch(t){y=t,d&&(h=(h+1)%f.length,this.constructor.currentInstanceIndex=h),g++}}if(!r)throw y||new Error(`All instances failed for engine: ${u}`);if(m)break;if(s.length>=n||!this.pagination)break;if(o++,o>=l)break}return s.slice(0,n)}async validateFetchResult(t,e){return!0}async transform(t,e){return t.results||[]}formatOptions(t){return{...t}}};w._isFactory=!1;var g=w;h(g),g.prototype.name="Searcher";var k=class extends g{get template(){return{engine:"browser",browser:{headless:!1},url:"https://www.google.com/search?q=${query}&start=${offset}&tbs=${tbs}&tbm=${tbm}&gl=${gl}&hl=${hl}&safe=${safe}",actions:[{id:"extract",storeAs:"results",params:{type:"array",selector:"#main #search",items:{url:{selector:"a:has(h3)",attribute:"href",required:!0},title:{selector:"a:has(h3) h3",required:!0,mode:"innerText"},snippet:{selector:"div[style*='-webkit-line-clamp']",type:"html"}}}}]}}get pagination(){return{type:"url-param",paramName:"start",startValue:0,increment:10}}formatOptions(t){const e={};if(t.timeRange)if("string"==typeof t.timeRange){const r={hour:"qdr:h",day:"qdr:d",week:"qdr:w",month:"qdr:m",year:"qdr:y"};r[t.timeRange]&&(e.tbs=r[t.timeRange])}else{const r=new Date(t.timeRange.from),n=t.timeRange.to?new Date(t.timeRange.to):new Date;if(!isNaN(r.getTime())&&!isNaN(n.getTime())){const t=t=>`${t.getMonth()+1}/${t.getDate()}/${t.getFullYear()}`;e.tbs=`cdr:1,cd_min:${t(r)},cd_max:${t(n)}`}}if(t.category){const r={images:"isch",videos:"vid",news:"nws"};r[t.category]&&(e.tbm=r[t.category])}return t.region&&(e.gl=t.region),t.language&&(e.hl=t.language),t.safeSearch&&("strict"===t.safeSearch?e.safe="active":"off"===t.safeSearch&&(e.safe="images")),e}async transform(t){const e=t.results||[];return Array.isArray(e)?e.map(t=>{if(t.url&&t.url.startsWith("/url?q="))try{const e=new URL(t.url,"https://www.google.com").searchParams.get("q");e&&(t.url=e)}catch(t){}return t}):[]}};k.alias=["google"];export{f as 
FetcherOptions,k as GoogleSearcher,g as WebSearcher,c as extractDate,a as extractMetadataFrom,t as fetchHeaders,e as fetchPartial,o as normalizeDate,r as parseHeaders,n as parseHtml,u as testUrlsByLatency};
|
package/docs/README.md
CHANGED
|
@@ -23,8 +23,9 @@ This module encapsulates these patterns into a reusable `WebSearcher` class.
|
|
|
23
23
|
|
|
24
24
|
> **⚠️ Note on `GoogleSearcher`**: The `GoogleSearcher` class used in these examples is a **demo implementation** included for educational purposes. It is not intended for production use.
|
|
25
25
|
>
|
|
26
|
-
> *
|
|
27
|
-
> *
|
|
26
|
+
> * **Strict Anti-Bot Detection**: Even when simulating simple "human behavior" in `browser` mode (such as waiting a few seconds before automatically filling in the search box and submitting), Google still detects the session as an automated program. Simple behavior simulation is therefore not enough to pass detection.
|
|
27
|
+
> * **Scalability Limitations**: It lacks advanced countermeasures like CAPTCHA solving, fingerprint spoofing, or high-quality proxy rotation required for reliable scraping.
|
|
28
|
+
> * **Fragility**: The extracted data may be **inaccurate or misaligned** due to Google's frequent DOM changes and A/B testing.
|
|
28
29
|
|
|
29
30
|
Use the static `WebSearcher.search` method for quick, disposable tasks. It automatically creates a session, fetches results, and cleans up.
|
|
30
31
|
|
|
@@ -42,16 +43,105 @@ const results = await WebSearcher.search('Google', 'open source', { limit: 20 })
|
|
|
42
43
|
console.log(results);
|
|
43
44
|
```
|
|
44
45
|
|
|
45
|
-
###
|
|
46
|
+
### 3. Multi-Engine Orchestration
|
|
47
|
+
|
|
48
|
+
`WebSearcher.search` has a built-in **waterfall** fallback mechanism. When you provide an array of engine names, it queries them sequentially and tops up the results until `limit` is reached:
|
|
49
|
+
|
|
50
|
+
- **Automatic Completion**: If the preceding engines return fewer results than the `limit`, it automatically requests subsequent engines to fill the gap.
|
|
51
|
+
- **Failover & Degradation**: If an engine fails (e.g., blocked, timeout), it automatically skips it and tries the next one, ensuring results are returned whenever possible.
|
|
52
|
+
- **Auto Deduplication**: It automatically de-duplicates results based on their `url` during the merging process.
|
|
53
|
+
|
|
54
|
+
```typescript
|
|
55
|
+
// Waterfall search: Google first, Bing as fallback, SearXNG as final backup
|
|
56
|
+
const results = await WebSearcher.search(['Google', 'Bing', 'SearXNG'], 'open source', {
|
|
57
|
+
limit: 20,
|
|
58
|
+
fillLimit: true // Enabled by default
|
|
59
|
+
});
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### 4. Stateful Session
|
|
46
63
|
|
|
47
64
|
Since `WebSearcher` extends `FetchSession`, you can instantiate it to keep cookies and storage alive across multiple requests. This is useful for authenticated searches or avoiding bot detection by behaving like a human.
|
|
48
65
|
|
|
66
|
+
### 5. Default Search Parameters
|
|
67
|
+
|
|
68
|
+
You can set default search parameters at three levels: **Global**, **Engine-specific**, and **Instance-level**. This avoids passing repetitive options to every `search()` call.
|
|
69
|
+
|
|
70
|
+
The priority order (from highest to lowest) is:
|
|
71
|
+
`search(query, options)` (Call) > `this.options` (Instance) > `Engine.defaultOptions` (Static Engine) > `WebSearcher.defaultOptions` (Static Global)
|
|
72
|
+
|
|
73
|
+
#### A. Global Static Defaults
|
|
74
|
+
|
|
75
|
+
Affects all search engines.
|
|
76
|
+
|
|
77
|
+
```typescript
|
|
78
|
+
import { WebSearcher } from '@isdk/web-searcher';
|
|
79
|
+
|
|
80
|
+
// Set global limit for all searchers
|
|
81
|
+
WebSearcher.defaultOptions = { limit: 20, safeSearch: 'strict' };
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
#### B. Engine-Specific Static Defaults
|
|
85
|
+
|
|
86
|
+
Affects only a specific engine (and its subclasses).
|
|
87
|
+
|
|
88
|
+
```typescript
|
|
89
|
+
import { GoogleSearcher } from '@isdk/web-searcher';
|
|
90
|
+
|
|
91
|
+
// Only Google will use these defaults
|
|
92
|
+
GoogleSearcher.defaultOptions = { region: 'US', language: 'en' };
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
#### C. Instance-Level Defaults
|
|
96
|
+
|
|
97
|
+
Set when creating a searcher instance.
|
|
98
|
+
|
|
99
|
+
```typescript
|
|
100
|
+
const google = new GoogleSearcher({ limit: 5, category: 'news' });
|
|
101
|
+
|
|
102
|
+
// This search will use limit: 5 and category: 'news' automatically
|
|
103
|
+
const results = await google.search('open source');
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### 🧬 Dynamic Templates
|
|
107
|
+
|
|
108
|
+
While a static `template` works for simple search engines, many sites (like Google) change their HTML structure drastically based on the search category (e.g., 'Web' vs 'Images' vs 'News').
|
|
109
|
+
|
|
110
|
+
To handle this, you can override the `getTemplate(variables, options)` method.
|
|
111
|
+
|
|
112
|
+
- **`variables`**: The calculated variables (from `formatOptions`, pagination, etc.).
|
|
113
|
+
- **`options`**: The original `SearchOptions` provided by the user.
|
|
114
|
+
|
|
115
|
+
```typescript
|
|
116
|
+
export class MyAdvancedSearcher extends WebSearcher {
|
|
117
|
+
get template(): FetcherOptions {
|
|
118
|
+
// Default template (usually for web search)
|
|
119
|
+
return {
|
|
120
|
+
url: '...',
|
|
121
|
+
actions: [ { id: 'extract', params: { selector: '.web-result' } } ]
|
|
122
|
+
};
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
protected override getTemplate(variables: Record<string, any>, options: SearchOptions): FetcherOptions {
|
|
126
|
+
if (options.category === 'images') {
|
|
127
|
+
return {
|
|
128
|
+
url: 'https://site.com/images?q=${query}',
|
|
129
|
+
actions: [ { id: 'extract', params: { selector: '.img-item' } } ]
|
|
130
|
+
};
|
|
131
|
+
}
|
|
132
|
+
// Fallback to the default template getter
|
|
133
|
+
return super.getTemplate(variables, options);
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
```
|
|
137
|
+
|
|
49
138
|
### 🛡️ Core Principle: Template is Law
|
|
50
139
|
|
|
51
|
-
The `template`
|
|
140
|
+
The `template` (or the dynamic template returned by `getTemplate`) acts as the authoritative "blueprint".
|
|
52
141
|
|
|
53
142
|
- **Template Priority**: If the template defines a property (e.g., `engine: 'browser'`, `headers`), that value is **locked** and cannot be overridden by user options. This ensures engine stability.
|
|
54
143
|
- **Immutable Actions**: The `actions` array in the template is strictly protected. Users cannot append, replace, or modify the execution steps via `options`. This prevents external logic from breaking the scraper's flow.
|
|
144
|
+
- **Session Context**: To maintain a clean session, **actions are filtered out** of the session's persistent context. They are only used during the execution of a `search()` call. This ensures that session-level settings (like cookies or engine type) are preserved without being cluttered by search-specific extraction rules.
|
|
55
145
|
- **User Flexibility**: Properties **not** explicitly defined in the template (such as `proxy`, `timeoutMs`, or custom variables) can be freely set by the user in the constructor or `search()` method.
|
|
56
146
|
|
|
57
147
|
```typescript
|
|
@@ -171,12 +261,17 @@ protected override get pagination() {
|
|
|
171
261
|
|
|
172
262
|
### Step 3: Transform & Clean Data
|
|
173
263
|
|
|
174
|
-
Override `transform` to clean data. Since `WebSearcher` is a `FetchSession`, you can also make extra requests (like resolving redirects) using `this`.
|
|
264
|
+
Override `transform` to clean data. The `context` parameter contains the current search state and any custom parameters you passed to `search()`. Since `WebSearcher` is a `FetchSession`, you can also make extra requests (like resolving redirects) using `this`.
|
|
175
265
|
|
|
176
266
|
```typescript
|
|
177
|
-
protected override async transform(outputs: Record<string, any
|
|
267
|
+
protected override async transform(outputs: Record<string, any>, context: SearchContext) {
|
|
178
268
|
const results = outputs['results'] || [];
|
|
179
269
|
|
|
270
|
+
// You can access custom parameters from context
|
|
271
|
+
if (context.myCustomFlag) {
|
|
272
|
+
// ... logic
|
|
273
|
+
}
|
|
274
|
+
|
|
180
275
|
// Clean data or filter
|
|
181
276
|
return results.map(item => ({
|
|
182
277
|
...item,
|
|
@@ -213,8 +308,10 @@ This is extremely powerful for **filtering out ads** or irrelevant content. If t
|
|
|
213
308
|
```typescript
|
|
214
309
|
await google.search('test', {
|
|
215
310
|
limit: 20,
|
|
311
|
+
myCustomFlag: true,
|
|
216
312
|
// Example: Filter out sponsored results and only keep PDFs
|
|
217
|
-
transform: (results) => {
|
|
313
|
+
transform: (results, context) => {
|
|
314
|
+
console.log('Searching for:', context.query);
|
|
218
315
|
return results.filter(r => {
|
|
219
316
|
const isAd = r.isSponsored || r.url.includes('googleadservices.com');
|
|
220
317
|
return !isAd && r.url.endsWith('.pdf');
|
|
@@ -240,6 +337,40 @@ const results = await google.search('open source', {
|
|
|
240
337
|
});
|
|
241
338
|
```
|
|
242
339
|
|
|
340
|
+
#### Search Options Reference
|
|
341
|
+
|
|
342
|
+
| Option | Type | Description |
|
|
343
|
+
| :--- | :--- | :--- |
|
|
344
|
+
| `limit` | `number` | The target number of total results to retrieve. The searcher will automatically paginate to reach this number. |
|
|
345
|
+
| `maxPages` | `number` | The maximum number of pages (fetch cycles) to retrieve. Acts as a safety threshold to prevent infinite loops. Default: `10`. |
|
|
346
|
+
| `timeRange` | `string` \| `object` | Filter by time. Presets: `'all'`, `'hour'`, `'day'`, `'week'`, `'month'`, `'year'`. <br/> Or `{ from: Date\|string, to?: Date\|string }` |
|
|
347
|
+
| `category` | `string` | Search category: `'all'`, `'images'`, `'videos'`, `'news'`. |
|
|
348
|
+
| `region` | `string` | ISO 3166-1 alpha-2 region code (e.g., `'US'`, `'CN'`). |
|
|
349
|
+
| `language` | `string` | Language code: ISO 639-1 (e.g., `'en'`) or a BCP 47 tag with a region subtag (e.g., `'zh-CN'`). |
|
|
350
|
+
| `safeSearch` | `string` | Safe search level: `'off'`, `'moderate'`, `'strict'`. |
|
|
351
|
+
| `transform` | `function` | A custom function to filter or modify results at runtime. Runs after the engine's built-in transform. |
|
|
352
|
+
| `baseUrls` | `string[]` \| `Record<string, string[]>` | Override the base URLs for engines. Can be an array for a single engine, or a map of engine names to URL arrays. |
|
|
353
|
+
| `fillLimit` | `boolean` | If `true` (default), continues to subsequent engines in the chain when the current engine returns fewer results than `limit`. |
|
|
354
|
+
| `startPage` | `number` | The page index to start from. Useful when delegating pagination across different sessions. Default: `0`. |
|
|
355
|
+
| `validator` | `function` | Custom callback to validate fetched results. If it returns `false`, triggers failover/retry. Signature: `(results, context) => boolean \| Promise<boolean>`. |
|
|
356
|
+
| `...custom` | `any` | Any other keys are passed as custom variables to the template (e.g., `${myVar}`). |
|
|
357
|
+
|
|
358
|
+
#### Standard Search Result
|
|
359
|
+
|
|
360
|
+
Each result in the returned array follows this structure:
|
|
361
|
+
|
|
362
|
+
| Field | Type | Description |
|
|
363
|
+
| :--- | :--- | :--- |
|
|
364
|
+
| `title` | `string` | The title of the search result. |
|
|
365
|
+
| `url` | `string` | The absolute URL of the result. |
|
|
366
|
+
| `snippet` | `string` | A brief snippet or description. |
|
|
367
|
+
| `image` | `string` | (Optional) URL of a thumbnail or associated image. |
|
|
368
|
+
| `date` | `string`\|`Date` | (Optional) Publication date. |
|
|
369
|
+
| `author` | `string` | (Optional) Author or source name. |
|
|
370
|
+
| `favicon` | `string` | (Optional) Favicon URL of the source website. |
|
|
371
|
+
| `rank` | `number` | (Optional) 1-indexed position in the results. |
|
|
372
|
+
| `source` | `string` | (Optional) Source website name (e.g., 'GitHub'). |
|
|
373
|
+
|
|
243
374
|
To support these in your own engine, override the `formatOptions` method:
|
|
244
375
|
|
|
245
376
|
```typescript
|
|
@@ -254,6 +385,36 @@ protected override formatOptions(options: SearchOptions): Record<string, any> {
|
|
|
254
385
|
Then use these variables in your `template.url`:
|
|
255
386
|
`url: 'https://www.google.com/search?q=${query}&tbs=${tbs}'`
|
|
256
387
|
|
|
388
|
+
### 🚀 Implementing Multi-instance Support
|
|
389
|
+
|
|
390
|
+
If a search engine supports multiple mirrors or distributed deployment, you can easily add failover capabilities:
|
|
391
|
+
|
|
392
|
+
1. **Configure Base URLs**: Support a list of addresses in the constructor.
|
|
393
|
+
2. **Validate Results**: Override `validateFetchResult(outputs, context)`. If it returns `false`, the searcher automatically tries the next address in the list.
|
|
394
|
+
3. **Template Variables**: Use the `${baseUrl}` placeholder in your template URL.
|
|
395
|
+
|
|
396
|
+
```typescript
|
|
397
|
+
export class MyDistributedSearcher extends WebSearcher {
|
|
398
|
+
protected get template(): FetcherOptions {
|
|
399
|
+
return {
|
|
400
|
+
url: '${baseUrl}/search?q=${query}',
|
|
401
|
+
// ...
|
|
402
|
+
};
|
|
403
|
+
}
|
|
404
|
+
|
|
405
|
+
protected override validateFetchResult(outputs: Record<string, any>, context: SearchContext): boolean {
|
|
406
|
+
const results = outputs['results'] || [];
|
|
407
|
+
// If no results, trigger failover to the next node
|
|
408
|
+
return results.length > 0;
|
|
409
|
+
}
|
|
410
|
+
}
|
|
411
|
+
|
|
412
|
+
// Usage
|
|
413
|
+
const searcher = new MyDistributedSearcher({
|
|
414
|
+
baseUrls: ['https://node1.com', 'https://node2.com']
|
|
415
|
+
});
|
|
416
|
+
```
|
|
417
|
+
|
|
257
418
|
### Custom Variables
|
|
258
419
|
|
|
259
420
|
You can pass custom variables to `search()` and use them in your template.
|
|
@@ -266,6 +427,34 @@ await google.search('test', { category: 'news' });
|
|
|
266
427
|
url: 'https://site.com?q=${query}&cat=${category}'
|
|
267
428
|
```
|
|
268
429
|
|
|
430
|
+
## 🛡️ Resilient Search & Latency Tools
|
|
431
|
+
|
|
432
|
+
This module provides a set of general utility functions to evaluate node health and implement failover.
|
|
433
|
+
|
|
434
|
+
### 1. General Latency Testing Utility
|
|
435
|
+
|
|
436
|
+
We provide a general latency testing function `testUrlsByLatency` based on `web-fetcher` that can be used for real-time response testing and sorting of any URL list.
|
|
437
|
+
|
|
438
|
+
```typescript
|
|
439
|
+
import { testUrlsByLatency } from '@isdk/web-searcher/utils';
|
|
440
|
+
|
|
441
|
+
const urls = ['https://google.com', 'https://bing.com', 'https://baidu.com'];
|
|
442
|
+
const sorted = await testUrlsByLatency(urls, { timeout: 5000 });
|
|
443
|
+
|
|
444
|
+
// Returns [{ url: '...', latency: 123 }, ...], sorted by latency ascending.
|
|
445
|
+
```
|
|
446
|
+
|
|
447
|
+
### 2. Engine-Specific Resilient Discovery
|
|
448
|
+
|
|
449
|
+
For engines like **SearXNG** that support multiple instances and can be unstable, we provide specialized failover and discovery mechanisms.
|
|
450
|
+
|
|
451
|
+
- **Automatic Failover**: Configure multiple `baseUrls` to automatically switch nodes on connection failure.
|
|
452
|
+
- **Dynamic Discovery**: Automatically fetch and filter high-quality nodes from `searx.space` or GitHub.
|
|
453
|
+
|
|
454
|
+
For more details, see: [SearXNG Resilient Search Documentation](./src/engines/searxng.md).
|
|
455
|
+
|
|
456
|
+
---
|
|
457
|
+
|
|
269
458
|
## Pagination Guide
|
|
270
459
|
|
|
271
460
|
### 1. Offset-based (e.g., Google)
|