npm - @isdk/web-searcher - Versions diffs - 0.1.3 → 0.1.5 - Mend

@isdk/web-searcher 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36)

package/README.cn.md +168 -8
package/README.md +168 -8
package/dist/index.d.mts +221 -12
package/dist/index.d.ts +221 -12
package/dist/index.js +1 -1
package/dist/index.mjs +1 -1
package/docs/README.md +168 -8
package/docs/classes/GoogleSearcher.md +171 -44
package/docs/classes/WebSearcher.md +158 -45
package/docs/functions/extractDate.md +42 -0
package/docs/functions/extractMetadataFrom.md +40 -0
package/docs/functions/fetchHeaders.md +34 -0
package/docs/functions/fetchPartial.md +41 -0
package/docs/functions/normalizeDate.md +29 -0
package/docs/functions/parseHeaders.md +28 -0
package/docs/functions/parseHtml.md +31 -0
package/docs/functions/testUrlsByLatency.md +38 -0
package/docs/globals.md +18 -0
package/docs/interfaces/CustomTimeRange.md +3 -3
package/docs/interfaces/ExtractOptions.md +54 -0
package/docs/interfaces/FetchExtractorOptions.md +35 -0
package/docs/interfaces/FetcherOptions.md +424 -0
package/docs/interfaces/HtmlData.md +53 -0
package/docs/interfaces/MetadataResult.md +27 -0
package/docs/interfaces/PaginationConfig.md +9 -9
package/docs/interfaces/SearchContext.md +30 -4
package/docs/interfaces/SearchOptions.md +77 -11
package/docs/interfaces/StandardSearchResult.md +10 -10
package/docs/interfaces/VerifiedUrl.md +25 -0
package/docs/type-aliases/MetadataType.md +13 -0
package/docs/type-aliases/SafeSearchLevel.md +1 -1
package/docs/type-aliases/SearchCategory.md +2 -2
package/docs/type-aliases/SearchTimeRange.md +1 -1
package/docs/type-aliases/SearchTimeRangePreset.md +2 -2
package/docs/type-aliases/SearcherConstructor.md +2 -2
package/package.json +3 -2

package/dist/index.d.mts CHANGED Viewed

@@ -1,5 +1,6 @@
 import * as _isdk_web_fetcher from '@isdk/web-fetcher';
 import { FetcherOptions, FetchSession } from '@isdk/web-fetcher';
+export { FetcherOptions } from '@isdk/web-fetcher';
 import { IBaseFactoryOptions } from 'custom-factory';
 /**
@@ -83,8 +84,14 @@ interface SearchContext {
     page: number;
     /** The requested limit of results. */
     limit?: number;
+    /** Allows for custom variables passed via search options. */
+    [key: string]: any;
+    /** The baseUrl used for this specific fetch (if multi-instance is enabled) */
+    baseUrl?: string;
+    /** The name of the engine executing the search */
+    engine?: string;
 }
-type SearchTimeRangePreset = 'all' | 'day' | 'week' | 'month' | 'year';
+type SearchTimeRangePreset = 'all' | 'hour' | 'day' | 'week' | 'month' | 'year';
 interface CustomTimeRange {
     /** Start date (Date object or string like 'YYYY-MM-DD'). */
     from: Date | string;
@@ -92,7 +99,7 @@ interface CustomTimeRange {
     to?: Date | string;
 }
 type SearchTimeRange = SearchTimeRangePreset | CustomTimeRange;
-type SearchCategory = 'all' | 'images' | 'videos' | 'news';
+type SearchCategory = 'all' | 'images' | 'videos' | 'news' | string;
 type SafeSearchLevel = 'off' | 'moderate' | 'strict';
 /**
  * Options provided when executing a search.
@@ -139,12 +146,187 @@ interface SearchOptions {
     transform?: (results: StandardSearchResult[], context: SearchContext) => Promise<StandardSearchResult[]> | StandardSearchResult[];
     /** Any other custom variables to be injected into the template. */
     [key: string]: any;
+    /**
+     * Allows the user to dynamically specify or override the base URLs for the engines.
+     * Can be an array of URLs for a single engine, or a map of engine names to URL arrays.
+     */
+    baseUrls?: string[] | Record<string, string[]>;
+    /**
+     * User-defined callback to validate the fetched results for a page.
+     * If it returns false, the fetch is considered a failure, triggering the retry/failover mechanism.
+     */
+    validator?: (results: StandardSearchResult[], context: SearchContext) => boolean | Promise<boolean>;
+    /**
+     * If true (default), the searcher will attempt to fulfill the requested `limit`
+     * by falling back to subsequent engines in the chain if previous ones are exhausted.
+     * If false, it will stop after the first successful engine regardless of whether
+     * the limit was reached.
+     */
+    fillLimit?: boolean;
+    /**
+     * Specifies which page index to start the search from.
+     * Useful when delegating pagination across different sessions.
+     * @default 0
+     */
+    startPage?: number;
+}
+/**
+ * Options for network requests.
+ */
+interface FetchExtractorOptions {
+    /** Timeout in milliseconds. Defaults vary by function (5s to 10s). */
+    timeout?: number;
+    /** Custom HTTP headers to include in the request. */
+    headers?: Record<string, string>;
+}
+/**
+ * Fetches only the HTTP headers for a given URL using a HEAD request.
+ * Useful for checking 'last-modified' without downloading the body.
+ *
+ * @param url - The URL to check.
+ * @param options - Request options.
+ * @returns The Headers object, or null on failure.
+ */
+declare function fetchHeaders(url: string, options?: FetchExtractorOptions): Promise<Headers | null>;
+/**
+ * Fetches a partial amount of content from a URL.
+ * Automatically handles character set detection from the Content-Type header.
+ * Aborts the request once the specified maxBytes is reached.
+ *
+ * @param url - The URL to fetch.
+ * @param maxBytes - The maximum number of bytes to read. Defaults to 32KB.
+ * @param options - Request options.
+ * @returns An object containing the decoded content string and the response headers.
+ */
+declare function fetchPartial(url: string, maxBytes?: number, options?: FetchExtractorOptions): Promise<{
+    content: string;
+    headers: Headers;
+} | null>;
+/**
+ * Represents structured data extracted from an HTML document.
+ */
+interface HtmlData {
+    /** Map of meta tag names/properties to their content. Keys are lowercase. */
+    meta: Record<string, string>;
+    /** Array of parsed JSON-LD objects found in the document. */
+    jsonLd: any[];
+    /** Array of data from HTML <time> tags. */
+    time: Array<{
+        /** The value of the 'datetime' attribute, if present. */
+        datetime: string | null;
+        /** The text content within the <time> tag, with HTML stripped. */
+        text: string;
+    }>;
 }
+/**
+ * Converts a Web API Headers object into a plain JavaScript record.
+ * All header names are converted to lowercase for consistent access.
+ *
+ * @param headers - The Headers object to parse.
+ * @returns A record where keys are lowercase header names.
+ */
+declare function parseHeaders(headers: Headers): Record<string, string>;
+/**
+ * Parses an HTML string to extract generic metadata structures (Meta tags, JSON-LD, Time tags).
+ * This function does not perform field-specific logic (like finding a date); it simply
+ * collects available structured data.
+ *
+ * @param html - The raw HTML content to parse.
+ * @returns An object containing grouped metadata from the HTML.
+ */
+declare function parseHtml(html: string): HtmlData;
+/**
+ * Result object for generic metadata extraction.
+ */
+interface MetadataResult {
+    /** The extracted and normalized date, if any. */
+    date?: string | null;
+    /** Placeholders for future metadata fields. */
+    [key: string]: any;
+}
+/**
+ * Supported metadata types for extraction.
+ */
+type MetadataType = 'date' | string;
+/**
+ * Extracts specific metadata from parsed HTML and headers based on a requested type.
+ * Currently supports 'date' extraction with a prioritized fallback mechanism.
+ *
+ * @param result - An object containing the raw HTML content and response headers.
+ * @param type - The type of metadata to extract.
+ * @returns The extracted and normalized value, or null if not found.
+ */
+declare function extractMetadataFrom(result: {
+    content: string;
+    headers: Headers;
+}, type: MetadataType): string | null;
+/**
+ * Normalizes a date string into a standard ISO 8601 format (UTC).
+ * It handles various formats (YYYY-MM-DD, RFC2822, etc.) and performs
+ * aggressive cleaning and sanity checks.
+ *
+ * @param dateStr - The raw date string to normalize.
+ * @returns An ISO 8601 string (e.g., "2024-01-20T00:00:00.000Z") or null if invalid.
+ */
+declare function normalizeDate(dateStr: string | null): string | null;
+/**
+ * Options for the extractDate function.
+ */
+interface ExtractOptions extends FetchExtractorOptions {
+    /**
+     * Maximum number of bytes to download from the URL.
+     * Defaults to 32768 (32KB), which is usually enough for the HTML <head>.
+     */
+    maxBytes?: number;
+}
+/**
+ * High-level convenience function to extract the publication or modification date from a URL.
+ * It performs a partial fetch of the content and applies multiple extraction rules
+ * (LD+JSON, Meta tags, Time tags, Headers) to find the most reliable date.
+ *
+ * @param url - The web page URL to analyze.
+ * @param options - Fetch and extraction options.
+ * @returns An ISO 8601 date string, or null if no valid date could be found.
+ *
+ * @example
+ * ```ts
+ * const date = await extractDate('https://example.com/article');
+ * console.log(date); // "2024-01-20T12:00:00.000Z"
+ * ```
+ */
+declare function extractDate(url: string, options?: ExtractOptions): Promise<string | null>;
+interface VerifiedUrl {
+    url: string;
+    latency: number;
+}
+/**
+ * A general utility to test a list of URLs for availability and latency.
+ * Returns a list of verified URLs sorted by response time.
+ */
+declare function testUrlsByLatency(urls: string[], options?: {
+    timeout?: number;
+    limit?: number;
+    testPath?: string;
+}): Promise<VerifiedUrl[]>;
 /**
  * Constructor definition for Searcher subclasses.
  */
 type SearcherConstructor = new (options?: FetcherOptions) => WebSearcher;
 /**
  * The abstract base class for all search engines.
  *
@@ -176,6 +358,10 @@ declare abstract class WebSearcher extends FetchSession {
      * Useful for registering shorthand names (e.g., 'g' for 'Google').
      */
     static alias?: string | string[];
+    /** Default base URLs for engines that support multiple instances. */
+    static defaultBaseUrls?: string[];
+    /** Globally shared index for tracking the currently active instance (node) across sessions. */
+    static currentInstanceIndex?: number;
     /**
      * Registers a search engine class.
      *
@@ -219,23 +405,25 @@ declare abstract class WebSearcher extends FetchSession {
      */
     static setAliases: (ctor: typeof WebSearcher, ...aliases: string[]) => void;
     /**
-     * Static helper to execute a one-off search.
+     * Static helper to execute a one-off search or a fallback chain.
      *
-     * It creates an instance of the specified engine, executes the search, and then
-     * automatically disposes of the session.
+     * It creates an instance of the specified engine(s), executes the search, and automatically
+     * falls back to the next engine in the list if the current one fails or is exhausted.
      *
-     * @param engineName - The name of the engine to use (e.g., 'Google').
+     * @param engineNames - The name(s) of the engine(s) to use (e.g., 'Google' or ['SearXNG', 'Google']).
      * @param query - The search query string.
      * @param options - Combined search options and fetcher options.
      * @returns A promise resolving to an array of standardized search results.
      */
-    static search(engineName: string, query: string, options?: SearchOptions & FetcherOptions): Promise<StandardSearchResult[]>;
+    static search(engineNames: string | string[], query: string, options?: SearchOptions & FetcherOptions): Promise<StandardSearchResult[]>;
     /**
      * The declarative template for the fetch options.
      *
-     * Subclasses **must** implement this getter to provide the engine configuration,
+     * Subclasses can implement this getter to provide the engine configuration,
      * including the base URL, search parameters pattern, and extraction rules.
      *
+     * This getter is **optional** if you override {@link getTemplate}.
+     *
      * Supports variable injection using syntax like `${query}`, `${offset}`, etc.
      *
      * @example
@@ -248,7 +436,7 @@ declare abstract class WebSearcher extends FetchSession {
      * }
      * ```
      */
-    abstract get template(): FetcherOptions;
+    get template(): FetcherOptions;
     /**
      * Optional pagination configuration.
      * Defines how the searcher navigates to subsequent pages.
@@ -256,18 +444,39 @@ declare abstract class WebSearcher extends FetchSession {
      * If undefined, the searcher will only fetch the first page.
      */
     get pagination(): PaginationConfig | undefined;
+    /**
+     * Dynamically retrieves the fetch template based on current variables and search options.
+     *
+     * Subclasses can override this method to return different extraction rules (actions)
+     * or URL patterns based on the search category, region, or other parameters.
+     *
+     * @param variables - The calculated variables (from formatOptions, pagination, etc.).
+     * @param options - The original search options provided by the user.
+     * @returns The fetcher configuration to be used for the current request.
+     */
+    protected getTemplate(variables: Record<string, any>, options: SearchOptions): FetcherOptions;
     protected createContext(options?: FetcherOptions): _isdk_web_fetcher.FetchContext;
     /**
      * Executes a search query.
      *
-     * This method handles the pagination loop, variable injection, fetching,
-     * and result transformation.
+     * This method handles the pagination loop, multi-instance failover, variable injection,
+     * fetching, and result transformation.
      *
      * @param query - The search query string.
      * @param options - Optional search parameters (e.g., limit, timeRange).
      * @returns A promise resolving to an array of standardized search results.
      */
     search(query: string, options?: SearchOptions): Promise<StandardSearchResult[]>;
+    /**
+     * Hook for subclasses to validate fetched results before they are accepted.
+     * If this returns false, the instance manager will consider the fetch a failure
+     * and automatically switch to the next available baseUrl (if any).
+     *
+     * @param results - The extracted results.
+     * @param context - Context including the current baseUrl and page.
+     * @returns A promise resolving to true if valid, false otherwise.
+     */
+    protected validateFetchResult(results: StandardSearchResult[], context: SearchContext): Promise<boolean>;
     /**
      * Transform and clean the raw extracted results.
      *
@@ -347,4 +556,4 @@ declare class GoogleSearcher extends WebSearcher {
     protected transform(outputs: Record<string, any>): Promise<any[]>;
 }
-export { type CustomTimeRange, GoogleSearcher, type PaginationConfig, type SafeSearchLevel, type SearchCategory, type SearchContext, type SearchOptions, type SearchTimeRange, type SearchTimeRangePreset, type SearcherConstructor, type StandardSearchResult, WebSearcher };
+export { type CustomTimeRange, type ExtractOptions, type FetchExtractorOptions, GoogleSearcher, type HtmlData, type MetadataResult, type MetadataType, type PaginationConfig, type SafeSearchLevel, type SearchCategory, type SearchContext, type SearchOptions, type SearchTimeRange, type SearchTimeRangePreset, type SearcherConstructor, type StandardSearchResult, type VerifiedUrl, WebSearcher, extractDate, extractMetadataFrom, fetchHeaders, fetchPartial, normalizeDate, parseHeaders, parseHtml, testUrlsByLatency };

package/dist/index.d.ts CHANGED Viewed

@@ -1,5 +1,6 @@
 import * as _isdk_web_fetcher from '@isdk/web-fetcher';
 import { FetcherOptions, FetchSession } from '@isdk/web-fetcher';
+export { FetcherOptions } from '@isdk/web-fetcher';
 import { IBaseFactoryOptions } from 'custom-factory';
 /**
@@ -83,8 +84,14 @@ interface SearchContext {
     page: number;
     /** The requested limit of results. */
     limit?: number;
+    /** Allows for custom variables passed via search options. */
+    [key: string]: any;
+    /** The baseUrl used for this specific fetch (if multi-instance is enabled) */
+    baseUrl?: string;
+    /** The name of the engine executing the search */
+    engine?: string;
 }
-type SearchTimeRangePreset = 'all' | 'day' | 'week' | 'month' | 'year';
+type SearchTimeRangePreset = 'all' | 'hour' | 'day' | 'week' | 'month' | 'year';
 interface CustomTimeRange {
     /** Start date (Date object or string like 'YYYY-MM-DD'). */
     from: Date | string;
@@ -92,7 +99,7 @@ interface CustomTimeRange {
     to?: Date | string;
 }
 type SearchTimeRange = SearchTimeRangePreset | CustomTimeRange;
-type SearchCategory = 'all' | 'images' | 'videos' | 'news';
+type SearchCategory = 'all' | 'images' | 'videos' | 'news' | string;
 type SafeSearchLevel = 'off' | 'moderate' | 'strict';
 /**
  * Options provided when executing a search.
@@ -139,12 +146,187 @@ interface SearchOptions {
     transform?: (results: StandardSearchResult[], context: SearchContext) => Promise<StandardSearchResult[]> | StandardSearchResult[];
     /** Any other custom variables to be injected into the template. */
     [key: string]: any;
+    /**
+     * Allows the user to dynamically specify or override the base URLs for the engines.
+     * Can be an array of URLs for a single engine, or a map of engine names to URL arrays.
+     */
+    baseUrls?: string[] | Record<string, string[]>;
+    /**
+     * User-defined callback to validate the fetched results for a page.
+     * If it returns false, the fetch is considered a failure, triggering the retry/failover mechanism.
+     */
+    validator?: (results: StandardSearchResult[], context: SearchContext) => boolean | Promise<boolean>;
+    /**
+     * If true (default), the searcher will attempt to fulfill the requested `limit`
+     * by falling back to subsequent engines in the chain if previous ones are exhausted.
+     * If false, it will stop after the first successful engine regardless of whether
+     * the limit was reached.
+     */
+    fillLimit?: boolean;
+    /**
+     * Specifies which page index to start the search from.
+     * Useful when delegating pagination across different sessions.
+     * @default 0
+     */
+    startPage?: number;
+}
+/**
+ * Options for network requests.
+ */
+interface FetchExtractorOptions {
+    /** Timeout in milliseconds. Defaults vary by function (5s to 10s). */
+    timeout?: number;
+    /** Custom HTTP headers to include in the request. */
+    headers?: Record<string, string>;
+}
+/**
+ * Fetches only the HTTP headers for a given URL using a HEAD request.
+ * Useful for checking 'last-modified' without downloading the body.
+ *
+ * @param url - The URL to check.
+ * @param options - Request options.
+ * @returns The Headers object, or null on failure.
+ */
+declare function fetchHeaders(url: string, options?: FetchExtractorOptions): Promise<Headers | null>;
+/**
+ * Fetches a partial amount of content from a URL.
+ * Automatically handles character set detection from the Content-Type header.
+ * Aborts the request once the specified maxBytes is reached.
+ *
+ * @param url - The URL to fetch.
+ * @param maxBytes - The maximum number of bytes to read. Defaults to 32KB.
+ * @param options - Request options.
+ * @returns An object containing the decoded content string and the response headers.
+ */
+declare function fetchPartial(url: string, maxBytes?: number, options?: FetchExtractorOptions): Promise<{
+    content: string;
+    headers: Headers;
+} | null>;
+/**
+ * Represents structured data extracted from an HTML document.
+ */
+interface HtmlData {
+    /** Map of meta tag names/properties to their content. Keys are lowercase. */
+    meta: Record<string, string>;
+    /** Array of parsed JSON-LD objects found in the document. */
+    jsonLd: any[];
+    /** Array of data from HTML <time> tags. */
+    time: Array<{
+        /** The value of the 'datetime' attribute, if present. */
+        datetime: string | null;
+        /** The text content within the <time> tag, with HTML stripped. */
+        text: string;
+    }>;
 }
+/**
+ * Converts a Web API Headers object into a plain JavaScript record.
+ * All header names are converted to lowercase for consistent access.
+ *
+ * @param headers - The Headers object to parse.
+ * @returns A record where keys are lowercase header names.
+ */
+declare function parseHeaders(headers: Headers): Record<string, string>;
+/**
+ * Parses an HTML string to extract generic metadata structures (Meta tags, JSON-LD, Time tags).
+ * This function does not perform field-specific logic (like finding a date); it simply
+ * collects available structured data.
+ *
+ * @param html - The raw HTML content to parse.
+ * @returns An object containing grouped metadata from the HTML.
+ */
+declare function parseHtml(html: string): HtmlData;
+/**
+ * Result object for generic metadata extraction.
+ */
+interface MetadataResult {
+    /** The extracted and normalized date, if any. */
+    date?: string | null;
+    /** Placeholders for future metadata fields. */
+    [key: string]: any;
+}
+/**
+ * Supported metadata types for extraction.
+ */
+type MetadataType = 'date' | string;
+/**
+ * Extracts specific metadata from parsed HTML and headers based on a requested type.
+ * Currently supports 'date' extraction with a prioritized fallback mechanism.
+ *
+ * @param result - An object containing the raw HTML content and response headers.
+ * @param type - The type of metadata to extract.
+ * @returns The extracted and normalized value, or null if not found.
+ */
+declare function extractMetadataFrom(result: {
+    content: string;
+    headers: Headers;
+}, type: MetadataType): string | null;
+/**
+ * Normalizes a date string into a standard ISO 8601 format (UTC).
+ * It handles various formats (YYYY-MM-DD, RFC2822, etc.) and performs
+ * aggressive cleaning and sanity checks.
+ *
+ * @param dateStr - The raw date string to normalize.
+ * @returns An ISO 8601 string (e.g., "2024-01-20T00:00:00.000Z") or null if invalid.
+ */
+declare function normalizeDate(dateStr: string | null): string | null;
+/**
+ * Options for the extractDate function.
+ */
+interface ExtractOptions extends FetchExtractorOptions {
+    /**
+     * Maximum number of bytes to download from the URL.
+     * Defaults to 32768 (32KB), which is usually enough for the HTML <head>.
+     */
+    maxBytes?: number;
+}
+/**
+ * High-level convenience function to extract the publication or modification date from a URL.
+ * It performs a partial fetch of the content and applies multiple extraction rules
+ * (LD+JSON, Meta tags, Time tags, Headers) to find the most reliable date.
+ *
+ * @param url - The web page URL to analyze.
+ * @param options - Fetch and extraction options.
+ * @returns An ISO 8601 date string, or null if no valid date could be found.
+ *
+ * @example
+ * ```ts
+ * const date = await extractDate('https://example.com/article');
+ * console.log(date); // "2024-01-20T12:00:00.000Z"
+ * ```
+ */
+declare function extractDate(url: string, options?: ExtractOptions): Promise<string | null>;
+interface VerifiedUrl {
+    url: string;
+    latency: number;
+}
+/**
+ * A general utility to test a list of URLs for availability and latency.
+ * Returns a list of verified URLs sorted by response time.
+ */
+declare function testUrlsByLatency(urls: string[], options?: {
+    timeout?: number;
+    limit?: number;
+    testPath?: string;
+}): Promise<VerifiedUrl[]>;
 /**
  * Constructor definition for Searcher subclasses.
  */
 type SearcherConstructor = new (options?: FetcherOptions) => WebSearcher;
 /**
  * The abstract base class for all search engines.
  *
@@ -176,6 +358,10 @@ declare abstract class WebSearcher extends FetchSession {
      * Useful for registering shorthand names (e.g., 'g' for 'Google').
      */
     static alias?: string | string[];
+    /** Default base URLs for engines that support multiple instances. */
+    static defaultBaseUrls?: string[];
+    /** Globally shared index for tracking the currently active instance (node) across sessions. */
+    static currentInstanceIndex?: number;
     /**
      * Registers a search engine class.
      *
@@ -219,23 +405,25 @@ declare abstract class WebSearcher extends FetchSession {
      */
     static setAliases: (ctor: typeof WebSearcher, ...aliases: string[]) => void;
     /**
-     * Static helper to execute a one-off search.
+     * Static helper to execute a one-off search or a fallback chain.
      *
-     * It creates an instance of the specified engine, executes the search, and then
-     * automatically disposes of the session.
+     * It creates an instance of the specified engine(s), executes the search, and automatically
+     * falls back to the next engine in the list if the current one fails or is exhausted.
      *
-     * @param engineName - The name of the engine to use (e.g., 'Google').
+     * @param engineNames - The name(s) of the engine(s) to use (e.g., 'Google' or ['SearXNG', 'Google']).
      * @param query - The search query string.
      * @param options - Combined search options and fetcher options.
      * @returns A promise resolving to an array of standardized search results.
      */
-    static search(engineName: string, query: string, options?: SearchOptions & FetcherOptions): Promise<StandardSearchResult[]>;
+    static search(engineNames: string | string[], query: string, options?: SearchOptions & FetcherOptions): Promise<StandardSearchResult[]>;
     /**
      * The declarative template for the fetch options.
      *
-     * Subclasses **must** implement this getter to provide the engine configuration,
+     * Subclasses can implement this getter to provide the engine configuration,
      * including the base URL, search parameters pattern, and extraction rules.
      *
+     * This getter is **optional** if you override {@link getTemplate}.
+     *
      * Supports variable injection using syntax like `${query}`, `${offset}`, etc.
      *
      * @example
@@ -248,7 +436,7 @@ declare abstract class WebSearcher extends FetchSession {
      * }
      * ```
      */
-    abstract get template(): FetcherOptions;
+    get template(): FetcherOptions;
     /**
      * Optional pagination configuration.
      * Defines how the searcher navigates to subsequent pages.
@@ -256,18 +444,39 @@ declare abstract class WebSearcher extends FetchSession {
      * If undefined, the searcher will only fetch the first page.
      */
     get pagination(): PaginationConfig | undefined;
+    /**
+     * Dynamically retrieves the fetch template based on current variables and search options.
+     *
+     * Subclasses can override this method to return different extraction rules (actions)
+     * or URL patterns based on the search category, region, or other parameters.
+     *
+     * @param variables - The calculated variables (from formatOptions, pagination, etc.).
+     * @param options - The original search options provided by the user.
+     * @returns The fetcher configuration to be used for the current request.
+     */
+    protected getTemplate(variables: Record<string, any>, options: SearchOptions): FetcherOptions;
     protected createContext(options?: FetcherOptions): _isdk_web_fetcher.FetchContext;
     /**
      * Executes a search query.
      *
-     * This method handles the pagination loop, variable injection, fetching,
-     * and result transformation.
+     * This method handles the pagination loop, multi-instance failover, variable injection,
+     * fetching, and result transformation.
      *
      * @param query - The search query string.
      * @param options - Optional search parameters (e.g., limit, timeRange).
      * @returns A promise resolving to an array of standardized search results.
      */
     search(query: string, options?: SearchOptions): Promise<StandardSearchResult[]>;
+    /**
+     * Hook for subclasses to validate fetched results before they are accepted.
+     * If this returns false, the instance manager will consider the fetch a failure
+     * and automatically switch to the next available baseUrl (if any).
+     *
+     * @param results - The extracted results.
+     * @param context - Context including the current baseUrl and page.
+     * @returns A promise resolving to true if valid, false otherwise.
+     */
+    protected validateFetchResult(results: StandardSearchResult[], context: SearchContext): Promise<boolean>;
     /**
      * Transform and clean the raw extracted results.
      *
@@ -347,4 +556,4 @@ declare class GoogleSearcher extends WebSearcher {
     protected transform(outputs: Record<string, any>): Promise<any[]>;
 }
-export { type CustomTimeRange, GoogleSearcher, type PaginationConfig, type SafeSearchLevel, type SearchCategory, type SearchContext, type SearchOptions, type SearchTimeRange, type SearchTimeRangePreset, type SearcherConstructor, type StandardSearchResult, WebSearcher };
+export { type CustomTimeRange, type ExtractOptions, type FetchExtractorOptions, GoogleSearcher, type HtmlData, type MetadataResult, type MetadataType, type PaginationConfig, type SafeSearchLevel, type SearchCategory, type SearchContext, type SearchOptions, type SearchTimeRange, type SearchTimeRangePreset, type SearcherConstructor, type StandardSearchResult, type VerifiedUrl, WebSearcher, extractDate, extractMetadataFrom, fetchHeaders, fetchPartial, normalizeDate, parseHeaders, parseHtml, testUrlsByLatency };

package/dist/index.js CHANGED Viewed

	@@ -1 +1 @@
1	- "use strict";var t,e=Object.defineProperty,r=Object.getOwnPropertyDescriptor,s=Object.getOwnPropertyNames,a=Object.prototype.hasOwnProperty,i={};((t,r)=>{for(~~var~~ s in r)e(t,s,{~~get~~:r[s],enumerable:!0})})(i,{GoogleSearcher:()=>f,WebSearcher:()=>h}),module.exports=(t=i,((t,i,n,o)=>{if(i&&"object"==typeof i\|\|"function"==typeof i)for(let c of s(i))a.call(t,c)\|\|c===n\|\|e(t,c,{get:()=>i[c],enumerable:!(o=r(i,c))\|\|o.enumerable});return t})(e({},"__esModule",{value:!0}),t));var n=require("@isdk/web-fetcher"),o=require("custom-factory"),c=require("lodash-es");function l(t,e){if("string"==typeof t)return t.replace(/\$\{(.?)\}/g,(t,r)=>{const s=e[r.trim()];return void 0!==s?String(s):""});if(Array.isArray(t))return t.map(t=>l(t,e));if((0,c.isPlainObject)(t)){const r={};for(const s in t)Object.prototype.hasOwnProperty.call(t,s)&&(r[s]=l(t[s],e));return r}return t}var u=require("lodash-es"),h=class extends n.FetchSession{static async search(t,e,r={}){const s=this.createObject(t,r);if(!s)throw new Error(`Search engine not found: ${t}`);try{~~return~~ await s.search(e,r)}finally{await s.dispose()}}get pagination(){}createContext(t=this.options){const e=this.template,r=(0~~,u.~~defaultsDeep)({},e,t);return e.engine&&"auto"!==e.engine\|\|!t.engine\|\|(r.engine=t.engine),super.createContext(r)}async search(t,e={}){const r=e.limit\|\|10,s=[];let a=0;const i=this.pagination?.startValue??0,n=this.pagination?.increment??1,o=e.maxPages\|\|this.pagination?.maxPages\|\|10;for(;s.length<r;){const c=this.formatOptions(e),h=i+an,f={...e,...c,query:t,page:a+i,offset:h,limit:r},m=l(this.~~template~~,f),d=(0~~,u.~~defaultsDeep)({},m,e),g=[];if(0===a\|\|"url-param"===this.pagination?.type?d.url&&g.push({id:"goto",params:{url:d.url}}):"click-next"===this.pagination?.type&&this.pagination.nextButtonSelector&&(g.push({id:"click",params:{selector:this.pagination.nextButtonSelector}}),g.push({id:"waitFor",params:{networkIdle:!0,ms:500}}))~~,d.actions){const 
t=d.actions.filter(t=>!(g.length>0&&"goto"===g[0].id&&"goto"===t.id))~~;g.push(...t)}d.engine&&this.context.engine!==d.engine&&d.engine;const{outputs:p}=await this.executeAll(g),w={query:t,page:a,~~limit~~:~~e.limit~~};let y=[];if(y=await this.~~transform~~(p,w),e.~~transform~~&&(y=await e.~~transform~~(y,w)),!~~y\|\|~~0~~===y~~.length)~~break;if~~(s.push(~~...y~~),s.length>=r\|\|!this.pagination)break;if(a++,a>=o)break}return s.slice(0,r)}async transform(t,e){return t.results\|\|[]}formatOptions(t){return{...t}}};h._isFactory=!1,(0,o.addBaseFactoryAbility)(h),h.prototype.name="Searcher";var f=class extends h{get template(){return{engine:"browser",browser:{headless:!1},url:"https://www.google.com/search?q=${query}&start=${offset}&tbs=${tbs}&tbm=${tbm}&gl=${gl}&hl=${hl}&safe=${safe}",actions:[{id:"extract",storeAs:"results",params:{type:"array",selector:"#main #search",items:{url:{selector:"a:has(h3)",attribute:"href",required:!0},title:{selector:"a:has(h3) h3",required:!0,mode:"innerText"},snippet:{selector:"div[style*='-webkit-line-clamp']",type:"html"}}}}]}}get pagination(){return{type:"url-param",paramName:"start",startValue:0,increment:10}}formatOptions(t){const e={};if(t.timeRange)if("string"==typeof t.timeRange){const r={day:"qdr:d",week:"qdr:w",month:"qdr:m",year:"qdr:y"};r[t.timeRange]&&(e.tbs=r[t.timeRange])}else{const r=new Date(t.timeRange.from),s=t.timeRange.to?new Date(t.timeRange.to):new Date;if(!isNaN(r.getTime())&&!isNaN(s.getTime())){const t=t=>`${t.getMonth()+1}/${t.getDate()}/${t.getFullYear()}`;e.tbs=`cdr:1,cd_min:${t(r)},cd_max:${t(s)}`}}if(t.category){const r={images:"isch",videos:"vid",news:"nws"};r[t.category]&&(e.tbm=r[t.category])}return t.region&&(e.gl=t.region),t.language&&(e.hl=t.language),t.safeSearch&&("strict"===t.safeSearch?e.safe="active":"off"===t.safeSearch&&(e.safe="images")),e}async transform(t){const e=t.results\|\|[];return Array.isArray(e)?e.map(t=>{if(t.url&&t.url.startsWith("/url?q="))try{const e=new 
URL(t.url,"https://www.google.com").searchParams.get("q");e&&(t.url=e)}catch(t){}return t}):[]}};f.alias=["google"];
1	+ "use strict";var t,e=Object.defineProperty,r=Object.getOwnPropertyDescriptor,n=Object.getOwnPropertyNames,s=Object.prototype.hasOwnProperty,i={};async function a(t,e={}){const{timeout:r=5e3,headers:n}=e,s=new AbortController,i=setTimeout(()=>s.abort(),r);try{return(await fetch(t,{method:"HEAD",signal:s.signal,headers:{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",...n}})).headers}catch(t){return null}finally{clearTimeout(i)}}async function o(t,e=32768,r={}){const{timeout:n=1e4,headers:s}=r,i=new AbortController,a=setTimeout(()=>i.abort(),n);let o="",c=new Headers;try{const r=await fetch(t,{signal:i.signal,headers:{"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",...s}});if(c=r.headers,!r.ok\|\|!r.body)return null;const n=r.headers.get("content-type"),a=n?.match(/charset=([\w-]+)/i),l=a?a[1]:"utf-8",u=r.body.getReader(),f=new TextDecoder(l);let d=0;for(;;)try{const{done:t,value:r}=await u.read();if(t)break;if(d+=r.length,o+=f.decode(r,{stream:!0}),d>=e){i.abort();break}}catch(t){if("AbortError"===t.name)break;throw t}return{content:o,headers:c}}catch(t){return o.length>0?{content:o,headers:c}:null}finally{clearTimeout(a)}}function c(t){const e={};return t.forEach((t,r)=>{e[r.toLowerCase()]=t}),e}function l(t){const e={meta:{},jsonLd:[],time:[]},r=/<meta\s+([^>]+?)>/gi;let n;for(;null!==(n=r.exec(t));){const t=f(n[1]),r=t.name\|\|t.property\|\|t.itemprop,s=t.content;r&&s&&(e.meta[r.toLowerCase()]=s)}const s=/<script\s+[^>]?type\s=\s["']application\/ld\+json["'][^>]>([\s\S]?)<\/script>/gi;for(;null!==(n=s.exec(t));){const t=n[1];try{const r=JSON.parse(t);e.jsonLd.push(r)}catch(r){const n=u(t);n&&e.jsonLd.push(n)}}const i=/<time([^>]?)>([\s\S]?)<\/time>/gi;for(;null!==(n=i.exec(t));){const t=f(n[1]).datetime,r=n[2].replace(/<[^>]>/g,"").trim();e.time.push({datetime:t,text:r})}return e}function u(t){const 
e=["datePublished","dateModified","pubDate","publishedAt"],r={};let n=!1;for(const s of e){const e=new RegExp(`"${s}"\\s:\\s"([^"]+)"`,"i"),i=t.match(e);i&&(r[s]=i[1],n=!0)}return n?r:null}function f(t){const e={},r=/([a-z0-9:._-]+)(?:\s=\s(?:(?:"([^"])")\|(?:'([^'])')\|([^>\s]+)))?/gi;let n;for(;null!==(n=r.exec(t));){const t=n[1].toLowerCase(),r=n[2]??n[3]??n[4]??"";e[t]=r}return e}function d(t){if(!t)return null;try{let e=t.trim();if(!e)return null;e=e.replace(/^(?:last\|first\|posted\|originally)\s(?:published\|updated\|date\|posted\|modified)\s(?:on\|at)?[:\s]/i,""),e=e.replace(/^(?:published\|updated\|date\|posted\|modified)\s(?:on\|at)?[:\s]/i,""),e=e.split(/[\(\|\\|]\|by\s+\|[-–—]\s\d+\smin/i)[0].trim();const r=new Date(e);if(!isNaN(r.getTime())){const t=r.getUTCFullYear(),e=(new Date).getUTCFullYear();if(t>=-1e4&&t<=e+20)return r.toISOString()}}catch(t){}return null}function h(t,e){const r=l(t.content);return"date"===e?function(t,e){const r=function(t){const e=["datePublished","dateModified","pubDate","publishedAt"],r=t=>{if(!t\|\|"object"!=typeof t)return null;for(const r of e)if("string"==typeof t[r])return t[r];if(Array.isArray(t))for(const e of t){const t=r(e);if(t)return t}else if(t["@graph"]&&Array.isArray(t["@graph"]))return r(t["@graph"]);return null};return r(t)}(t.jsonLd),n=d(r);if(n)return n;const s=function(t){const e=["article:published_time","og:published_time","datepublished","date","pubdate","publishdate","dc.date.issued","bt:pubdate","sailthru.date","article:modified_time","og:updated_time","modifieddate"];for(const r of e)if(t[r])return t[r];return null}(t.meta),i=d(s);if(i)return i;for(const e of t.time){const t=d(e.datetime\|\|e.text);if(t)return t}const a=c(e);return d(a["last-modified"])}(r,t.headers):null}async function m(t,e={}){const r=await o(t,e.maxBytes,e);return r?h(r,"date"):null}((t,r)=>{for(var n in 
r)e(t,n,{get:r[n],enumerable:!0})})(i,{FetcherOptions:()=>y.FetcherOptions,GoogleSearcher:()=>A,WebSearcher:()=>q,extractDate:()=>m,extractMetadataFrom:()=>h,fetchHeaders:()=>a,fetchPartial:()=>o,normalizeDate:()=>d,parseHeaders:()=>c,parseHtml:()=>l,testUrlsByLatency:()=>b}),module.exports=(t=i,((t,i,a,o)=>{if(i&&"object"==typeof i\|\|"function"==typeof i)for(let c of n(i))s.call(t,c)\|\|c===a\|\|e(t,c,{get:()=>i[c],enumerable:!(o=r(i,c))\|\|o.enumerable});return t})(e({},"__esModule",{value:!0}),t));var p=require("@isdk/web-fetcher");async function b(t,e={}){const{timeout:r=5e3,limit:n=20,testPath:s=""}=e;return(await Promise.all(t.map(async t=>{const e=Date.now();try{const n=s?(t.endsWith("/")?t.slice(0,-1):t)+(s.startsWith("/")?s:"/"+s):t;return await(0,p.fetchWeb)(n,{timeoutMs:r}),{url:t,latency:Date.now()-e}}catch(t){return null}}))).filter(t=>null!==t).sort((t,e)=>t.latency-e.latency).slice(0,n)}var y=require("@isdk/web-fetcher"),w=require("custom-factory"),g=require("lodash-es");function k(t,e){if("string"==typeof t)return t.replace(/\$\{(.?)\}/g,(t,r)=>{const n=e[r.trim()];return void 0!==n?String(n):""});if(Array.isArray(t))return t.map(t=>k(t,e));if((0,g.isPlainObject)(t)){const r={};for(const n in t)Object.prototype.hasOwnProperty.call(t,n)&&(r[n]=k(t[n],e));return r}return t}var $=require("lodash-es"),q=class extends y.FetchSession{static async search(t,e,r={}){const n=Array.isArray(t)?t:[t],s=r.limit\|\|10,i=r.fillLimit??!0,a=[];for(let t=0;t<n.length;t++){const o=n[t];if(a.length>=s)break;const c=s-a.length,l={...r,limit:c},u=this.createObject(o,l);if(!u)throw new Error(`Search engine not found: ${o}`);try{const t=await u.search(e,l);for(const e of t)e.url&&!a.some(t=>t.url===e.url)&&a.push(e);if(a.length>=s)break;if(!1===i)break}catch(e){if(console.warn(`[WebSearcher] Engine '${o}' failed completely:`,e),t===n.length-1&&0===a.length)throw e}finally{await u.dispose()}}return a}get template(){return{}}get 
pagination(){}getTemplate(t,e){return(0,$.cloneDeep)(this.template)}createContext(t=this.options){const{actions:e,...r}=this.template,n=(0,$.defaultsDeep)({},r,t);return r.engine&&"auto"!==r.engine\|\|!t.engine\|\|(n.engine=t.engine),super.createContext(n)}async search(t,e={}){const r=e.limit\|\|10,n=[],s=new Set;let i=e.startPage\|\|0;const a=this.pagination?.startValue??0,o=this.pagination?.increment??1,c=e.maxPages\|\|this.pagination?.maxPages\|\|10,l=this.constructor.name;let u;e.baseUrls&&(Array.isArray(e.baseUrls)?u=e.baseUrls:"object"==typeof e.baseUrls&&(u=e.baseUrls[l]\|\|e.baseUrls[this.constructor.alias?.[0]])),u&&0!==u.length\|\|(u=this.constructor.defaultBaseUrls);const f=u&&u.length>0;let d=0;f&&"number"==typeof this.constructor.currentInstanceIndex&&(d=this.constructor.currentInstanceIndex);let h=!1;for(;n.length<r;){let m=!1,p=null;const b=f?u.length:1;let y=0;for(;y<b;){const c=f?u[d]:void 0,b=this.formatOptions(e),w=a+io,g={...e,...b,query:t,page:i+a,offset:w,limit:r,baseUrl:c?.endsWith("/")?c.slice(0,-1):c},q=k(this.getTemplate(g,e),g),{actions:A,...v}=e,x=(0,$.defaultsDeep)({},q,v),D=[],S=x.actions\|\|[];if(i===(e.startPage\|\|0)\|\|"url-param"===this.pagination?.type){if(x.url){S.some(t=>"goto"===(t.id??t.name??t.action)&&t.params?.url===x.url)\|\|D.push({id:"goto",params:{url:x.url}})}}else"click-next"===this.pagination?.type&&this.pagination.nextButtonSelector&&(D.push({id:"click",params:{selector:this.pagination.nextButtonSelector}}),D.push({id:"waitFor",params:{networkIdle:!0,ms:500}}));D.push(...S),x.engine&&this.context.engine!==x.engine&&x.engine;try{const{outputs:r}=await this.executeAll(D,e),a={...e,query:t,page:i,baseUrl:c,engine:l};let o=await this.transform(r,a);e.transform&&(o=await e.transform(o,a));let u=!0;if(this.validateFetchResult&&(u=await this.validateFetchResult(o,a)),u&&e.validator&&(u=await e.validator(o,a)),!u)throw new Error(`Results validation failed for engine: ${l}, url: ${c}`);if(o&&0!==o.length)for(const t of 
o)t.url&&!s.has(t.url)&&(s.add(t.url),n.push(t));else h=!0;m=!0;break}catch(t){p=t,f&&(d=(d+1)%u.length,this.constructor.currentInstanceIndex=d),y++}}if(!m)throw p\|\|new Error(`All instances failed for engine: ${l}`);if(h)break;if(n.length>=r\|\|!this.pagination)break;if(i++,i>=c)break}return n.slice(0,r)}async validateFetchResult(t,e){return!0}async transform(t,e){return t.results\|\|[]}formatOptions(t){return{...t}}};q._isFactory=!1,(0,w.addBaseFactoryAbility)(q),q.prototype.name="Searcher";var A=class extends q{get template(){return{engine:"browser",browser:{headless:!1},url:"https://www.google.com/search?q=${query}&start=${offset}&tbs=${tbs}&tbm=${tbm}&gl=${gl}&hl=${hl}&safe=${safe}",actions:[{id:"extract",storeAs:"results",params:{type:"array",selector:"#main #search",items:{url:{selector:"a:has(h3)",attribute:"href",required:!0},title:{selector:"a:has(h3) h3",required:!0,mode:"innerText"},snippet:{selector:"div[style='-webkit-line-clamp']",type:"html"}}}}]}}get pagination(){return{type:"url-param",paramName:"start",startValue:0,increment:10}}formatOptions(t){const e={};if(t.timeRange)if("string"==typeof t.timeRange){const r={hour:"qdr:h",day:"qdr:d",week:"qdr:w",month:"qdr:m",year:"qdr:y"};r[t.timeRange]&&(e.tbs=r[t.timeRange])}else{const r=new Date(t.timeRange.from),n=t.timeRange.to?new Date(t.timeRange.to):new Date;if(!isNaN(r.getTime())&&!isNaN(n.getTime())){const t=t=>`${t.getMonth()+1}/${t.getDate()}/${t.getFullYear()}`;e.tbs=`cdr:1,cd_min:${t(r)},cd_max:${t(n)}`}}if(t.category){const r={images:"isch",videos:"vid",news:"nws"};r[t.category]&&(e.tbm=r[t.category])}return t.region&&(e.gl=t.region),t.language&&(e.hl=t.language),t.safeSearch&&("strict"===t.safeSearch?e.safe="active":"off"===t.safeSearch&&(e.safe="images")),e}async transform(t){const e=t.results\|\|[];return Array.isArray(e)?e.map(t=>{if(t.url&&t.url.startsWith("/url?q="))try{const e=new URL(t.url,"https://www.google.com").searchParams.get("q");e&&(t.url=e)}catch(t){}return 
t}):[]}};A.alias=["google"];