@isdk/web-searcher 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.cn.md +274 -0
- package/README.md +274 -0
- package/dist/index.d.mts +321 -0
- package/dist/index.d.ts +321 -0
- package/dist/index.js +1 -0
- package/dist/index.mjs +1 -0
- package/docs/README.md +278 -0
- package/docs/classes/GoogleSearcher.md +695 -0
- package/docs/classes/WebSearcher.md +661 -0
- package/docs/globals.md +26 -0
- package/docs/interfaces/CustomTimeRange.md +29 -0
- package/docs/interfaces/PaginationConfig.md +86 -0
- package/docs/interfaces/SearchContext.md +41 -0
- package/docs/interfaces/SearchOptions.md +105 -0
- package/docs/interfaces/StandardSearchResult.md +58 -0
- package/docs/type-aliases/SafeSearchLevel.md +11 -0
- package/docs/type-aliases/SearchCategory.md +11 -0
- package/docs/type-aliases/SearchTimeRange.md +11 -0
- package/docs/type-aliases/SearchTimeRangePreset.md +11 -0
- package/docs/type-aliases/SearcherConstructor.md +23 -0
- package/package.json +87 -0
package/dist/index.d.mts
ADDED
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
import * as _isdk_web_fetcher from '@isdk/web-fetcher';
|
|
2
|
+
import { FetcherOptions, FetchSession } from '@isdk/web-fetcher';
|
|
3
|
+
import { IBaseFactoryOptions } from 'custom-factory';
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Interface representing a standardized search result item.
|
|
7
|
+
* This ensures consistency across different search engines.
|
|
8
|
+
*/
|
|
9
|
+
interface StandardSearchResult {
|
|
10
|
+
/** The title of the search result. */
|
|
11
|
+
title: string;
|
|
12
|
+
/** The URL of the search result. */
|
|
13
|
+
url: string;
|
|
14
|
+
/** A brief snippet or description of the result. */
|
|
15
|
+
snippet?: string;
|
|
16
|
+
/** An optional image URL associated with the result. */
|
|
17
|
+
image?: string;
|
|
18
|
+
/** Allows for engine-specific extra fields (e.g., rank, author, date). */
|
|
19
|
+
[key: string]: any;
|
|
20
|
+
}
|
|
21
|
+
/**
|
|
22
|
+
* Configuration for pagination strategies.
|
|
23
|
+
* Defines how the searcher should navigate to the next page of results.
|
|
24
|
+
*/
|
|
25
|
+
interface PaginationConfig {
|
|
26
|
+
/**
|
|
27
|
+
* The type of pagination mechanism:
|
|
28
|
+
* - 'url-param': Pagination is handled by modifying URL parameters (e.g., `?page=2` or `?start=10`).
|
|
29
|
+
* - 'click-next': Pagination is handled by clicking a "Next" button on the page (only works in 'browser' mode).
|
|
30
|
+
*/
|
|
31
|
+
type: 'url-param' | 'click-next';
|
|
32
|
+
/**
|
|
33
|
+
* The name of the URL parameter used for pagination.
|
|
34
|
+
* Required if type is 'url-param'.
|
|
35
|
+
* @example 'start' for Google, 'page' or 'p' for others.
|
|
36
|
+
*/
|
|
37
|
+
paramName?: string;
|
|
38
|
+
/**
|
|
39
|
+
* The starting value for the pagination parameter.
|
|
40
|
+
* @default 0
|
|
41
|
+
*/
|
|
42
|
+
startValue?: number;
|
|
43
|
+
/**
|
|
44
|
+
* The increment step for each page.
|
|
45
|
+
* - If the parameter represents an item offset (like Google's 'start'), this might be 10.
|
|
46
|
+
* - If the parameter represents a page number, this is usually 1.
|
|
47
|
+
* @default 1
|
|
48
|
+
*/
|
|
49
|
+
increment?: number;
|
|
50
|
+
/**
|
|
51
|
+
* The CSS selector for the "Next" page button.
|
|
52
|
+
* Required if type is 'click-next'.
|
|
53
|
+
*/
|
|
54
|
+
nextButtonSelector?: string;
|
|
55
|
+
}
|
|
56
|
+
/**
|
|
57
|
+
* Context object passed to the transform function.
|
|
58
|
+
*/
|
|
59
|
+
interface SearchContext {
|
|
60
|
+
/** The original search query. */
|
|
61
|
+
query: string;
|
|
62
|
+
/** The current page index (0-based). */
|
|
63
|
+
page: number;
|
|
64
|
+
/** The requested limit of results. */
|
|
65
|
+
limit?: number;
|
|
66
|
+
}
|
|
67
|
+
type SearchTimeRangePreset = 'all' | 'day' | 'week' | 'month' | 'year';
|
|
68
|
+
interface CustomTimeRange {
|
|
69
|
+
/** Start date (Date object or string like 'YYYY-MM-DD'). */
|
|
70
|
+
from: Date | string;
|
|
71
|
+
/** End date (Date object or string like 'YYYY-MM-DD'). Defaults to current date if omitted. */
|
|
72
|
+
to?: Date | string;
|
|
73
|
+
}
|
|
74
|
+
type SearchTimeRange = SearchTimeRangePreset | CustomTimeRange;
|
|
75
|
+
type SearchCategory = 'all' | 'images' | 'videos' | 'news';
|
|
76
|
+
type SafeSearchLevel = 'off' | 'moderate' | 'strict';
|
|
77
|
+
/**
|
|
78
|
+
* Options provided when executing a search.
|
|
79
|
+
*/
|
|
80
|
+
interface SearchOptions {
|
|
81
|
+
/** The maximum number of results to retrieve. Defaults to 10 when omitted. */
|
|
82
|
+
limit?: number;
|
|
83
|
+
/**
|
|
84
|
+
* Date range for the search results.
|
|
85
|
+
* Default: 'all'
|
|
86
|
+
*/
|
|
87
|
+
timeRange?: SearchTimeRange;
|
|
88
|
+
/**
|
|
89
|
+
* The category of results to return.
|
|
90
|
+
* Default: 'all' (web search)
|
|
91
|
+
*/
|
|
92
|
+
category?: SearchCategory;
|
|
93
|
+
/**
|
|
94
|
+
* Region code (ISO 3166-1 alpha-2) to bias results (e.g., 'US', 'CN', 'JP').
|
|
95
|
+
*/
|
|
96
|
+
region?: string;
|
|
97
|
+
/**
|
|
98
|
+
* Language code for the interface or results: an ISO 639-1 code, optionally followed by a region subtag (e.g., 'en', 'zh-CN').
|
|
99
|
+
*/
|
|
100
|
+
language?: string;
|
|
101
|
+
/**
|
|
102
|
+
* Safe search filtering level.
|
|
103
|
+
* Default: engine dependent (usually 'moderate' or 'strict').
|
|
104
|
+
*/
|
|
105
|
+
safeSearch?: SafeSearchLevel;
|
|
106
|
+
/**
|
|
107
|
+
* A custom transform function to filter or modify results at runtime.
|
|
108
|
+
* This runs AFTER the engine-level transform.
|
|
109
|
+
*/
|
|
110
|
+
transform?: (results: StandardSearchResult[], context: SearchContext) => Promise<StandardSearchResult[]> | StandardSearchResult[];
|
|
111
|
+
/** Any other custom variables to be injected into the template. */
|
|
112
|
+
[key: string]: any;
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
/**
|
|
116
|
+
* Constructor definition for Searcher subclasses.
|
|
117
|
+
*/
|
|
118
|
+
type SearcherConstructor = new (options?: FetcherOptions) => WebSearcher;
|
|
119
|
+
/**
|
|
120
|
+
* The abstract base class for all search engines.
|
|
121
|
+
*
|
|
122
|
+
* It extends `FetchSession`, meaning each `WebSearcher` instance is an active session
|
|
123
|
+
* capable of maintaining state (e.g., cookies, local storage) across multiple search queries.
|
|
124
|
+
*
|
|
125
|
+
* Developers should extend this class to create specific search engine implementations
|
|
126
|
+
* (e.g., Google, Bing, DuckDuckGo).
|
|
127
|
+
*
|
|
128
|
+
* @example
|
|
129
|
+
* ```typescript
|
|
130
|
+
* class MySearcher extends WebSearcher {
|
|
131
|
+
* get template() {
|
|
132
|
+
* return { url: '...' };
|
|
133
|
+
* }
|
|
134
|
+
* }
|
|
135
|
+
* WebSearcher.register(MySearcher);
|
|
136
|
+
* ```
|
|
137
|
+
*/
|
|
138
|
+
declare abstract class WebSearcher extends FetchSession {
|
|
139
|
+
static _isFactory: boolean;
|
|
140
|
+
/**
|
|
141
|
+
* Custom engine name. If not provided, it is derived from the class name.
|
|
142
|
+
* For example, `GoogleSearcher` becomes `Google`.
|
|
143
|
+
*/
|
|
144
|
+
static name?: string;
|
|
145
|
+
/**
|
|
146
|
+
* Engine alias(es). Can be a single string or an array of strings.
|
|
147
|
+
* Useful for registering shorthand names (e.g., 'g' for 'Google').
|
|
148
|
+
*/
|
|
149
|
+
static alias?: string | string[];
|
|
150
|
+
/**
|
|
151
|
+
* Registers a search engine class.
|
|
152
|
+
*
|
|
153
|
+
* @param ctor - The search engine class to register.
|
|
154
|
+
* @param options - Registration options. If a string is provided, it is used as the registered name.
|
|
155
|
+
* @returns `true` if registration was successful.
|
|
156
|
+
*/
|
|
157
|
+
static register: (ctor: typeof WebSearcher, options?: IBaseFactoryOptions | string) => boolean;
|
|
158
|
+
/**
|
|
159
|
+
* Unregisters a search engine.
|
|
160
|
+
*
|
|
161
|
+
* @param name - The name or class to unregister.
|
|
162
|
+
*/
|
|
163
|
+
static unregister: (name?: string | typeof WebSearcher) => void;
|
|
164
|
+
/**
|
|
165
|
+
* Retrieves a registered search engine class by name.
|
|
166
|
+
*
|
|
167
|
+
* @param name - The name of the engine (e.g., 'Google').
|
|
168
|
+
* @returns The search engine class constructor.
|
|
169
|
+
*/
|
|
170
|
+
static get: (name: string) => typeof WebSearcher;
|
|
171
|
+
/**
|
|
172
|
+
* Creates an instance of the registered search engine.
|
|
173
|
+
*
|
|
174
|
+
* @param name - The name of the engine.
|
|
175
|
+
* @param args - Arguments to pass to the constructor.
|
|
176
|
+
* @returns An instance of the search engine.
|
|
177
|
+
*/
|
|
178
|
+
static createObject: (name: string, ...args: any[]) => WebSearcher;
|
|
179
|
+
/**
|
|
180
|
+
* Iterates over all registered engines.
|
|
181
|
+
*
|
|
182
|
+
* @param cb - Callback function to invoke for each registered engine.
|
|
183
|
+
*/
|
|
184
|
+
static forEach: (cb: (ctor: typeof WebSearcher, name: string) => void) => void;
|
|
185
|
+
/**
|
|
186
|
+
* Sets aliases for a registered engine.
|
|
187
|
+
*
|
|
188
|
+
* @param ctor - The search engine class.
|
|
189
|
+
* @param aliases - Aliases to add.
|
|
190
|
+
*/
|
|
191
|
+
static setAliases: (ctor: typeof WebSearcher, ...aliases: string[]) => void;
|
|
192
|
+
/**
|
|
193
|
+
* Static helper to execute a one-off search.
|
|
194
|
+
*
|
|
195
|
+
* It creates an instance of the specified engine, executes the search, and then
|
|
196
|
+
* automatically disposes of the session.
|
|
197
|
+
*
|
|
198
|
+
* @param engineName - The name of the engine to use (e.g., 'Google').
|
|
199
|
+
* @param query - The search query string.
|
|
200
|
+
* @param options - Combined search options and fetcher options.
|
|
201
|
+
* @returns A promise resolving to an array of standardized search results.
|
|
202
|
+
*/
|
|
203
|
+
static search(engineName: string, query: string, options?: SearchOptions & FetcherOptions): Promise<StandardSearchResult[]>;
|
|
204
|
+
/**
|
|
205
|
+
* The declarative template for the fetch options.
|
|
206
|
+
*
|
|
207
|
+
* Subclasses **must** implement this getter to provide the engine configuration,
|
|
208
|
+
* including the base URL, search parameters pattern, and extraction rules.
|
|
209
|
+
*
|
|
210
|
+
* Supports variable injection using syntax like `${query}`, `${offset}`, etc.
|
|
211
|
+
*
|
|
212
|
+
* @example
|
|
213
|
+
* ```typescript
|
|
214
|
+
* get template() {
|
|
215
|
+
* return {
|
|
216
|
+
* url: 'https://example.com/search?q=${query}',
|
|
217
|
+
* actions: [ ... ]
|
|
218
|
+
* };
|
|
219
|
+
* }
|
|
220
|
+
* ```
|
|
221
|
+
*/
|
|
222
|
+
abstract get template(): FetcherOptions;
|
|
223
|
+
/**
|
|
224
|
+
* Optional pagination configuration.
|
|
225
|
+
* Defines how the searcher navigates to subsequent pages.
|
|
226
|
+
*
|
|
227
|
+
* If undefined, the searcher will only fetch the first page.
|
|
228
|
+
*/
|
|
229
|
+
get pagination(): PaginationConfig | undefined;
|
|
230
|
+
protected createContext(options?: FetcherOptions): _isdk_web_fetcher.FetchContext;
|
|
231
|
+
/**
|
|
232
|
+
* Executes a search query.
|
|
233
|
+
*
|
|
234
|
+
* This method handles the pagination loop, variable injection, fetching,
|
|
235
|
+
* and result transformation.
|
|
236
|
+
*
|
|
237
|
+
* @param query - The search query string.
|
|
238
|
+
* @param options - Optional search parameters (e.g., limit, timeRange).
|
|
239
|
+
* @returns A promise resolving to an array of standardized search results.
|
|
240
|
+
*/
|
|
241
|
+
search(query: string, options?: SearchOptions): Promise<StandardSearchResult[]>;
|
|
242
|
+
/**
|
|
243
|
+
* Transform and clean the raw extracted results.
|
|
244
|
+
*
|
|
245
|
+
* Subclasses should override this method to provide engine-specific cleaning,
|
|
246
|
+
* normalization, or post-processing of the data extracted by the fetcher.
|
|
247
|
+
*
|
|
248
|
+
* @param outputs - The complete outputs object from the fetch actions.
|
|
249
|
+
* @param context - The search context (query, page, etc.).
|
|
250
|
+
* @returns A promise resolving to an array of standardized search results.
|
|
251
|
+
*/
|
|
252
|
+
protected transform(outputs: Record<string, any>, context: SearchContext): Promise<StandardSearchResult[]>;
|
|
253
|
+
/**
|
|
254
|
+
* Transforms standard options into engine-specific template variables.
|
|
255
|
+
*
|
|
256
|
+
* Subclasses should override this to map standard options like 'timeRange',
|
|
257
|
+
* 'category', 'region' into the specific URL parameters required by the engine
|
|
258
|
+
* (e.g., mapping `timeRange: 'day'` to `tbs: 'qdr:d'` for Google).
|
|
259
|
+
*
|
|
260
|
+
* @param options - The search options provided by the user.
|
|
261
|
+
* @returns A dictionary of variables to be injected into the template.
|
|
262
|
+
*/
|
|
263
|
+
protected formatOptions(options: SearchOptions): Record<string, any>;
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
/**
|
|
267
|
+
* A sample implementation of a Google Search scraper.
|
|
268
|
+
*
|
|
269
|
+
* @remarks
|
|
270
|
+
* **⚠️ DEMO ONLY ⚠️**
|
|
271
|
+
*
|
|
272
|
+
* This class serves as a **reference implementation** to demonstrate how to extend
|
|
273
|
+
* the `WebSearcher` base class. It is **NOT intended for production use**.
|
|
274
|
+
*
|
|
275
|
+
* Google frequently changes its HTML structure and employs sophisticated anti-bot measures.
|
|
276
|
+
* A production-grade Google scraper would require robust proxy rotation, CAPTCHA solving,
|
|
277
|
+
* and constant maintenance of selectors, or usage of an official API.
|
|
278
|
+
*
|
|
279
|
+
* Use this class to understand:
|
|
280
|
+
* 1. How to define a fetch template with variable injection.
|
|
281
|
+
* 2. How to map standard options (like time range) to engine-specific URL parameters.
|
|
282
|
+
* 3. How to handle pagination.
|
|
283
|
+
* 4. How to transform and clean raw extracted data.
|
|
284
|
+
*/
|
|
285
|
+
declare class GoogleSearcher extends WebSearcher {
|
|
286
|
+
static alias: string[];
|
|
287
|
+
/**
|
|
288
|
+
* Defines the fetch template for Google Search.
|
|
289
|
+
*
|
|
290
|
+
* @returns The fetcher configuration including the URL pattern and extraction rules.
|
|
291
|
+
*/
|
|
292
|
+
get template(): FetcherOptions;
|
|
293
|
+
/**
|
|
294
|
+
* Configures pagination for Google Search results.
|
|
295
|
+
* Uses the 'start' URL parameter, incrementing by 10 for each page.
|
|
296
|
+
*/
|
|
297
|
+
get pagination(): PaginationConfig;
|
|
298
|
+
/**
|
|
299
|
+
* Maps standard `SearchOptions` to Google's specific URL parameters.
|
|
300
|
+
*
|
|
301
|
+
* - `timeRange` -> `tbs` (e.g., 'qdr:d' for day)
|
|
302
|
+
* - `category` -> `tbm` (e.g., 'isch' for images)
|
|
303
|
+
* - `region` -> `gl`
|
|
304
|
+
* - `language` -> `hl`
|
|
305
|
+
* - `safeSearch` -> `safe`
|
|
306
|
+
*
|
|
307
|
+
* @param options - The user-provided search options.
|
|
308
|
+
* @returns A map of variables to inject into the URL template.
|
|
309
|
+
*/
|
|
310
|
+
protected formatOptions(options: SearchOptions): Record<string, any>;
|
|
311
|
+
/**
|
|
312
|
+
* Cleans and normalizes the extracted results.
|
|
313
|
+
* Specifically, it unwraps Google's redirect URLs (starting with `/url?q=`).
|
|
314
|
+
*
|
|
315
|
+
* @param outputs - The raw outputs from the fetcher.
|
|
316
|
+
* @returns An array of cleaned search results.
|
|
317
|
+
*/
|
|
318
|
+
protected transform(outputs: Record<string, any>): Promise<any[]>;
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
export { type CustomTimeRange, GoogleSearcher, type PaginationConfig, type SafeSearchLevel, type SearchCategory, type SearchContext, type SearchOptions, type SearchTimeRange, type SearchTimeRangePreset, type SearcherConstructor, type StandardSearchResult, WebSearcher };
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,321 @@
|
|
|
1
|
+
import * as _isdk_web_fetcher from '@isdk/web-fetcher';
|
|
2
|
+
import { FetcherOptions, FetchSession } from '@isdk/web-fetcher';
|
|
3
|
+
import { IBaseFactoryOptions } from 'custom-factory';
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Interface representing a standardized search result item.
|
|
7
|
+
* This ensures consistency across different search engines.
|
|
8
|
+
*/
|
|
9
|
+
interface StandardSearchResult {
|
|
10
|
+
/** The title of the search result. */
|
|
11
|
+
title: string;
|
|
12
|
+
/** The URL of the search result. */
|
|
13
|
+
url: string;
|
|
14
|
+
/** A brief snippet or description of the result. */
|
|
15
|
+
snippet?: string;
|
|
16
|
+
/** An optional image URL associated with the result. */
|
|
17
|
+
image?: string;
|
|
18
|
+
/** Allows for engine-specific extra fields (e.g., rank, author, date). */
|
|
19
|
+
[key: string]: any;
|
|
20
|
+
}
|
|
21
|
+
/**
|
|
22
|
+
* Configuration for pagination strategies.
|
|
23
|
+
* Defines how the searcher should navigate to the next page of results.
|
|
24
|
+
*/
|
|
25
|
+
interface PaginationConfig {
|
|
26
|
+
/**
|
|
27
|
+
* The type of pagination mechanism:
|
|
28
|
+
* - 'url-param': Pagination is handled by modifying URL parameters (e.g., `?page=2` or `?start=10`).
|
|
29
|
+
* - 'click-next': Pagination is handled by clicking a "Next" button on the page (only works in 'browser' mode).
|
|
30
|
+
*/
|
|
31
|
+
type: 'url-param' | 'click-next';
|
|
32
|
+
/**
|
|
33
|
+
* The name of the URL parameter used for pagination.
|
|
34
|
+
* Required if type is 'url-param'.
|
|
35
|
+
* @example 'start' for Google, 'page' or 'p' for others.
|
|
36
|
+
*/
|
|
37
|
+
paramName?: string;
|
|
38
|
+
/**
|
|
39
|
+
* The starting value for the pagination parameter.
|
|
40
|
+
* @default 0
|
|
41
|
+
*/
|
|
42
|
+
startValue?: number;
|
|
43
|
+
/**
|
|
44
|
+
* The increment step for each page.
|
|
45
|
+
* - If the parameter represents an item offset (like Google's 'start'), this might be 10.
|
|
46
|
+
* - If the parameter represents a page number, this is usually 1.
|
|
47
|
+
* @default 1
|
|
48
|
+
*/
|
|
49
|
+
increment?: number;
|
|
50
|
+
/**
|
|
51
|
+
* The CSS selector for the "Next" page button.
|
|
52
|
+
* Required if type is 'click-next'.
|
|
53
|
+
*/
|
|
54
|
+
nextButtonSelector?: string;
|
|
55
|
+
}
|
|
56
|
+
/**
|
|
57
|
+
* Context object passed to the transform function.
|
|
58
|
+
*/
|
|
59
|
+
interface SearchContext {
|
|
60
|
+
/** The original search query. */
|
|
61
|
+
query: string;
|
|
62
|
+
/** The current page index (0-based). */
|
|
63
|
+
page: number;
|
|
64
|
+
/** The requested limit of results. */
|
|
65
|
+
limit?: number;
|
|
66
|
+
}
|
|
67
|
+
type SearchTimeRangePreset = 'all' | 'day' | 'week' | 'month' | 'year';
|
|
68
|
+
interface CustomTimeRange {
|
|
69
|
+
/** Start date (Date object or string like 'YYYY-MM-DD'). */
|
|
70
|
+
from: Date | string;
|
|
71
|
+
/** End date (Date object or string like 'YYYY-MM-DD'). Defaults to current date if omitted. */
|
|
72
|
+
to?: Date | string;
|
|
73
|
+
}
|
|
74
|
+
type SearchTimeRange = SearchTimeRangePreset | CustomTimeRange;
|
|
75
|
+
type SearchCategory = 'all' | 'images' | 'videos' | 'news';
|
|
76
|
+
type SafeSearchLevel = 'off' | 'moderate' | 'strict';
|
|
77
|
+
/**
|
|
78
|
+
* Options provided when executing a search.
|
|
79
|
+
*/
|
|
80
|
+
interface SearchOptions {
|
|
81
|
+
/** The maximum number of results to retrieve. Defaults to 10 when omitted. */
|
|
82
|
+
limit?: number;
|
|
83
|
+
/**
|
|
84
|
+
* Date range for the search results.
|
|
85
|
+
* Default: 'all'
|
|
86
|
+
*/
|
|
87
|
+
timeRange?: SearchTimeRange;
|
|
88
|
+
/**
|
|
89
|
+
* The category of results to return.
|
|
90
|
+
* Default: 'all' (web search)
|
|
91
|
+
*/
|
|
92
|
+
category?: SearchCategory;
|
|
93
|
+
/**
|
|
94
|
+
* Region code (ISO 3166-1 alpha-2) to bias results (e.g., 'US', 'CN', 'JP').
|
|
95
|
+
*/
|
|
96
|
+
region?: string;
|
|
97
|
+
/**
|
|
98
|
+
* Language code for the interface or results: an ISO 639-1 code, optionally followed by a region subtag (e.g., 'en', 'zh-CN').
|
|
99
|
+
*/
|
|
100
|
+
language?: string;
|
|
101
|
+
/**
|
|
102
|
+
* Safe search filtering level.
|
|
103
|
+
* Default: engine dependent (usually 'moderate' or 'strict').
|
|
104
|
+
*/
|
|
105
|
+
safeSearch?: SafeSearchLevel;
|
|
106
|
+
/**
|
|
107
|
+
* A custom transform function to filter or modify results at runtime.
|
|
108
|
+
* This runs AFTER the engine-level transform.
|
|
109
|
+
*/
|
|
110
|
+
transform?: (results: StandardSearchResult[], context: SearchContext) => Promise<StandardSearchResult[]> | StandardSearchResult[];
|
|
111
|
+
/** Any other custom variables to be injected into the template. */
|
|
112
|
+
[key: string]: any;
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
/**
|
|
116
|
+
* Constructor definition for Searcher subclasses.
|
|
117
|
+
*/
|
|
118
|
+
type SearcherConstructor = new (options?: FetcherOptions) => WebSearcher;
|
|
119
|
+
/**
|
|
120
|
+
* The abstract base class for all search engines.
|
|
121
|
+
*
|
|
122
|
+
* It extends `FetchSession`, meaning each `WebSearcher` instance is an active session
|
|
123
|
+
* capable of maintaining state (e.g., cookies, local storage) across multiple search queries.
|
|
124
|
+
*
|
|
125
|
+
* Developers should extend this class to create specific search engine implementations
|
|
126
|
+
* (e.g., Google, Bing, DuckDuckGo).
|
|
127
|
+
*
|
|
128
|
+
* @example
|
|
129
|
+
* ```typescript
|
|
130
|
+
* class MySearcher extends WebSearcher {
|
|
131
|
+
* get template() {
|
|
132
|
+
* return { url: '...' };
|
|
133
|
+
* }
|
|
134
|
+
* }
|
|
135
|
+
* WebSearcher.register(MySearcher);
|
|
136
|
+
* ```
|
|
137
|
+
*/
|
|
138
|
+
declare abstract class WebSearcher extends FetchSession {
|
|
139
|
+
static _isFactory: boolean;
|
|
140
|
+
/**
|
|
141
|
+
* Custom engine name. If not provided, it is derived from the class name.
|
|
142
|
+
* For example, `GoogleSearcher` becomes `Google`.
|
|
143
|
+
*/
|
|
144
|
+
static name?: string;
|
|
145
|
+
/**
|
|
146
|
+
* Engine alias(es). Can be a single string or an array of strings.
|
|
147
|
+
* Useful for registering shorthand names (e.g., 'g' for 'Google').
|
|
148
|
+
*/
|
|
149
|
+
static alias?: string | string[];
|
|
150
|
+
/**
|
|
151
|
+
* Registers a search engine class.
|
|
152
|
+
*
|
|
153
|
+
* @param ctor - The search engine class to register.
|
|
154
|
+
* @param options - Registration options. If a string is provided, it is used as the registered name.
|
|
155
|
+
* @returns `true` if registration was successful.
|
|
156
|
+
*/
|
|
157
|
+
static register: (ctor: typeof WebSearcher, options?: IBaseFactoryOptions | string) => boolean;
|
|
158
|
+
/**
|
|
159
|
+
* Unregisters a search engine.
|
|
160
|
+
*
|
|
161
|
+
* @param name - The name or class to unregister.
|
|
162
|
+
*/
|
|
163
|
+
static unregister: (name?: string | typeof WebSearcher) => void;
|
|
164
|
+
/**
|
|
165
|
+
* Retrieves a registered search engine class by name.
|
|
166
|
+
*
|
|
167
|
+
* @param name - The name of the engine (e.g., 'Google').
|
|
168
|
+
* @returns The search engine class constructor.
|
|
169
|
+
*/
|
|
170
|
+
static get: (name: string) => typeof WebSearcher;
|
|
171
|
+
/**
|
|
172
|
+
* Creates an instance of the registered search engine.
|
|
173
|
+
*
|
|
174
|
+
* @param name - The name of the engine.
|
|
175
|
+
* @param args - Arguments to pass to the constructor.
|
|
176
|
+
* @returns An instance of the search engine.
|
|
177
|
+
*/
|
|
178
|
+
static createObject: (name: string, ...args: any[]) => WebSearcher;
|
|
179
|
+
/**
|
|
180
|
+
* Iterates over all registered engines.
|
|
181
|
+
*
|
|
182
|
+
* @param cb - Callback function to invoke for each registered engine.
|
|
183
|
+
*/
|
|
184
|
+
static forEach: (cb: (ctor: typeof WebSearcher, name: string) => void) => void;
|
|
185
|
+
/**
|
|
186
|
+
* Sets aliases for a registered engine.
|
|
187
|
+
*
|
|
188
|
+
* @param ctor - The search engine class.
|
|
189
|
+
* @param aliases - Aliases to add.
|
|
190
|
+
*/
|
|
191
|
+
static setAliases: (ctor: typeof WebSearcher, ...aliases: string[]) => void;
|
|
192
|
+
/**
|
|
193
|
+
* Static helper to execute a one-off search.
|
|
194
|
+
*
|
|
195
|
+
* It creates an instance of the specified engine, executes the search, and then
|
|
196
|
+
* automatically disposes of the session.
|
|
197
|
+
*
|
|
198
|
+
* @param engineName - The name of the engine to use (e.g., 'Google').
|
|
199
|
+
* @param query - The search query string.
|
|
200
|
+
* @param options - Combined search options and fetcher options.
|
|
201
|
+
* @returns A promise resolving to an array of standardized search results.
|
|
202
|
+
*/
|
|
203
|
+
static search(engineName: string, query: string, options?: SearchOptions & FetcherOptions): Promise<StandardSearchResult[]>;
|
|
204
|
+
/**
|
|
205
|
+
* The declarative template for the fetch options.
|
|
206
|
+
*
|
|
207
|
+
* Subclasses **must** implement this getter to provide the engine configuration,
|
|
208
|
+
* including the base URL, search parameters pattern, and extraction rules.
|
|
209
|
+
*
|
|
210
|
+
* Supports variable injection using syntax like `${query}`, `${offset}`, etc.
|
|
211
|
+
*
|
|
212
|
+
* @example
|
|
213
|
+
* ```typescript
|
|
214
|
+
* get template() {
|
|
215
|
+
* return {
|
|
216
|
+
* url: 'https://example.com/search?q=${query}',
|
|
217
|
+
* actions: [ ... ]
|
|
218
|
+
* };
|
|
219
|
+
* }
|
|
220
|
+
* ```
|
|
221
|
+
*/
|
|
222
|
+
abstract get template(): FetcherOptions;
|
|
223
|
+
/**
|
|
224
|
+
* Optional pagination configuration.
|
|
225
|
+
* Defines how the searcher navigates to subsequent pages.
|
|
226
|
+
*
|
|
227
|
+
* If undefined, the searcher will only fetch the first page.
|
|
228
|
+
*/
|
|
229
|
+
get pagination(): PaginationConfig | undefined;
|
|
230
|
+
protected createContext(options?: FetcherOptions): _isdk_web_fetcher.FetchContext;
|
|
231
|
+
/**
|
|
232
|
+
* Executes a search query.
|
|
233
|
+
*
|
|
234
|
+
* This method handles the pagination loop, variable injection, fetching,
|
|
235
|
+
* and result transformation.
|
|
236
|
+
*
|
|
237
|
+
* @param query - The search query string.
|
|
238
|
+
* @param options - Optional search parameters (e.g., limit, timeRange).
|
|
239
|
+
* @returns A promise resolving to an array of standardized search results.
|
|
240
|
+
*/
|
|
241
|
+
search(query: string, options?: SearchOptions): Promise<StandardSearchResult[]>;
|
|
242
|
+
/**
|
|
243
|
+
* Transform and clean the raw extracted results.
|
|
244
|
+
*
|
|
245
|
+
* Subclasses should override this method to provide engine-specific cleaning,
|
|
246
|
+
* normalization, or post-processing of the data extracted by the fetcher.
|
|
247
|
+
*
|
|
248
|
+
* @param outputs - The complete outputs object from the fetch actions.
|
|
249
|
+
* @param context - The search context (query, page, etc.).
|
|
250
|
+
* @returns A promise resolving to an array of standardized search results.
|
|
251
|
+
*/
|
|
252
|
+
protected transform(outputs: Record<string, any>, context: SearchContext): Promise<StandardSearchResult[]>;
|
|
253
|
+
/**
|
|
254
|
+
* Transforms standard options into engine-specific template variables.
|
|
255
|
+
*
|
|
256
|
+
* Subclasses should override this to map standard options like 'timeRange',
|
|
257
|
+
* 'category', 'region' into the specific URL parameters required by the engine
|
|
258
|
+
* (e.g., mapping `timeRange: 'day'` to `tbs: 'qdr:d'` for Google).
|
|
259
|
+
*
|
|
260
|
+
* @param options - The search options provided by the user.
|
|
261
|
+
* @returns A dictionary of variables to be injected into the template.
|
|
262
|
+
*/
|
|
263
|
+
protected formatOptions(options: SearchOptions): Record<string, any>;
|
|
264
|
+
}
|
|
265
|
+
|
|
266
|
+
/**
|
|
267
|
+
* A sample implementation of a Google Search scraper.
|
|
268
|
+
*
|
|
269
|
+
* @remarks
|
|
270
|
+
* **⚠️ DEMO ONLY ⚠️**
|
|
271
|
+
*
|
|
272
|
+
* This class serves as a **reference implementation** to demonstrate how to extend
|
|
273
|
+
* the `WebSearcher` base class. It is **NOT intended for production use**.
|
|
274
|
+
*
|
|
275
|
+
* Google frequently changes its HTML structure and employs sophisticated anti-bot measures.
|
|
276
|
+
* A production-grade Google scraper would require robust proxy rotation, CAPTCHA solving,
|
|
277
|
+
* and constant maintenance of selectors, or usage of an official API.
|
|
278
|
+
*
|
|
279
|
+
* Use this class to understand:
|
|
280
|
+
* 1. How to define a fetch template with variable injection.
|
|
281
|
+
* 2. How to map standard options (like time range) to engine-specific URL parameters.
|
|
282
|
+
* 3. How to handle pagination.
|
|
283
|
+
* 4. How to transform and clean raw extracted data.
|
|
284
|
+
*/
|
|
285
|
+
declare class GoogleSearcher extends WebSearcher {
|
|
286
|
+
static alias: string[];
|
|
287
|
+
/**
|
|
288
|
+
* Defines the fetch template for Google Search.
|
|
289
|
+
*
|
|
290
|
+
* @returns The fetcher configuration including the URL pattern and extraction rules.
|
|
291
|
+
*/
|
|
292
|
+
get template(): FetcherOptions;
|
|
293
|
+
/**
|
|
294
|
+
* Configures pagination for Google Search results.
|
|
295
|
+
* Uses the 'start' URL parameter, incrementing by 10 for each page.
|
|
296
|
+
*/
|
|
297
|
+
get pagination(): PaginationConfig;
|
|
298
|
+
/**
|
|
299
|
+
* Maps standard `SearchOptions` to Google's specific URL parameters.
|
|
300
|
+
*
|
|
301
|
+
* - `timeRange` -> `tbs` (e.g., 'qdr:d' for day)
|
|
302
|
+
* - `category` -> `tbm` (e.g., 'isch' for images)
|
|
303
|
+
* - `region` -> `gl`
|
|
304
|
+
* - `language` -> `hl`
|
|
305
|
+
* - `safeSearch` -> `safe`
|
|
306
|
+
*
|
|
307
|
+
* @param options - The user-provided search options.
|
|
308
|
+
* @returns A map of variables to inject into the URL template.
|
|
309
|
+
*/
|
|
310
|
+
protected formatOptions(options: SearchOptions): Record<string, any>;
|
|
311
|
+
/**
|
|
312
|
+
* Cleans and normalizes the extracted results.
|
|
313
|
+
* Specifically, it unwraps Google's redirect URLs (starting with `/url?q=`).
|
|
314
|
+
*
|
|
315
|
+
* @param outputs - The raw outputs from the fetcher.
|
|
316
|
+
* @returns An array of cleaned search results.
|
|
317
|
+
*/
|
|
318
|
+
protected transform(outputs: Record<string, any>): Promise<any[]>;
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
export { type CustomTimeRange, GoogleSearcher, type PaginationConfig, type SafeSearchLevel, type SearchCategory, type SearchContext, type SearchOptions, type SearchTimeRange, type SearchTimeRangePreset, type SearcherConstructor, type StandardSearchResult, WebSearcher };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"use strict";

/*
 * CommonJS entry point of @isdk/web-searcher, written out in readable form.
 *
 * Public surface: `WebSearcher` (abstract base searcher) and `GoogleSearcher`
 * (demo engine). Everything else in this file is module-private.
 */

// The class bindings are declared with `var` and exported through getters that
// are installed BEFORE the classes are defined. A module that requires this
// file during a circular load therefore sees `undefined` (not a thrown
// ReferenceError) until the definitions below have run.
var WebSearcher;
var GoogleSearcher;

const exported = Object.defineProperty({}, "__esModule", { value: true });
Object.defineProperty(exported, "GoogleSearcher", {
  enumerable: true,
  get: () => GoogleSearcher,
});
Object.defineProperty(exported, "WebSearcher", {
  enumerable: true,
  get: () => WebSearcher,
});
module.exports = exported;

const { FetchSession } = require("@isdk/web-fetcher");
const { addBaseFactoryAbility } = require("custom-factory");
// NOTE(review): lodash-es ships ES modules; confirm that require() of it works
// on every Node version this build is meant to support.
const { isPlainObject, defaultsDeep } = require("lodash-es");

/** Highest 0-based page index `WebSearcher#search` will request (11 pages in total). */
const MAX_PAGE_INDEX = 10;

/** Number of results collected when `options.limit` is missing or falsy. */
const DEFAULT_LIMIT = 10;

/** Matches the date-only form `YYYY-MM-DD`. */
const DATE_ONLY_PATTERN = /^(\d{4})-(\d{2})-(\d{2})$/;

/**
 * Recursively substitutes `${name}` placeholders inside a template value.
 *
 * - Strings: every `${name}` is replaced by `String(variables[name])`; a
 *   placeholder whose variable is `undefined` is replaced by the empty string.
 *   Values are inserted verbatim (no URL encoding is applied).
 * - Arrays and plain objects: rebuilt with each element / own property
 *   interpolated.
 * - Anything else (numbers, booleans, functions, class instances, ...) is
 *   returned unchanged.
 *
 * @param value - The template value to process.
 * @param variables - Map from placeholder name to replacement value.
 * @returns A new value with placeholders resolved; the input is not mutated.
 */
function interpolate(value, variables) {
  if (typeof value === "string") {
    return value.replace(/\$\{(.*?)\}/g, (_match, rawName) => {
      const replacement = variables[rawName.trim()];
      return replacement !== undefined ? String(replacement) : "";
    });
  }
  if (Array.isArray(value)) {
    return value.map((item) => interpolate(item, variables));
  }
  if (isPlainObject(value)) {
    const resolved = {};
    for (const key in value) {
      if (Object.prototype.hasOwnProperty.call(value, key)) {
        resolved[key] = interpolate(value[key], variables);
      }
    }
    return resolved;
  }
  return value;
}

/**
 * Converts a time-range boundary into a `Date` whose LOCAL calendar fields
 * match what the caller wrote.
 *
 * `new Date('YYYY-MM-DD')` is parsed as UTC midnight, so reading it back with
 * the local getters yields the previous day in time zones west of UTC. Date-only
 * strings are therefore built from their components in local time instead.
 * Any other input (Date objects, full timestamps, ...) goes through
 * `new Date(value)` exactly as before.
 *
 * @param value - A `Date` or a date string.
 * @returns The parsed date, or an invalid `Date` when a date-only string does
 *   not name a real calendar day (e.g. month 13).
 */
function toLocalDate(value) {
  if (typeof value === "string") {
    const parts = DATE_ONLY_PATTERN.exec(value.trim());
    if (parts) {
      const year = Number(parts[1]);
      const monthIndex = Number(parts[2]) - 1;
      const day = Number(parts[3]);
      const local = new Date(year, monthIndex, day);
      // The Date constructor silently rolls over out-of-range components;
      // a round-trip check rejects those instead of shifting the range.
      const isRealDay =
        local.getFullYear() === year &&
        local.getMonth() === monthIndex &&
        local.getDate() === day;
      return isRealDay ? local : new Date(NaN);
    }
  }
  return new Date(value);
}

/** Formats a date as `M/D/YYYY` from its local calendar fields. */
function formatMonthDayYear(date) {
  return `${date.getMonth() + 1}/${date.getDate()}/${date.getFullYear()}`;
}

/**
 * Base class for search engines, built on `FetchSession`.
 *
 * Subclasses supply a `template` getter (fetch options containing `${...}`
 * placeholders) and may override `pagination`, `formatOptions` and
 * `transform`.
 *
 * The class expression is named on purpose: per the package typings the
 * registered engine name is derived from the class name, and an anonymous
 * class bound to a minified variable would report that variable's name.
 */
WebSearcher = class WebSearcher extends FetchSession {
  /**
   * Runs a one-off search: creates the named engine, searches, and always
   * disposes of the instance afterwards.
   *
   * @param engineName - Registered engine name or alias.
   * @param query - The search query.
   * @param options - Passed both to the engine constructor and to `search`.
   * @returns The collected results.
   * @throws Error when no engine is registered under `engineName`.
   */
  static async search(engineName, query, options = {}) {
    const searcher = this.createObject(engineName, options);
    if (!searcher) {
      throw new Error(`Search engine not found: ${engineName}`);
    }
    try {
      return await searcher.search(query, options);
    } finally {
      await searcher.dispose();
    }
  }

  /**
   * Pagination configuration. The base implementation returns `undefined`,
   * in which case `search` stops after the first page.
   */
  get pagination() {
    return undefined;
  }

  /**
   * Builds the fetch context from the engine template, using `options` as the
   * base and the template as defaults-merge source (template values win where
   * both define a key, per `defaultsDeep({}, template, options)`).
   *
   * The caller's `engine` overrides the template's only when the template
   * leaves the engine unset or set to "auto".
   *
   * @param options - Fetcher options; defaults to `this.options`.
   */
  createContext(options = this.options) {
    const template = this.template;
    const merged = defaultsDeep({}, template, options);
    const templateEngineIsOpen = !template.engine || template.engine === "auto";
    if (templateEngineIsOpen && options.engine) {
      merged.engine = options.engine;
    }
    return super.createContext(merged);
  }

  /**
   * Executes a search, following pagination until `limit` results are
   * collected, a page comes back empty, there is no pagination config, or the
   * page cap (`MAX_PAGE_INDEX`) is exceeded.
   *
   * Template variables available for `${...}` injection: every key of
   * `options`, every key returned by `formatOptions`, plus `query`, `page`
   * (page index + pagination start value), `offset`
   * (start value + page index * increment) and `limit`.
   *
   * @param query - The search query.
   * @param options - Search options; `limit` falls back to 10 when falsy.
   * @returns At most `limit` results.
   */
  async search(query, options = {}) {
    const limit = options.limit || DEFAULT_LIMIT;
    const collected = [];
    const pagination = this.pagination;
    const startValue = pagination?.startValue ?? 0;
    const increment = pagination?.increment ?? 1;
    let pageIndex = 0;

    while (collected.length < limit) {
      const engineVariables = this.formatOptions(options);
      const offset = startValue + pageIndex * increment;
      const variables = {
        ...options,
        ...engineVariables,
        query,
        page: pageIndex + startValue,
        offset,
        limit,
      };
      const resolvedTemplate = interpolate(this.template, variables);
      const request = defaultsDeep({}, resolvedTemplate, options);

      // Navigation step: the first page (and every page in 'url-param' mode)
      // is reached by URL; later pages in 'click-next' mode by clicking.
      const actions = [];
      if (pageIndex === 0 || pagination?.type === "url-param") {
        if (request.url) {
          actions.push({ id: "goto", params: { url: request.url } });
        }
      } else if (pagination?.type === "click-next" && pagination.nextButtonSelector) {
        actions.push({ id: "click", params: { selector: pagination.nextButtonSelector } });
        actions.push({ id: "waitFor", params: { networkIdle: true, ms: 500 } });
      }

      if (request.actions) {
        // When we already navigate by URL, drop the template's own "goto"
        // actions so the page is not loaded twice.
        const navigatesByUrl = actions.length > 0 && actions[0].id === "goto";
        const templateActions = request.actions.filter(
          (action) => !(navigatesByUrl && action.id === "goto")
        );
        actions.push(...templateActions);
      }

      const { outputs } = await this.executeAll(actions);
      const searchContext = { query, page: pageIndex, limit: options.limit };

      // Engine-level transform first, then the caller's optional transform.
      let pageResults = await this.transform(outputs, searchContext);
      if (options.transform) {
        pageResults = await options.transform(pageResults, searchContext);
      }
      if (!pageResults || pageResults.length === 0) {
        break;
      }

      collected.push(...pageResults);
      if (collected.length >= limit || !pagination) {
        break;
      }

      pageIndex++;
      if (pageIndex > MAX_PAGE_INDEX) {
        break;
      }
    }

    return collected.slice(0, limit);
  }

  /**
   * Engine-level result transform. The base implementation returns
   * `outputs.results`, or an empty array when that is falsy.
   *
   * @param outputs - Outputs object produced by the fetch actions.
   * @param context - Search context (`query`, `page`, `limit`); unused here.
   */
  async transform(outputs, context) {
    return outputs.results || [];
  }

  /**
   * Maps search options to template variables. The base implementation
   * returns a shallow copy of `options`.
   */
  formatOptions(options) {
    return { ...options };
  }
};

WebSearcher._isFactory = false;
addBaseFactoryAbility(WebSearcher);
WebSearcher.prototype.name = "Searcher";

/**
 * Demo Google engine: URL-parameter pagination (`start`, step 10) and a
 * browser-mode extraction template.
 */
GoogleSearcher = class GoogleSearcher extends WebSearcher {
  /** Fetch template; `${...}` placeholders are filled in by `search`. */
  get template() {
    return {
      engine: "browser",
      browser: { headless: false },
      // Plain string on purpose: the placeholders are resolved at search time
      // by `interpolate`, not by JavaScript template-literal evaluation.
      url: "https://www.google.com/search?q=${query}&start=${offset}&tbs=${tbs}&tbm=${tbm}&gl=${gl}&hl=${hl}&safe=${safe}",
      actions: [
        {
          id: "extract",
          storeAs: "results",
          params: {
            type: "array",
            selector: "#main #search",
            items: {
              url: { selector: "a:has(h3)", attribute: "href", required: true },
              title: { selector: "a:has(h3) h3", required: true, mode: "innerText" },
              snippet: { selector: "div[style*='-webkit-line-clamp']", type: "html" },
            },
          },
        },
      ],
    };
  }

  /** Pages are addressed through the `start` parameter: 0, 10, 20, ... */
  get pagination() {
    return { type: "url-param", paramName: "start", startValue: 0, increment: 10 };
  }

  /**
   * Maps standard search options to the variables used by the URL template:
   * `timeRange` -> `tbs`, `category` -> `tbm`, `region` -> `gl`,
   * `language` -> `hl`, `safeSearch` -> `safe`. Options without a mapping
   * (e.g. `timeRange: 'all'`, `category: 'all'`) produce no variable, so the
   * corresponding placeholder resolves to an empty string.
   *
   * @param options - The user-provided search options.
   * @returns Variables to inject into the URL template.
   */
  formatOptions(options) {
    const variables = {};

    if (options.timeRange) {
      if (typeof options.timeRange === "string") {
        const presetToTbs = { day: "qdr:d", week: "qdr:w", month: "qdr:m", year: "qdr:y" };
        if (presetToTbs[options.timeRange]) {
          variables.tbs = presetToTbs[options.timeRange];
        }
      } else {
        const from = toLocalDate(options.timeRange.from);
        // A missing end date means "until now".
        const to = options.timeRange.to ? toLocalDate(options.timeRange.to) : new Date();
        // Unparseable boundaries are ignored rather than sent as "NaN/NaN/NaN".
        if (!isNaN(from.getTime()) && !isNaN(to.getTime())) {
          variables.tbs = `cdr:1,cd_min:${formatMonthDayYear(from)},cd_max:${formatMonthDayYear(to)}`;
        }
      }
    }

    if (options.category) {
      const categoryToTbm = { images: "isch", videos: "vid", news: "nws" };
      if (categoryToTbm[options.category]) {
        variables.tbm = categoryToTbm[options.category];
      }
    }

    if (options.region) {
      variables.gl = options.region;
    }
    if (options.language) {
      variables.hl = options.language;
    }

    if (options.safeSearch) {
      // NOTE(review): 'off' is sent as safe=images and 'moderate' sends
      // nothing; confirm this is the intended mapping for Google.
      if (options.safeSearch === "strict") {
        variables.safe = "active";
      } else if (options.safeSearch === "off") {
        variables.safe = "images";
      }
    }

    return variables;
  }

  /**
   * Unwraps redirect links of the form `/url?q=<target>` into `<target>`.
   * Items are updated in place and returned in their original order.
   *
   * @param outputs - Raw outputs from the fetch actions.
   * @returns `outputs.results` with unwrapped URLs, or `[]` when it is not an array.
   */
  async transform(outputs) {
    const results = outputs.results || [];
    if (!Array.isArray(results)) {
      return [];
    }
    return results.map((item) => {
      if (item.url && item.url.startsWith("/url?q=")) {
        try {
          const target = new URL(item.url, "https://www.google.com").searchParams.get("q");
          if (target) {
            item.url = target;
          }
        } catch (error) {
          // Best effort: an unparseable redirect keeps its original URL.
        }
      }
      return item;
    });
  }
};

GoogleSearcher.alias = ["google"];
|
package/dist/index.mjs
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
import { FetchSession } from "@isdk/web-fetcher";
import { addBaseFactoryAbility } from "custom-factory";
import { defaultsDeep, isPlainObject } from "lodash-es";

/** Result limit used when `options.limit` is missing or falsy. */
const DEFAULT_LIMIT = 10;

/** The paging loop stops once the zero-based page index exceeds this value. */
const MAX_PAGE_INDEX = 10;

/** Google `tbs` values for the preset time ranges. */
const GOOGLE_TIME_RANGE_TBS = { day: "qdr:d", week: "qdr:w", month: "qdr:m", year: "qdr:y" };

/** Google `tbm` values for the supported search categories. */
const GOOGLE_CATEGORY_TBM = { images: "isch", videos: "vid", news: "nws" };

/**
 * Percent-encodes a value for use inside a URL query string.
 *
 * `encodeURIComponent` throws a `URIError` on malformed UTF-16 (lone
 * surrogates); in that case the text is returned unchanged so that a
 * malformed query does not abort the whole search.
 *
 * @param {string} text Raw value.
 * @returns {string} The encoded value, or `text` itself when it cannot be encoded.
 */
function encodeQueryValue(text) {
  try {
    return encodeURIComponent(text);
  } catch {
    return text;
  }
}

/**
 * Recursively replaces `${name}` placeholders in a template value.
 *
 * - strings: every `${name}` is replaced by `String(variables[name])`
 *   (the name is trimmed first); a placeholder whose variable is
 *   `undefined` becomes the empty string;
 * - arrays: each element is interpolated;
 * - plain objects: each own enumerable property is interpolated into a new object;
 * - anything else is returned as is.
 *
 * @param {*} template Template value to process.
 * @param {object} variables Placeholder values keyed by name.
 * @returns {*} The interpolated copy (or the original value for non-templates).
 */
function interpolate(template, variables) {
  if (typeof template === "string") {
    return template.replace(/\$\{(.*?)\}/g, (_match, key) => {
      const value = variables[key.trim()];
      return value !== undefined ? String(value) : "";
    });
  }
  if (Array.isArray(template)) {
    return template.map((item) => interpolate(item, variables));
  }
  if (isPlainObject(template)) {
    const result = {};
    for (const key of Object.keys(template)) {
      result[key] = interpolate(template[key], variables);
    }
    return result;
  }
  return template;
}

/**
 * Interpolates a URL template, percent-encoding the values that land in the
 * query string.
 *
 * Placeholders located after the first `?` of the template are encoded with
 * `encodeURIComponent`, so that characters such as `&`, `#`, `+`, `%` or `=`
 * inside a value (for example the query "AT&T" or "C++") cannot change the
 * structure of the URL. Placeholders before the `?` (origin or path) are
 * substituted verbatim. Values must therefore be passed raw, not pre-encoded.
 *
 * @param {string} urlTemplate URL containing `${name}` placeholders.
 * @param {object} variables Placeholder values keyed by name.
 * @returns {string} The resolved URL.
 */
function interpolateUrl(urlTemplate, variables) {
  const queryStart = urlTemplate.indexOf("?");
  return urlTemplate.replace(/\$\{(.*?)\}/g, (_match, key, offset) => {
    const value = variables[key.trim()];
    if (value === undefined) {
      return "";
    }
    const text = String(value);
    const insideQuery = queryStart !== -1 && offset > queryStart;
    return insideQuery ? encodeQueryValue(text) : text;
  });
}

/**
 * Base class for search engines built on top of `FetchSession`.
 *
 * A subclass supplies a `template` (fetcher options whose strings may contain
 * `${query}`, `${page}`, `${offset}`, `${limit}` and any key returned by
 * `formatOptions`), optionally a `pagination` description, and may override
 * `transform` / `formatOptions`.
 */
class WebSearcher extends FetchSession {
  /**
   * One-shot search: creates the engine registered under `engineName`, runs
   * the search and always disposes the engine afterwards.
   *
   * @param {string} engineName Name passed to `createObject`.
   * @param {string} query Search query.
   * @param {object} [options] Search options, also passed to `createObject`.
   * @returns {Promise<Array>} The results returned by the engine's `search`.
   * @throws {Error} When `createObject` returns nothing for `engineName`.
   */
  static async search(engineName, query, options = {}) {
    const searcher = this.createObject(engineName, options);
    if (!searcher) {
      throw new Error(`Search engine not found: ${engineName}`);
    }
    try {
      return await searcher.search(query, options);
    } finally {
      await searcher.dispose();
    }
  }

  /**
   * Pagination description (`type`, `startValue`, `increment`,
   * `nextButtonSelector`). The base class has none, which limits `search`
   * to a single page.
   */
  get pagination() {}

  /**
   * Builds the fetch context from the template with `options` filling the gaps.
   *
   * Template values win over `options`, except for `engine`: when the template
   * leaves the engine unset or set to "auto", an engine given in `options` is used.
   *
   * @param {object} [options] Defaults to `this.options`.
   * @returns {*} Whatever `FetchSession.createContext` returns.
   */
  createContext(options = this.options) {
    const template = this.template;
    const merged = defaultsDeep({}, template, options);
    const templateFixesEngine = template.engine && template.engine !== "auto";
    if (!templateFixesEngine && options.engine) {
      merged.engine = options.engine;
    }
    return super.createContext(merged);
  }

  /**
   * Runs the search, fetching further pages until `limit` results are
   * collected, a page yields no results, the engine has no pagination, or the
   * page index exceeds `MAX_PAGE_INDEX`.
   *
   * @param {string} query Raw (not URL-encoded) search query.
   * @param {object} [options] Search options; `limit` defaults to 10 and
   *   `transform`, when given, post-processes each page of results.
   * @returns {Promise<Array>} At most `limit` results.
   */
  async search(query, options = {}) {
    const limit = options.limit || DEFAULT_LIMIT;
    const collected = [];
    let pageIndex = 0;
    const startValue = this.pagination?.startValue ?? 0;
    const increment = this.pagination?.increment ?? 1;

    while (collected.length < limit) {
      const engineParams = this.formatOptions(options);
      const offset = startValue + pageIndex * increment;
      const variables = {
        ...options,
        ...engineParams,
        query,
        page: pageIndex + startValue,
        offset,
        limit,
      };

      const template = this.template;
      const resolved = interpolate(template, variables);
      // The request URL is resolved separately so that values placed in its
      // query string are percent-encoded; every other template field keeps
      // the raw values.
      if (isPlainObject(template) && typeof template.url === "string") {
        resolved.url = interpolateUrl(template.url, variables);
      }
      const config = defaultsDeep({}, resolved, options);

      const actions = [];
      const paginationType = this.pagination?.type;
      if (pageIndex === 0 || paginationType === "url-param") {
        // First page, or URL-driven paging: navigate to the resolved URL.
        if (config.url) {
          actions.push({ id: "goto", params: { url: config.url } });
        }
      } else if (paginationType === "click-next" && this.pagination.nextButtonSelector) {
        // Button-driven paging: click "next" and wait for the page to settle.
        actions.push({ id: "click", params: { selector: this.pagination.nextButtonSelector } });
        actions.push({ id: "waitFor", params: { networkIdle: true, ms: 500 } });
      }

      if (config.actions) {
        // Drop template "goto" actions when a navigation was already queued above.
        const startsWithGoto = actions.length > 0 && actions[0].id === "goto";
        actions.push(...config.actions.filter((action) => !(startsWithGoto && action.id === "goto")));
      }

      // NOTE(review): the result of this comparison is discarded in the shipped
      // bundle (it looks like the residue of a stripped warning). The reads are
      // kept in case `this.context` is not side-effect free — confirm and remove.
      if (config.engine && this.context.engine !== config.engine) {
        // intentionally empty
      }

      const { outputs } = await this.executeAll(actions);
      const context = { query, page: pageIndex, limit: options.limit };

      let pageResults = await this.transform(outputs, context);
      if (options.transform) {
        pageResults = await options.transform(pageResults, context);
      }
      if (!pageResults || pageResults.length === 0) {
        break;
      }

      collected.push(...pageResults);
      if (collected.length >= limit || !this.pagination) {
        break;
      }

      pageIndex++;
      if (pageIndex > MAX_PAGE_INDEX) {
        break;
      }
    }

    return collected.slice(0, limit);
  }

  /**
   * Converts the action outputs of one page into a result list.
   * The default returns `outputs.results`, or an empty array when it is falsy.
   *
   * @param {object} outputs Outputs produced by `executeAll`.
   * @param {object} context `{ query, page, limit }` of the current page.
   * @returns {Promise<Array>} Results of the page.
   */
  async transform(outputs, context) {
    return outputs.results || [];
  }

  /**
   * Maps the search options to template variables.
   * The default returns a shallow copy of `options`.
   *
   * @param {object} options Search options.
   * @returns {object} Extra template variables.
   */
  formatOptions(options) {
    return { ...options };
  }
}

WebSearcher._isFactory = false;
addBaseFactoryAbility(WebSearcher);
WebSearcher.prototype.name = "Searcher";

/**
 * Google search engine: loads the Google result page in a browser engine and
 * extracts `url`, `title` and `snippet` for each result.
 */
class GoogleSearcher extends WebSearcher {
  /** Fetcher template: browser engine, result-page URL and the extract action. */
  get template() {
    return {
      engine: "browser",
      browser: { headless: false },
      url: "https://www.google.com/search?q=${query}&start=${offset}&tbs=${tbs}&tbm=${tbm}&gl=${gl}&hl=${hl}&safe=${safe}",
      actions: [
        {
          id: "extract",
          storeAs: "results",
          params: {
            type: "array",
            selector: "#main #search",
            items: {
              url: { selector: "a:has(h3)", attribute: "href", required: true },
              title: { selector: "a:has(h3) h3", required: true, mode: "innerText" },
              snippet: { selector: "div[style*='-webkit-line-clamp']", type: "html" },
            },
          },
        },
      ],
    };
  }

  /** Google pages through the `start` URL parameter in steps of 10, beginning at 0. */
  get pagination() {
    return { type: "url-param", paramName: "start", startValue: 0, increment: 10 };
  }

  /**
   * Translates the generic search options into Google URL parameters
   * (`tbs`, `tbm`, `gl`, `hl`, `safe`). Unknown presets/categories and
   * unparsable custom dates are ignored.
   *
   * @param {object} options Search options (`timeRange`, `category`, `region`,
   *   `language`, `safeSearch`).
   * @returns {object} Only the Google parameters that apply.
   */
  formatOptions(options) {
    const params = {};

    const { timeRange } = options;
    if (timeRange) {
      if (typeof timeRange === "string") {
        const preset = GOOGLE_TIME_RANGE_TBS[timeRange];
        if (preset) {
          params.tbs = preset;
        }
      } else {
        // Custom range: `to` defaults to the current date.
        const from = new Date(timeRange.from);
        const to = timeRange.to ? new Date(timeRange.to) : new Date();
        if (!isNaN(from.getTime()) && !isNaN(to.getTime())) {
          // M/D/YYYY built from the local-time date parts.
          const formatDate = (date) => `${date.getMonth() + 1}/${date.getDate()}/${date.getFullYear()}`;
          params.tbs = `cdr:1,cd_min:${formatDate(from)},cd_max:${formatDate(to)}`;
        }
      }
    }

    if (options.category) {
      const tbm = GOOGLE_CATEGORY_TBM[options.category];
      if (tbm) {
        params.tbm = tbm;
      }
    }

    if (options.region) {
      params.gl = options.region;
    }
    if (options.language) {
      params.hl = options.language;
    }

    if (options.safeSearch) {
      if (options.safeSearch === "strict") {
        params.safe = "active";
      } else if (options.safeSearch === "off") {
        // NOTE(review): "off" is sent as `safe=images`, not `safe=off` —
        // confirm this mapping is intentional.
        params.safe = "images";
      }
    }

    return params;
  }

  /**
   * Normalizes the extracted results: Google redirect links of the form
   * `/url?q=<target>` are replaced by their target URL. Items are updated in
   * place; a non-array `results` output yields an empty list.
   *
   * @param {object} outputs Outputs produced by `executeAll`.
   * @returns {Promise<Array>} The extracted results.
   */
  async transform(outputs) {
    const results = outputs.results || [];
    if (!Array.isArray(results)) {
      return [];
    }
    return results.map((item) => {
      if (item.url && item.url.startsWith("/url?q=")) {
        try {
          const target = new URL(item.url, "https://www.google.com").searchParams.get("q");
          if (target) {
            item.url = target;
          }
        } catch (error) {
          // Best effort: keep the original link when it cannot be parsed.
        }
      }
      return item;
    });
  }
}

GoogleSearcher.alias = ["google"];

export { GoogleSearcher, WebSearcher };
|