@isdk/web-searcher 0.1.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.cn.md CHANGED
@@ -11,53 +11,57 @@ Search 模块提供了一个基于类的高级框架,用于构建搜索引擎
11
11
  - **数据清洗**: 解析原始 HTML 并处理重定向链接。
12
12
  - **灵活性**: 轻松切换 HTTP(快速)和 Browser(抗反爬)模式。
13
13
 
14
- 本模块将这些通用模式封装在一个可复用的 `Searcher` 类中。
14
+ 本模块将这些通用模式封装在一个可复用的 `WebSearcher` 类中。
15
15
 
16
16
  ## 🚀 快速开始
17
17
 
18
18
  ### 1. 一次性搜索 (One-off Search)
19
19
 
20
- 使用静态方法 `Searcher.search` 处理快速、用完即弃的任务。它会自动创建会话、抓取结果并进行清理。
20
+ > **⚠️ 关于 `GoogleSearcher` 的说明**:这些示例中使用的 `GoogleSearcher` 类仅作为**演示实现**用于教学目的。它不适用于生产环境。
21
+ >
22
+ > * 它缺乏大规模可靠抓取 Google 所需的高级反爬虫处理(如验证码破解、代理轮换)。
23
+ > * 由于 Google 频繁的 DOM 变更和 A/B 测试,提取的数据可能会出现**不准确或信息错位**的情况。
24
+
25
+ 使用静态方法 `WebSearcher.search` 处理快速、用完即弃的任务。它会自动创建会话、抓取结果并进行清理。
21
26
 
22
27
  ```typescript
23
- import { Searcher } from '@isdk/web-fetcher/search';
24
- import { GoogleSearcher } from '@isdk/web-fetcher/search/engines/google';
28
+ import { GoogleSearcher, WebSearcher } from '@isdk/web-fetcher';
25
29
 
26
30
  // 注册引擎 (只需执行一次)
27
- Searcher.register(GoogleSearcher);
31
+ WebSearcher.register(GoogleSearcher);
28
32
 
29
33
  // 搜索!
30
34
  // 'limit' 参数确保我们会自动翻页直到获取 20 条结果。
31
35
  // 注意:引擎名称区分大小写,且由类名自动提取(例如:'GoogleSearcher' -> 'Google')
32
- const results = await Searcher.search('Google', 'open source', { limit: 20 });
36
+ const results = await WebSearcher.search('Google', 'open source', { limit: 20 });
33
37
 
34
38
  console.log(results);
35
39
  ```
36
40
 
37
41
  ### 2. 有状态会话 (Stateful Session)
38
42
 
39
- 由于 `Searcher` 继承自 `FetchSession`,您可以实例化它以在多个请求之间保持 Cookie 和存储。这对于需要登录的搜索或通过模拟人类行为来避免反爬虫非常有用。
43
+ 由于 `WebSearcher` 继承自 `FetchSession`,您可以实例化它以在多个请求之间保持 Cookie 和存储。这对于需要登录的搜索或通过模拟人类行为来避免反爬虫非常有用。
44
+
45
+ ### 🛡️ 核心准则:模板即法律 (Template is Law)
40
46
 
41
- **配置优先级:**
42
- 创建会话时,选项按以下顺序合并:
43
- 1. **模板默认 (Template Default)**:在 Searcher 类中定义(结构化选项的优先级最高)。
44
- 2. **用户选项 (User Options)**:传递给构造函数的选项(可填充缺失的默认值,或在允许的情况下进行覆盖)。
47
+ 在 `WebSearcher` 子类中定义的 `template` 是权威的“蓝图”。
45
48
 
46
- *注:如果模板设置了 `engine: 'auto'`(默认值),则会尊重用户提供的 `engine` 选项。*
49
+ - **模板优先级**:如果模板定义了某个属性(如 `engine: 'browser'`、特定的 `headers` 等),该值将被**锁定**,用户选项无法覆盖。这确保了抓取逻辑的稳定性。
50
+ - **用户灵活性**:对于模板中**未**显式锁定的属性(如 `proxy`、`timeoutMs` 或自定义变量),用户可以在构造函数或 `search()` 方法中自由设置。
47
51
 
48
52
  ```typescript
49
53
  // 创建一个持久化会话
50
54
  const google = new GoogleSearcher({
51
- headless: false, // 覆盖默认选项 (例如显示浏览器)
55
+ headless: false, // 如果模板中未锁定,则可以覆盖
52
56
  proxy: 'http://my-proxy:8080',
53
- timeoutMs: 30000 // 为请求设置全局超时
57
+ timeoutMs: 30000 // 有效(假设 GoogleSearcher 模板未显式设置 timeoutMs)
54
58
  });
55
59
 
56
60
  try {
57
61
  // 第一次查询
58
62
  // 您还可以传递运行时选项来覆盖会话默认值或注入变量
59
63
  const results1 = await google.search('term A', {
60
- timeoutMs: 60000, // 仅针对此搜索覆盖超时时间
64
+ timeoutMs: 60000, // 针对此次搜索覆盖超时时间
61
65
  extraParam: 'value' // 可以在模板中通过 ${extraParam} 使用
62
66
  });
63
67
 
@@ -71,11 +75,11 @@ try {
71
75
 
72
76
  ## 🛠️ 实现一个新的搜索引擎
73
77
 
74
- 要支持一个新的网站,请创建一个继承自 `Searcher` 的类。
78
+ 要支持一个新的网站,请创建一个继承自 `WebSearcher` 的类。
75
79
 
76
80
  ### 步骤 1: 定义模板 (Template)
77
81
 
78
- 要支持一个新的网站,请创建一个继承自 `Searcher` 的类。引擎名称默认由类名自动提取(例如:`MyBlogSearcher` -> `MyBlog`),但您可以通过静态属性自定义名称和别名。
82
+ 要支持一个新的网站,请创建一个继承自 `WebSearcher` 的类。引擎名称默认由类名自动提取(例如:`MyBlogSearcher` -> `MyBlog`),但您可以通过静态属性自定义名称和别名。
79
83
 
80
84
  `template` 属性定义了搜索的“蓝图”。它是一个标准的 `FetcherOptions` 对象,但支持**变量注入**。
81
85
 
@@ -87,10 +91,10 @@ try {
87
91
  - `${limit}`: 请求的限制数量。
88
92
 
89
93
  ```typescript
90
- import { Searcher } from '@isdk/web-fetcher/search';
94
+ import { WebSearcher } from '@isdk/web-fetcher/search';
91
95
  import { FetcherOptions } from '@isdk/web-fetcher/types';
92
96
 
93
- export class MyBlogSearcher extends Searcher {
97
+ export class MyBlogSearcher extends WebSearcher {
94
98
  static name = 'blog'; // 自定义名称 (区分大小写)
95
99
  static alias = ['myblog', 'news'];
96
100
 
@@ -120,7 +124,7 @@ export class MyBlogSearcher extends Searcher {
120
124
 
121
125
  ### 步骤 2: 配置分页 (Pagination)
122
126
 
123
- 告诉 `Searcher` 如何导航到下一页。实现 `pagination` 获取器。
127
+ 告诉 `WebSearcher` 如何导航到下一页。实现 `pagination` 获取器。
124
128
 
125
129
  #### 方案 A: URL 参数 (Offset/Page)
126
130
 
@@ -152,7 +156,7 @@ protected override get pagination() {
152
156
 
153
157
  ### 步骤 3: 转换与清洗数据 (Transform)
154
158
 
155
- 重写 `transform` 以清洗数据。由于 `Searcher` 本身就是一个 `FetchSession`,您还可以使用 `this` 发起额外的请求(如解析重定向)。
159
+ 重写 `transform` 以清洗数据。由于 `WebSearcher` 本身就是一个 `FetchSession`,您还可以使用 `this` 发起额外的请求(如解析重定向)。
156
160
 
157
161
  ```typescript
158
162
  protected override async transform(outputs: Record<string, any>) {
@@ -169,22 +173,41 @@ protected override async transform(outputs: Record<string, any>) {
169
173
 
170
174
  ## 🧠 高级概念
171
175
 
172
- ### 自动分页与过滤
176
+ ### 自动分页:`limit` 与 `maxPages` 的关系
177
+
178
+ `WebSearcher` 的设计是以结果为导向的。当您调用 `search()` 时,您只需要指定想要多少条结果,搜索器会自动处理翻页逻辑。
179
+
180
+ - **`limit`**: 您期望获取的结果总数。
181
+ - **`maxPages`**: 安全阈值。它限制了搜索器为了满足 `limit` 而允许抓取的最大页数(翻页循环次数)。
182
+
183
+ **协作逻辑示例:**
184
+ 如果您请求 `{ limit: 50 }`,但每页只有 5 条结果:
173
185
 
174
- `Searcher` 是智能的。如果您请求 `limit: 10`,但第一页只返回了 5 条结果(或者如果您的 `transform` 过滤掉了一些结果),它会自动抓取下一页,直到满足限制。
186
+ 1. 搜索器抓取第 1 页(得到 5 条)。
187
+ 2. 发现 `5 < 50`,于是自动抓取第 2 页。
188
+ 3. 循环持续,直到获取 50 条结果 **或者** 达到了 `maxPages` 的限制(默认为 10 页)。
189
+
190
+ 这种机制可以防止因“下一页”选择器失效或引擎陷入死循环而导致的无限抓取,保护您的系统资源。
175
191
 
176
192
  ### 用户自定义转换 (User-defined Transforms)
177
193
 
178
194
  用户可以在调用 `search` 时提供自己的 `transform`。它会在引擎内置的转换**之后**运行。
179
195
 
196
+ 这在**过滤广告**或无关内容时非常强大。如果用户过滤掉了某些结果,自动分页逻辑会**自动启动**以抓取更多页面,确保最终返回给您的结果列表既满足 `limit` 数量要求,又只包含有效的条目。
197
+
180
198
  ```typescript
181
199
  await google.search('test', {
182
- transform: (results) => results.filter(r => r.url.endsWith('.pdf'))
200
+ limit: 20,
201
+ // 示例:过滤掉赞助商结果(广告)并只保留 PDF
202
+ transform: (results) => {
203
+ return results.filter(r => {
204
+ const isAd = r.isSponsored || r.url.includes('googleadservices.com');
205
+ return !isAd && r.url.endsWith('.pdf');
206
+ });
207
+ }
183
208
  });
184
209
  ```
185
210
 
186
- 如果用户过滤掉了结果,自动分页逻辑会启动以抓取更多页面来满足请求的 limit。
187
-
188
211
  ### 标准化搜索选项
189
212
 
190
213
  在调用 `search()` 时,您可以提供标准化的选项,搜索引擎会将其映射到特定的参数:
package/README.md CHANGED
@@ -11,53 +11,57 @@ Building a robust search scraper involves more than just fetching a URL. You oft
11
11
  - **Data Cleaning**: Parse raw HTML and resolve redirect links.
12
12
  - **Flexibility**: Switch between HTTP (fast) and Browser (anti-bot) modes easily.
13
13
 
14
- This module encapsulates these patterns into a reusable `Searcher` class.
14
+ This module encapsulates these patterns into a reusable `WebSearcher` class.
15
15
 
16
16
  ## 🚀 Quick Start
17
17
 
18
18
  ### 1. One-off Search
19
19
 
20
- Use the static `Searcher.search` method for quick, disposable tasks. It automatically creates a session, fetches results, and cleans up.
20
+ > **⚠️ Note on `GoogleSearcher`**: The `GoogleSearcher` class used in these examples is a **demo implementation** included for educational purposes. It is not intended for production use.
21
+ >
22
+ > * It lacks advanced anti-bot handling (CAPTCHA solving, proxy rotation) required for scraping Google reliably at scale.
23
+ > * The extracted data may be **inaccurate or misaligned** due to Google's frequent DOM changes and A/B testing.
24
+
25
+ Use the static `WebSearcher.search` method for quick, disposable tasks. It automatically creates a session, fetches results, and cleans up.
21
26
 
22
27
  ```typescript
23
- import { Searcher } from '@isdk/web-fetcher/search';
24
- import { GoogleSearcher } from '@isdk/web-fetcher/search/engines/google';
28
+ import { GoogleSearcher, WebSearcher } from '@isdk/web-fetcher';
25
29
 
26
30
  // Register the engine (only needs to be done once)
27
- Searcher.register(GoogleSearcher);
31
+ WebSearcher.register(GoogleSearcher);
28
32
 
29
33
  // Search!
30
34
  // The 'limit' parameter ensures we fetch enough pages to get 20 results.
31
35
  // Note: The engine name is case-sensitive and derived from the class name (e.g., 'GoogleSearcher' -> 'Google')
32
- const results = await Searcher.search('Google', 'open source', { limit: 20 });
36
+ const results = await WebSearcher.search('Google', 'open source', { limit: 20 });
33
37
 
34
38
  console.log(results);
35
39
  ```
36
40
 
37
41
  ### 2. Stateful Session
38
42
 
39
- Since `Searcher` extends `FetchSession`, you can instantiate it to keep cookies and storage alive across multiple requests. This is useful for authenticated searches or avoiding bot detection by behaving like a human.
43
+ Since `WebSearcher` extends `FetchSession`, you can instantiate it to keep cookies and storage alive across multiple requests. This is useful for authenticated searches or avoiding bot detection by behaving like a human.
44
+
45
+ ### 🛡️ Core Principle: Template is Law
40
46
 
41
- **Configuration Precedence:**
42
- When creating a session, options are merged in the following order:
43
- 1. **Template Default**: Defined in the Searcher class (highest priority for structural options).
44
- 2. **User Options**: Passed to the constructor (can fill missing defaults or override if allowed).
47
+ The `template` defined in the `WebSearcher` subclass acts as the authoritative "blueprint".
45
48
 
46
- *Note: If the template sets `engine: 'auto'` (default), user-provided `engine` option will be respected.*
49
+ - **Template Priority**: If the template defines a property (e.g., `engine: 'browser'`, `headers`), that value is **locked** and cannot be overridden by user options. This ensures engine stability.
50
+ - **User Flexibility**: Properties **not** explicitly defined in the template (such as `proxy`, `timeoutMs`, or custom variables) can be freely set by the user in the constructor or `search()` method.
47
51
 
48
52
  ```typescript
49
53
  // Create a persistent session
50
54
  const google = new GoogleSearcher({
51
- headless: false, // Override default options (e.g., show browser)
55
+ headless: false, // Override if not locked in template
52
56
  proxy: 'http://my-proxy:8080',
53
- timeoutMs: 30000 // Set a global timeout for requests
57
+ timeoutMs: 30000 // Set a global timeout (valid if template doesn't define it)
54
58
  });
55
59
 
56
60
  try {
57
61
  // First query
58
62
  // You can also pass runtime options to override session defaults or inject variables
59
63
  const results1 = await google.search('term A', {
60
- timeoutMs: 60000, // Override timeout just for this search
64
+ timeoutMs: 60000, // Override session timeout just for this search
61
65
  extraParam: 'value' // Can be used in template as ${extraParam}
62
66
  });
63
67
 
@@ -71,11 +75,11 @@ try {
71
75
 
72
76
  ## 🛠️ Implementing a New Search Engine
73
77
 
74
- To support a new website, create a class that extends `Searcher`.
78
+ To support a new website, create a class that extends `WebSearcher`.
75
79
 
76
80
  ### Step 1: Define the Template
77
81
 
78
- To support a new website, create a class that extends `Searcher`. The engine name is automatically derived from the class name (e.g., `MyBlogSearcher` -> `MyBlog`), but you can customize it and add aliases using static properties.
82
+ To support a new website, create a class that extends `WebSearcher`. The engine name is automatically derived from the class name (e.g., `MyBlogSearcher` -> `MyBlog`), but you can customize it and add aliases using static properties.
79
83
 
80
84
  The `template` property defines the "Blueprint" for your search. It's a standard `FetcherOptions` object but supports **variable injection**.
81
85
 
@@ -87,10 +91,10 @@ Supported variables:
87
91
  - `${limit}`: The requested limit.
88
92
 
89
93
  ```typescript
90
- import { Searcher } from '@isdk/web-fetcher/search';
94
+ import { WebSearcher } from '@isdk/web-fetcher/search';
91
95
  import { FetcherOptions } from '@isdk/web-fetcher/types';
92
96
 
93
- export class MyBlogSearcher extends Searcher {
97
+ export class MyBlogSearcher extends WebSearcher {
94
98
  static name = 'blog'; // Custom name (case-sensitive)
95
99
  static alias = ['myblog', 'news'];
96
100
 
@@ -120,7 +124,7 @@ export class MyBlogSearcher extends Searcher {
120
124
 
121
125
  ### Step 2: Configure Pagination
122
126
 
123
- Tell the `Searcher` how to navigate to the next page. Implement the `pagination` getter.
127
+ Tell the `WebSearcher` how to navigate to the next page. Implement the `pagination` getter.
124
128
 
125
129
  #### Option A: URL Parameters (Offset/Page)
126
130
 
@@ -152,7 +156,7 @@ protected override get pagination() {
152
156
 
153
157
  ### Step 3: Transform & Clean Data
154
158
 
155
- Override `transform` to clean data. Since `Searcher` is a `FetchSession`, you can also make extra requests (like resolving redirects) using `this`.
159
+ Override `transform` to clean data. Since `WebSearcher` is a `FetchSession`, you can also make extra requests (like resolving redirects) using `this`.
156
160
 
157
161
  ```typescript
158
162
  protected override async transform(outputs: Record<string, any>) {
@@ -167,24 +171,43 @@ protected override async transform(outputs: Record<string, any>) {
167
171
  }
168
172
  ```
169
173
 
170
- ## 🧠 Advanced Concepts
174
+ ## 🧠 Advanced Concepts
175
+
176
+ ### Auto-Pagination: `limit` vs `maxPages`
177
+
178
+ The `WebSearcher` is designed to be result-oriented. When you call `search()`, you specify how many results you want, and the searcher handles the pagination logic.
179
+
180
+ - **`limit`**: Your target number of total results.
181
+ - **`maxPages`**: The safety threshold. It limits how many pages (fetch cycles) the searcher is allowed to navigate to satisfy your `limit`.
171
182
 
172
- ### Auto-Pagination & Filtering
183
+ **Example Logic:**
184
+ If you request `{ limit: 50 }` but each page only has 5 results:
173
185
 
174
- The `Searcher` is smart. If you request `limit: 10`, but the first page only returns 5 results (or if your `transform` filters out results), it will automatically fetch the next page until the limit is met.
186
+ 1. The searcher fetches page 1 (5 results).
187
+ 2. It sees `5 < 50`, so it fetches page 2.
188
+ 3. It continues until it has 50 results **OR** it reaches `maxPages` (default 10).
189
+
190
+ This prevents infinite loops if the "Next" button selector is broken or if the search engine keeps returning the same results.
175
191
 
176
192
  ### User-defined Transforms
177
193
 
178
194
  Users can provide their own `transform` when calling `search`. This runs **after** the engine's built-in transform.
179
195
 
196
+ This is especially useful for **filtering out ads** or irrelevant content. If your transform filters out results, the auto-pagination logic automatically fetches more pages (up to `maxPages`) so that the final result list still meets the requested `limit` with only valid entries.
197
+
180
198
  ```typescript
181
199
  await google.search('test', {
182
- transform: (results) => results.filter(r => r.url.endsWith('.pdf'))
200
+ limit: 20,
201
+ // Example: Filter out sponsored results and only keep PDFs
202
+ transform: (results) => {
203
+ return results.filter(r => {
204
+ const isAd = r.isSponsored || r.url.includes('googleadservices.com');
205
+ return !isAd && r.url.endsWith('.pdf');
206
+ });
207
+ }
183
208
  });
184
209
  ```
185
210
 
186
- If the user filters out results, the auto-pagination logic will kick in to fetch more pages to meet the requested limit.
187
-
188
211
  ### Standardized Search Options
189
212
 
190
213
  When calling `search()`, you can provide standardized options that the search engine will map to specific parameters:
package/dist/index.d.mts CHANGED
@@ -15,7 +15,17 @@ interface StandardSearchResult {
15
15
  snippet?: string;
16
16
  /** An optional image URL associated with the result. */
17
17
  image?: string;
18
- /** Allows for engine-specific extra fields (e.g., rank, author, date). */
18
+ /** The date the result was published or last updated. */
19
+ date?: string | Date;
20
+ /** The author or source name of the result. */
21
+ author?: string;
22
+ /** The favicon URL of the source website. */
23
+ favicon?: string;
24
+ /** The rank or position of the result (usually 1-indexed). */
25
+ rank?: number;
26
+ /** The source website name (e.g., 'GitHub', 'StackOverflow'). */
27
+ source?: string;
28
+ /** Allows for engine-specific extra fields (e.g., siteIcon, category). */
19
29
  [key: string]: any;
20
30
  }
21
31
  /**
@@ -52,6 +62,16 @@ interface PaginationConfig {
52
62
  * Required if type is 'click-next'.
53
63
  */
54
64
  nextButtonSelector?: string;
65
+ /**
66
+ * The safety threshold for the maximum number of pages to fetch automatically
67
+ * in a single search call.
68
+ *
69
+ * Even if the requested `limit` of results hasn't been reached, the searcher
70
+ * will stop after this many pages to prevent infinite loops or excessive API usage.
71
+ *
72
+ * @default 10
73
+ */
74
+ maxPages?: number;
55
75
  }
56
76
  /**
57
77
  * Context object passed to the transform function.
@@ -80,6 +100,15 @@ type SafeSearchLevel = 'off' | 'moderate' | 'strict';
80
100
  interface SearchOptions {
81
101
  /** The maximum number of results to retrieve. */
82
102
  limit?: number;
103
+ /**
104
+ * The maximum number of pages (fetch cycles) allowed to reach the requested `limit`.
105
+ *
106
+ * This is a safety guard. If the `limit` is high but each page has few results,
107
+ * the searcher will stop once this page count is reached.
108
+ *
109
+ * If not provided, it defaults to the value in `PaginationConfig` or 10.
110
+ */
111
+ maxPages?: number;
83
112
  /**
84
113
  * Date range for the search results.
85
114
  * Default: 'all'
package/dist/index.d.ts CHANGED
@@ -15,7 +15,17 @@ interface StandardSearchResult {
15
15
  snippet?: string;
16
16
  /** An optional image URL associated with the result. */
17
17
  image?: string;
18
- /** Allows for engine-specific extra fields (e.g., rank, author, date). */
18
+ /** The date the result was published or last updated. */
19
+ date?: string | Date;
20
+ /** The author or source name of the result. */
21
+ author?: string;
22
+ /** The favicon URL of the source website. */
23
+ favicon?: string;
24
+ /** The rank or position of the result (usually 1-indexed). */
25
+ rank?: number;
26
+ /** The source website name (e.g., 'GitHub', 'StackOverflow'). */
27
+ source?: string;
28
+ /** Allows for engine-specific extra fields (e.g., siteIcon, category). */
19
29
  [key: string]: any;
20
30
  }
21
31
  /**
@@ -52,6 +62,16 @@ interface PaginationConfig {
52
62
  * Required if type is 'click-next'.
53
63
  */
54
64
  nextButtonSelector?: string;
65
+ /**
66
+ * The safety threshold for the maximum number of pages to fetch automatically
67
+ * in a single search call.
68
+ *
69
+ * Even if the requested `limit` of results hasn't been reached, the searcher
70
+ * will stop after this many pages to prevent infinite loops or excessive API usage.
71
+ *
72
+ * @default 10
73
+ */
74
+ maxPages?: number;
55
75
  }
56
76
  /**
57
77
  * Context object passed to the transform function.
@@ -80,6 +100,15 @@ type SafeSearchLevel = 'off' | 'moderate' | 'strict';
80
100
  interface SearchOptions {
81
101
  /** The maximum number of results to retrieve. */
82
102
  limit?: number;
103
+ /**
104
+ * The maximum number of pages (fetch cycles) allowed to reach the requested `limit`.
105
+ *
106
+ * This is a safety guard. If the `limit` is high but each page has few results,
107
+ * the searcher will stop once this page count is reached.
108
+ *
109
+ * If not provided, it defaults to the value in `PaginationConfig` or 10.
110
+ */
111
+ maxPages?: number;
83
112
  /**
84
113
  * Date range for the search results.
85
114
  * Default: 'all'
package/dist/index.js CHANGED
@@ -1 +1 @@
1
- "use strict";var t,e=Object.defineProperty,r=Object.getOwnPropertyDescriptor,s=Object.getOwnPropertyNames,a=Object.prototype.hasOwnProperty,i={};((t,r)=>{for(var s in r)e(t,s,{get:r[s],enumerable:!0})})(i,{GoogleSearcher:()=>h,WebSearcher:()=>f}),module.exports=(t=i,((t,i,n,o)=>{if(i&&"object"==typeof i||"function"==typeof i)for(let c of s(i))a.call(t,c)||c===n||e(t,c,{get:()=>i[c],enumerable:!(o=r(i,c))||o.enumerable});return t})(e({},"__esModule",{value:!0}),t));var n=require("@isdk/web-fetcher"),o=require("custom-factory"),c=require("lodash-es");function l(t,e){if("string"==typeof t)return t.replace(/\$\{(.*?)\}/g,(t,r)=>{const s=e[r.trim()];return void 0!==s?String(s):""});if(Array.isArray(t))return t.map(t=>l(t,e));if((0,c.isPlainObject)(t)){const r={};for(const s in t)Object.prototype.hasOwnProperty.call(t,s)&&(r[s]=l(t[s],e));return r}return t}var u=require("lodash-es"),f=class extends n.FetchSession{static async search(t,e,r={}){const s=this.createObject(t,r);if(!s)throw new Error(`Search engine not found: ${t}`);try{return await s.search(e,r)}finally{await s.dispose()}}get pagination(){}createContext(t=this.options){const e=this.template,r=(0,u.defaultsDeep)({},e,t);return e.engine&&"auto"!==e.engine||!t.engine||(r.engine=t.engine),super.createContext(r)}async search(t,e={}){const r=e.limit||10,s=[];let a=0;const i=this.pagination?.startValue??0,n=this.pagination?.increment??1;for(;s.length<r;){const o=this.formatOptions(e),c=i+a*n,f={...e,...o,query:t,page:a+i,offset:c,limit:r},h=l(this.template,f),m=(0,u.defaultsDeep)({},h,e),d=[];if(0===a||"url-param"===this.pagination?.type?m.url&&d.push({id:"goto",params:{url:m.url}}):"click-next"===this.pagination?.type&&this.pagination.nextButtonSelector&&(d.push({id:"click",params:{selector:this.pagination.nextButtonSelector}}),d.push({id:"waitFor",params:{networkIdle:!0,ms:500}})),m.actions){const 
t=m.actions.filter(t=>!(d.length>0&&"goto"===d[0].id&&"goto"===t.id));d.push(...t)}m.engine&&this.context.engine!==m.engine&&m.engine;const{outputs:g}=await this.executeAll(d),p={query:t,page:a,limit:e.limit};let w=[];if(w=await this.transform(g,p),e.transform&&(w=await e.transform(w,p)),!w||0===w.length)break;if(s.push(...w),s.length>=r||!this.pagination)break;if(a++,a>10)break}return s.slice(0,r)}async transform(t,e){return t.results||[]}formatOptions(t){return{...t}}};f._isFactory=!1,(0,o.addBaseFactoryAbility)(f),f.prototype.name="Searcher";var h=class extends f{get template(){return{engine:"browser",browser:{headless:!1},url:"https://www.google.com/search?q=${query}&start=${offset}&tbs=${tbs}&tbm=${tbm}&gl=${gl}&hl=${hl}&safe=${safe}",actions:[{id:"extract",storeAs:"results",params:{type:"array",selector:"#main #search",items:{url:{selector:"a:has(h3)",attribute:"href",required:!0},title:{selector:"a:has(h3) h3",required:!0,mode:"innerText"},snippet:{selector:"div[style*='-webkit-line-clamp']",type:"html"}}}}]}}get pagination(){return{type:"url-param",paramName:"start",startValue:0,increment:10}}formatOptions(t){const e={};if(t.timeRange)if("string"==typeof t.timeRange){const r={day:"qdr:d",week:"qdr:w",month:"qdr:m",year:"qdr:y"};r[t.timeRange]&&(e.tbs=r[t.timeRange])}else{const r=new Date(t.timeRange.from),s=t.timeRange.to?new Date(t.timeRange.to):new Date;if(!isNaN(r.getTime())&&!isNaN(s.getTime())){const t=t=>`${t.getMonth()+1}/${t.getDate()}/${t.getFullYear()}`;e.tbs=`cdr:1,cd_min:${t(r)},cd_max:${t(s)}`}}if(t.category){const r={images:"isch",videos:"vid",news:"nws"};r[t.category]&&(e.tbm=r[t.category])}return t.region&&(e.gl=t.region),t.language&&(e.hl=t.language),t.safeSearch&&("strict"===t.safeSearch?e.safe="active":"off"===t.safeSearch&&(e.safe="images")),e}async transform(t){const e=t.results||[];return Array.isArray(e)?e.map(t=>{if(t.url&&t.url.startsWith("/url?q="))try{const e=new 
URL(t.url,"https://www.google.com").searchParams.get("q");e&&(t.url=e)}catch(t){}return t}):[]}};h.alias=["google"];
1
+ "use strict";var t,e=Object.defineProperty,r=Object.getOwnPropertyDescriptor,s=Object.getOwnPropertyNames,a=Object.prototype.hasOwnProperty,i={};((t,r)=>{for(var s in r)e(t,s,{get:r[s],enumerable:!0})})(i,{GoogleSearcher:()=>f,WebSearcher:()=>h}),module.exports=(t=i,((t,i,n,o)=>{if(i&&"object"==typeof i||"function"==typeof i)for(let c of s(i))a.call(t,c)||c===n||e(t,c,{get:()=>i[c],enumerable:!(o=r(i,c))||o.enumerable});return t})(e({},"__esModule",{value:!0}),t));var n=require("@isdk/web-fetcher"),o=require("custom-factory"),c=require("lodash-es");function l(t,e){if("string"==typeof t)return t.replace(/\$\{(.*?)\}/g,(t,r)=>{const s=e[r.trim()];return void 0!==s?String(s):""});if(Array.isArray(t))return t.map(t=>l(t,e));if((0,c.isPlainObject)(t)){const r={};for(const s in t)Object.prototype.hasOwnProperty.call(t,s)&&(r[s]=l(t[s],e));return r}return t}var u=require("lodash-es"),h=class extends n.FetchSession{static async search(t,e,r={}){const s=this.createObject(t,r);if(!s)throw new Error(`Search engine not found: ${t}`);try{return await s.search(e,r)}finally{await s.dispose()}}get pagination(){}createContext(t=this.options){const e=this.template,r=(0,u.defaultsDeep)({},e,t);return e.engine&&"auto"!==e.engine||!t.engine||(r.engine=t.engine),super.createContext(r)}async search(t,e={}){const r=e.limit||10,s=[];let a=0;const i=this.pagination?.startValue??0,n=this.pagination?.increment??1,o=e.maxPages||this.pagination?.maxPages||10;for(;s.length<r;){const c=this.formatOptions(e),h=i+a*n,f={...e,...c,query:t,page:a+i,offset:h,limit:r},m=l(this.template,f),d=(0,u.defaultsDeep)({},m,e),g=[];if(0===a||"url-param"===this.pagination?.type?d.url&&g.push({id:"goto",params:{url:d.url}}):"click-next"===this.pagination?.type&&this.pagination.nextButtonSelector&&(g.push({id:"click",params:{selector:this.pagination.nextButtonSelector}}),g.push({id:"waitFor",params:{networkIdle:!0,ms:500}})),d.actions){const 
t=d.actions.filter(t=>!(g.length>0&&"goto"===g[0].id&&"goto"===t.id));g.push(...t)}d.engine&&this.context.engine!==d.engine&&d.engine;const{outputs:p}=await this.executeAll(g),w={query:t,page:a,limit:e.limit};let y=[];if(y=await this.transform(p,w),e.transform&&(y=await e.transform(y,w)),!y||0===y.length)break;if(s.push(...y),s.length>=r||!this.pagination)break;if(a++,a>=o)break}return s.slice(0,r)}async transform(t,e){return t.results||[]}formatOptions(t){return{...t}}};h._isFactory=!1,(0,o.addBaseFactoryAbility)(h),h.prototype.name="Searcher";var f=class extends h{get template(){return{engine:"browser",browser:{headless:!1},url:"https://www.google.com/search?q=${query}&start=${offset}&tbs=${tbs}&tbm=${tbm}&gl=${gl}&hl=${hl}&safe=${safe}",actions:[{id:"extract",storeAs:"results",params:{type:"array",selector:"#main #search",items:{url:{selector:"a:has(h3)",attribute:"href",required:!0},title:{selector:"a:has(h3) h3",required:!0,mode:"innerText"},snippet:{selector:"div[style*='-webkit-line-clamp']",type:"html"}}}}]}}get pagination(){return{type:"url-param",paramName:"start",startValue:0,increment:10}}formatOptions(t){const e={};if(t.timeRange)if("string"==typeof t.timeRange){const r={day:"qdr:d",week:"qdr:w",month:"qdr:m",year:"qdr:y"};r[t.timeRange]&&(e.tbs=r[t.timeRange])}else{const r=new Date(t.timeRange.from),s=t.timeRange.to?new Date(t.timeRange.to):new Date;if(!isNaN(r.getTime())&&!isNaN(s.getTime())){const t=t=>`${t.getMonth()+1}/${t.getDate()}/${t.getFullYear()}`;e.tbs=`cdr:1,cd_min:${t(r)},cd_max:${t(s)}`}}if(t.category){const r={images:"isch",videos:"vid",news:"nws"};r[t.category]&&(e.tbm=r[t.category])}return t.region&&(e.gl=t.region),t.language&&(e.hl=t.language),t.safeSearch&&("strict"===t.safeSearch?e.safe="active":"off"===t.safeSearch&&(e.safe="images")),e}async transform(t){const e=t.results||[];return Array.isArray(e)?e.map(t=>{if(t.url&&t.url.startsWith("/url?q="))try{const e=new 
URL(t.url,"https://www.google.com").searchParams.get("q");e&&(t.url=e)}catch(t){}return t}):[]}};f.alias=["google"];
package/dist/index.mjs CHANGED
@@ -1 +1 @@
1
- import{FetchSession as t}from"@isdk/web-fetcher";import{addBaseFactoryAbility as r}from"custom-factory";import{isPlainObject as e}from"lodash-es";function s(t,r){if("string"==typeof t)return t.replace(/\$\{(.*?)\}/g,(t,e)=>{const s=r[e.trim()];return void 0!==s?String(s):""});if(Array.isArray(t))return t.map(t=>s(t,r));if(e(t)){const e={};for(const a in t)Object.prototype.hasOwnProperty.call(t,a)&&(e[a]=s(t[a],r));return e}return t}import{defaultsDeep as a}from"lodash-es";var i=class extends t{static async search(t,r,e={}){const s=this.createObject(t,e);if(!s)throw new Error(`Search engine not found: ${t}`);try{return await s.search(r,e)}finally{await s.dispose()}}get pagination(){}createContext(t=this.options){const r=this.template,e=a({},r,t);return r.engine&&"auto"!==r.engine||!t.engine||(e.engine=t.engine),super.createContext(e)}async search(t,r={}){const e=r.limit||10,i=[];let o=0;const n=this.pagination?.startValue??0,c=this.pagination?.increment??1;for(;i.length<e;){const l=this.formatOptions(r),m=n+o*c,h={...r,...l,query:t,page:o+n,offset:m,limit:e},f=s(this.template,h),u=a({},f,r),p=[];if(0===o||"url-param"===this.pagination?.type?u.url&&p.push({id:"goto",params:{url:u.url}}):"click-next"===this.pagination?.type&&this.pagination.nextButtonSelector&&(p.push({id:"click",params:{selector:this.pagination.nextButtonSelector}}),p.push({id:"waitFor",params:{networkIdle:!0,ms:500}})),u.actions){const t=u.actions.filter(t=>!(p.length>0&&"goto"===p[0].id&&"goto"===t.id));p.push(...t)}u.engine&&this.context.engine!==u.engine&&u.engine;const{outputs:d}=await this.executeAll(p),w={query:t,page:o,limit:r.limit};let g=[];if(g=await this.transform(d,w),r.transform&&(g=await r.transform(g,w)),!g||0===g.length)break;if(i.push(...g),i.length>=e||!this.pagination)break;if(o++,o>10)break}return i.slice(0,e)}async transform(t,r){return t.results||[]}formatOptions(t){return{...t}}};i._isFactory=!1,r(i),i.prototype.name="Searcher";var o=class extends i{get 
template(){return{engine:"browser",browser:{headless:!1},url:"https://www.google.com/search?q=${query}&start=${offset}&tbs=${tbs}&tbm=${tbm}&gl=${gl}&hl=${hl}&safe=${safe}",actions:[{id:"extract",storeAs:"results",params:{type:"array",selector:"#main #search",items:{url:{selector:"a:has(h3)",attribute:"href",required:!0},title:{selector:"a:has(h3) h3",required:!0,mode:"innerText"},snippet:{selector:"div[style*='-webkit-line-clamp']",type:"html"}}}}]}}get pagination(){return{type:"url-param",paramName:"start",startValue:0,increment:10}}formatOptions(t){const r={};if(t.timeRange)if("string"==typeof t.timeRange){const e={day:"qdr:d",week:"qdr:w",month:"qdr:m",year:"qdr:y"};e[t.timeRange]&&(r.tbs=e[t.timeRange])}else{const e=new Date(t.timeRange.from),s=t.timeRange.to?new Date(t.timeRange.to):new Date;if(!isNaN(e.getTime())&&!isNaN(s.getTime())){const t=t=>`${t.getMonth()+1}/${t.getDate()}/${t.getFullYear()}`;r.tbs=`cdr:1,cd_min:${t(e)},cd_max:${t(s)}`}}if(t.category){const e={images:"isch",videos:"vid",news:"nws"};e[t.category]&&(r.tbm=e[t.category])}return t.region&&(r.gl=t.region),t.language&&(r.hl=t.language),t.safeSearch&&("strict"===t.safeSearch?r.safe="active":"off"===t.safeSearch&&(r.safe="images")),r}async transform(t){const r=t.results||[];return Array.isArray(r)?r.map(t=>{if(t.url&&t.url.startsWith("/url?q="))try{const r=new URL(t.url,"https://www.google.com").searchParams.get("q");r&&(t.url=r)}catch(t){}return t}):[]}};o.alias=["google"];export{o as GoogleSearcher,i as WebSearcher};
1
+ import{FetchSession as t}from"@isdk/web-fetcher";import{addBaseFactoryAbility as r}from"custom-factory";import{isPlainObject as e}from"lodash-es";function s(t,r){if("string"==typeof t)return t.replace(/\$\{(.*?)\}/g,(t,e)=>{const s=r[e.trim()];return void 0!==s?String(s):""});if(Array.isArray(t))return t.map(t=>s(t,r));if(e(t)){const e={};for(const a in t)Object.prototype.hasOwnProperty.call(t,a)&&(e[a]=s(t[a],r));return e}return t}import{defaultsDeep as a}from"lodash-es";var i=class extends t{static async search(t,r,e={}){const s=this.createObject(t,e);if(!s)throw new Error(`Search engine not found: ${t}`);try{return await s.search(r,e)}finally{await s.dispose()}}get pagination(){}createContext(t=this.options){const r=this.template,e=a({},r,t);return r.engine&&"auto"!==r.engine||!t.engine||(e.engine=t.engine),super.createContext(e)}async search(t,r={}){const e=r.limit||10,i=[];let o=0;const n=this.pagination?.startValue??0,c=this.pagination?.increment??1,h=r.maxPages||this.pagination?.maxPages||10;for(;i.length<e;){const l=this.formatOptions(r),m=n+o*c,f={...r,...l,query:t,page:o+n,offset:m,limit:e},u=s(this.template,f),p=a({},u,r),d=[];if(0===o||"url-param"===this.pagination?.type?p.url&&d.push({id:"goto",params:{url:p.url}}):"click-next"===this.pagination?.type&&this.pagination.nextButtonSelector&&(d.push({id:"click",params:{selector:this.pagination.nextButtonSelector}}),d.push({id:"waitFor",params:{networkIdle:!0,ms:500}})),p.actions){const t=p.actions.filter(t=>!(d.length>0&&"goto"===d[0].id&&"goto"===t.id));d.push(...t)}p.engine&&this.context.engine!==p.engine&&p.engine;const{outputs:w}=await this.executeAll(d),g={query:t,page:o,limit:r.limit};let y=[];if(y=await this.transform(w,g),r.transform&&(y=await r.transform(y,g)),!y||0===y.length)break;if(i.push(...y),i.length>=e||!this.pagination)break;if(o++,o>=h)break}return i.slice(0,e)}async transform(t,r){return t.results||[]}formatOptions(t){return{...t}}};i._isFactory=!1,r(i),i.prototype.name="Searcher";var 
o=class extends i{get template(){return{engine:"browser",browser:{headless:!1},url:"https://www.google.com/search?q=${query}&start=${offset}&tbs=${tbs}&tbm=${tbm}&gl=${gl}&hl=${hl}&safe=${safe}",actions:[{id:"extract",storeAs:"results",params:{type:"array",selector:"#main #search",items:{url:{selector:"a:has(h3)",attribute:"href",required:!0},title:{selector:"a:has(h3) h3",required:!0,mode:"innerText"},snippet:{selector:"div[style*='-webkit-line-clamp']",type:"html"}}}}]}}get pagination(){return{type:"url-param",paramName:"start",startValue:0,increment:10}}formatOptions(t){const r={};if(t.timeRange)if("string"==typeof t.timeRange){const e={day:"qdr:d",week:"qdr:w",month:"qdr:m",year:"qdr:y"};e[t.timeRange]&&(r.tbs=e[t.timeRange])}else{const e=new Date(t.timeRange.from),s=t.timeRange.to?new Date(t.timeRange.to):new Date;if(!isNaN(e.getTime())&&!isNaN(s.getTime())){const t=t=>`${t.getMonth()+1}/${t.getDate()}/${t.getFullYear()}`;r.tbs=`cdr:1,cd_min:${t(e)},cd_max:${t(s)}`}}if(t.category){const e={images:"isch",videos:"vid",news:"nws"};e[t.category]&&(r.tbm=e[t.category])}return t.region&&(r.gl=t.region),t.language&&(r.hl=t.language),t.safeSearch&&("strict"===t.safeSearch?r.safe="active":"off"===t.safeSearch&&(r.safe="images")),r}async transform(t){const r=t.results||[];return Array.isArray(r)?r.map(t=>{if(t.url&&t.url.startsWith("/url?q="))try{const r=new URL(t.url,"https://www.google.com").searchParams.get("q");r&&(t.url=r)}catch(t){}return t}):[]}};o.alias=["google"];export{o as GoogleSearcher,i as WebSearcher};