@howuse/electron-crawler 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +102 -0
- package/dist/common.d.ts +25 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +86 -0
- package/dist/index.mjs +318 -0
- package/dist/newsCrawler.d.ts +53 -0
- package/dist/vite.config.d.ts +2 -0
- package/package.json +58 -0
package/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Your Name

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
package/README.md
ADDED
@@ -0,0 +1,102 @@
# @howuse/electron-crawler

An Electron-based crawler toolkit for scraping news and stock details.

## Features

- 🚀 Built on Electron, supporting modern web page rendering
- 📰 Crawling features designed for news sites
- 📊 Stock detail scraping
- ⚙️ Configurable crawl rules
- 🔄 Automatic scheduled runs
- 📦 TypeScript support with complete type definitions

## Installation

```bash
npm install @howuse/electron-crawler
```

## Usage

### Basic usage

```javascript
import { initCrawler } from '@howuse/electron-crawler'

// Initialize the crawler config (this also starts the scheduler,
// which runs every 30 minutes by default)
initCrawler({
  rules: [
    {
      remark: 'Example news site',
      base_url: 'https://example-news-site.com',
      home_range_selector: '.news-list',
      title_selector: 'h1',
      content_selector: '.article-content',
      time_selector: '.publish-time',
      exclude_selectors: ['.ad', '.related-links'],
      enabled: true
    }
  ],
  pushApiUrl: 'https://api.example.com/news/push',
  // Optional: field mapping; '-' drops the field
  newsItemFieldMap: {
    url: 'link',                // rename the link
    title: 'title',             // keep unchanged
    content_html: 'content',    // rename the HTML content
    content_markdown: '-',      // do not push markdown
    published_at: 'publishTime' // rename the publish time
  }
})
```

### Fetching rules from an API

```javascript
import { initCrawler } from '@howuse/electron-crawler'

initCrawler({
  // In-memory rules take priority, then the local store; as a last
  // resort rules are fetched from the API and written to the store
  rulesApiUrl: 'https://example-news-site.com/rules.json',
  ruleTransformer: (data) => data.rules ?? data, // adapt here if the API returns { rules: [...] }
  pushApiUrl: 'https://api.example.com/news/push'
})
```

### Configuration options

| Option | Type | Description |
|--------|------|-------------|
| `rules` | `NewsRule[]` | Rules provided directly |
| `rulesApiUrl` | `string` | URL of the rules API |
| `pushApiUrl` | `string` | URL of the result push API (required for pushing to happen) |
| `ruleTransformer` | `(data: any) => any` | Function that transforms the rules API response |
| `newsItemFieldMap` | `Partial<Record<keyof NewsItem, string \| '-'>>` | Push field mapping; a value of '-' drops the field |
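With the `newsItemFieldMap` from the basic-usage example above, each crawled item is reshaped before it is POSTed. A sketch of that mapping step, mirroring the bundled logic (the function name here is illustrative):

```javascript
// '-' drops a field, a string renames it, and unmapped fields keep their key.
function applyFieldMap(item, fieldMap) {
  if (!fieldMap || Object.keys(fieldMap).length === 0) return item
  const mapped = {}
  for (const [key, value] of Object.entries(item)) {
    const target = fieldMap[key]
    if (target === '-') continue // drop the field
    mapped[typeof target === 'string' ? target : key] = value
  }
  return mapped
}

// With the map above, { url, title, content_html, content_markdown, published_at }
// is pushed as { link, title, content, publishTime }.
```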
### NewsRule structure

| Property | Type | Required | Description |
|----------|------|----------|-------------|
| `remark` | `string` | Yes | Remark / label for the rule |
| `base_url` | `string` | Yes | Site URL |
| `home_range_selector` | `string` | Yes | Selector scoping the home-page link search |
| `title_selector` | `string` | Yes | Title selector |
| `content_selector` | `string` | Yes | Content selector |
| `time_selector` | `string` | Yes | Publish-time selector |
| `exclude_selectors` | `string[]` | No | Selectors for elements to exclude |
| `enabled` | `boolean` | No | Whether the rule is enabled; defaults to true |

## API

### `initCrawler(config: CrawlerConfig)`

Initializes and starts the scheduled crawler. Rules are resolved automatically per the configuration (in-memory > local store > API), and every news item is pushed to `pushApiUrl` as soon as it is crawled.
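The crawler drives a hidden `BrowserWindow`, so `initCrawler` belongs in the Electron main process and is safest to call once the app is ready. A minimal sketch of that wiring (URLs are placeholders):

```javascript
// main.js - illustrative Electron main-process wiring
import { app } from 'electron'
import { initCrawler } from '@howuse/electron-crawler'

app.whenReady().then(() => {
  initCrawler({
    rulesApiUrl: 'https://api.example.com/rules.json',
    pushApiUrl: 'https://api.example.com/news/push'
  })
})
```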
## Notes

1. This package must run inside an Electron environment
2. Respect the target site's robots.txt and terms of use
3. Use a reasonable crawl interval to avoid putting load on the target server

## License

MIT
package/dist/common.d.ts
ADDED
@@ -0,0 +1,25 @@
import { BrowserWindow } from 'electron';
/**
 * Ensure the hidden window used for crawling exists.
 * All crawler modules share the same hidden window.
 */
export declare function ensureHiddenWindow(): BrowserWindow;
/**
 * Clean up all hidden crawler windows.
 */
export declare function cleanupHiddenWindows(): void;
/**
 * Convert HTML to Markdown.
 * Uses the turndown library for the conversion, preserving the original HTML structure.
 */
export declare function normalizeMarkdown(html: string): string;
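Given the bundled turndown configuration (atx headings, fenced code blocks, `-` bullets, `**` for bold), the conversion behaves roughly as below; the input and output are hypothetical, and the function is only a declaration here:

```javascript
// <script>/<style>/<noscript>/<iframe> tags and inline on* handlers are
// stripped before conversion, then runs of 3+ newlines collapse to 2.
normalizeMarkdown('<h2>Breaking</h2><p>Some <strong>bold</strong> text<br>next line</p>')
// → '## Breaking\n\nSome **bold** text\nnext line'
```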
/**
 * Parse and complete a time string.
 * Supports several formats: year-month-day, year-month-day hour:minute, hour:minute, etc.
 * Rules:
 * 1. If no time is given, return the current date and time
 * 2. If the date part is missing, use this year's date
 * 3. If the hour/minute part is missing, use the current system time
 * 4. Matching is done with regular expressions
 */
export declare function parseDate(text?: string): string;
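Hypothetical inputs and outputs illustrating those completion rules, assuming the system clock reads 2025-06-15 09:40 local time:

```javascript
parseDate('2025年3月5日 14:30') // full date and time → '2025-03-05T14:30:00.000'
parseDate('3月5日 14:30')       // year missing      → '2025-03-05T14:30:00.000'
parseDate('3月5日')             // time missing      → '2025-03-05T09:40:00.000'
parseDate('14:30')              // date missing      → '2025-06-15T14:30:00.000'
parseDate()                     // nothing to parse  → current time, ISO string
```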
package/dist/index.d.ts
ADDED
@@ -0,0 +1 @@
export { initCrawler } from './newsCrawler';
package/dist/index.js
ADDED
@@ -0,0 +1,86 @@
"use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const k=require("electron-store"),x=require("electron"),U=require("turndown");function v(){const e=x.BrowserWindow.getAllWindows().find(r=>r.title==="crawler-hidden-window");return e||new x.BrowserWindow({show:!1,webPreferences:{sandbox:!1},title:"crawler-hidden-window"})}const S=new U({headingStyle:"atx",codeBlockStyle:"fenced",bulletListMarker:"-",emDelimiter:"*",strongDelimiter:"**",linkStyle:"inlined",linkReferenceStyle:"full",preformattedCode:!1,blankReplacement:(e,t)=>t.nodeName==="BR"?`
`:""});S.addRule("preserveLineBreaks",{filter:["br"],replacement:()=>`
`});S.addRule("images",{filter:"img",replacement:(e,t)=>{const r=t.alt||"",s=t.src||t.getAttribute("src")||"",a=t.title||"";return a?`![${r}](${s} "${a}")`:`![${r}](${s})`}});function A(e){return e.replace(/<script[\s\S]*?<\/script>/gi,"").replace(/<style[\s\S]*?<\/style>/gi,"").replace(/<noscript[\s\S]*?<\/noscript>/gi,"").replace(/<iframe[\s\S]*?<\/iframe>/gi,"").replace(/on\w+="[^"]*"/gi,"").replace(/on\w+='[^']*'/gi,"").trim()}function C(e){if(!e||!e.trim())return"";try{const t=A(e);if(!t)return"";let r=S.turndown(t);return r=r.replace(/\n{3,}/g,`

`),r=r.split(`
`).map(s=>s.trimEnd()).join(`
`),r.trim()}catch(t){return console.error("[normalizeMarkdown] conversion failed:",t),A(e).replace(/<[^>]+>/g,"").replace(/\n{3,}/g,`

`).trim()}}function D(e){const t=new Date,r=t.getFullYear(),s=t.getMonth()+1,a=t.getDate(),o=t.getHours(),i=t.getMinutes();if(!e||!e.trim())return t.toISOString();const c=e.trim(),w=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/,y=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,T=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/,_=/(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,R=/(\d{1,2})[月\-/](\d{1,2})[日]?/,O=/(\d{1,2})[:时](\d{1,2})[分]?/;let g=r,u=s,f=a,m=o,d=i,n=c.match(w);if(n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),m=parseInt(n[4],10),d=parseInt(n[5],10);else if(n=c.match(y),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),m=parseInt(n[4],10),d=parseInt(n[5],10);else if(n=c.match(T),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10);else if(n=c.match(_),n)u=parseInt(n[1],10),f=parseInt(n[2],10),m=parseInt(n[3],10),d=parseInt(n[4],10);else if(n=c.match(R),n)u=parseInt(n[1],10),f=parseInt(n[2],10);else if(n=c.match(O),n)m=parseInt(n[1],10),d=parseInt(n[2],10);else{const I=new Date(c);return Number.isNaN(I.getTime())?t.toISOString():I.toISOString()}(u<1||u>12)&&(u=s),(f<1||f>31)&&(f=a),(m<0||m>23)&&(m=o),(d<0||d>59)&&(d=i);const h=new Date(g,u-1,f,m,d,0,0);return h.getFullYear()!==g||h.getMonth()!==u-1||h.getDate()!==f?t.toISOString():L(h)}function L(e){const t=y=>y.toString().padStart(2,"0"),r=e.getFullYear(),s=t(e.getMonth()+1),a=t(e.getDate()),o=t(e.getHours()),i=t(e.getMinutes()),c=t(e.getSeconds()),w=e.getMilliseconds().toString().padStart(3,"0");return`${r}-${s}-${a}T${o}:${i}:${c}.${w}`}const M=k.default||k,b=new M;async function P(){if(l.rules&&l.rules.length>0)return l.rules;const e=b.get("news.rules")||[];if(e.length>0)return e;const t=await H();return t.length>0?(b.set("news.rules",t),t):[]}const p={running:!1,interval_ms:1800*1e3};let l={rules:[],rulesApiUrl:void 0,pushApiUrl:void 0,ruleTransformer:e=>e,newsItemFieldMap:void 0},$=!1;function E(e){l={...l,...e,ruleTransformer:e.ruleTransformer||(t=>t&&typeof t=="object"&&"data"in t?t.data:t)},j()}async function H(){if(!l.rulesApiUrl)return[];try{const e=await fetch(l.rulesApiUrl);if(!e.ok)throw new Error(`Failed to fetch rules from API: ${e.status} ${e.statusText}`);const t=await e.json(),r=l.ruleTransformer(t);return Array.isArray(r)?r:(console.warn("[crawler] Rules API returned non-array data, using empty array instead"),[])}catch(e){return console.error("[crawler] Failed to fetch rules from API:",e),[]}}async function N(e){if(l.pushApiUrl)try{const t=q(e),r=await fetch(l.pushApiUrl,{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify(t)});if(!r.ok)throw new Error(`Failed to push results to API: ${r.status} ${r.statusText}`);console.log("[crawler] Results pushed to API successfully")}catch(t){console.error("[crawler] Failed to push results to API:",t)}}function q(e){const t=l.newsItemFieldMap;if(!t||Object.keys(t).length===0)return e;const r={},s=Object.entries(e);for(const[a,o]of s){const i=t[a];if(i==="-")continue;const c=typeof i=="string"?i:a;r[c]=o}return r}async function F(e,t){return await e.webContents.executeJavaScript(`
(() => {
  const links = []
  // find all links within the configured range
  const rangeElements = document.querySelectorAll(${JSON.stringify(t.home_range_selector)})
  if (rangeElements.length === 0) {
    // if the range was not found, search the whole document
    const allLinks = document.querySelectorAll('a')
    allLinks.forEach((a) => {
      if (a.href && a.textContent && a.textContent.trim()) {
        links.push({
          href: a.href,
          text: a.textContent.trim()
        })
      }
    })
  } else {
    rangeElements.forEach((range) => {
      const rangeLinks = range.querySelectorAll('a')
      rangeLinks.forEach((a) => {
        if (a.href && a.textContent && a.textContent.trim()) {
          links.push({
            href: a.href,
            text: a.textContent.trim()
          })
        }
      })
    })
  }
  // de-duplicate and return
  const uniqueLinks = [...new Map(links.map(l => [l.href, l])).values()]
  return uniqueLinks.map(l => l.href)
})()
`)}async function W(e,t,r){try{return await e.loadURL(r,{httpReferrer:t.base_url}),await e.webContents.executeJavaScript(`
(() => {
  const pickText = (sel) => {
    const el = document.querySelector(sel)
    return el ? (el.textContent || '').trim() : ''
  }
  const pickContentHtml = (sel, excludes = []) => {
    const el = document.querySelector(sel)
    if (!el) return ''
    const clone = el.cloneNode(true)
    excludes.forEach((ex) => clone.querySelectorAll(ex).forEach((n) => n.remove()))

    // convert relative image URLs to absolute ones
    const images = clone.querySelectorAll('img')
    images.forEach((img) => {
      if (img.src) {
        try {
          // if src is relative, resolve it against the page URL
          const imgUrl = new URL(img.src, location.href)
          img.src = imgUrl.href
        } catch {
          // if URL parsing fails, fall back to getAttribute('src') and join manually
          const srcAttr = img.getAttribute('src')
          if (srcAttr && !srcAttr.startsWith('http://') && !srcAttr.startsWith('https://') && !srcAttr.startsWith('data:')) {
            try {
              const baseUrl = new URL(location.href)
              img.src = new URL(srcAttr, baseUrl.origin + baseUrl.pathname.substring(0, baseUrl.pathname.lastIndexOf('/') + 1)).href
            } catch {
              // if that also fails, leave the src as-is
            }
          }
        }
      }
    })

    return clone.innerHTML || ''
  }
  return {
    title: pickText(${JSON.stringify(t.title_selector)}),
    contentHtml: pickContentHtml(${JSON.stringify(t.content_selector)}, ${JSON.stringify(t.exclude_selectors||[])}),
    timeText: pickText(${JSON.stringify(t.time_selector)}),
    url: location.href
  }
})()
`)}catch(s){return console.warn("[crawler] failed to extract page content",r,s),null}}async function J(e){const t=v();try{await t.loadURL(e.base_url,{httpReferrer:e.base_url});const r=await F(t,e);console.log(`[crawler] found ${r.length} links from ${e.remark||e.base_url}`);const s=[];for(const a of r){const o=await W(t,e,a);if(!o||!o.title||!o.contentHtml){console.log(`[crawler] skip empty result for ${a}`);continue}const i={url:o.url||a,title:o.title,content_html:o.contentHtml,content_markdown:C(o.contentHtml),published_at:D(o.timeText)};s.push(i);try{await N(i)}catch(c){console.warn("[crawler] push single news item failed",c)}}return console.log(`[crawler] processed ${s.length} items from ${e.remark||e.base_url}`),{success:!0,data:s}}catch(r){return console.warn("[crawler] rule failed",e.remark||e.base_url,r),{success:!1,error:r instanceof Error?r.message:String(r)}}}function j(){if($)return;$=!0;const e=p.interval_ms,t=async()=>{const r=await P();console.log(`[crawler] scheduled run, rules=${r.length}`),p.running=!0,p.running_source=void 0;try{for(const s of r)await J(s)}finally{p.running=!1,p.running_source=void 0,p.next_run_at=new Date(Date.now()+e).toISOString()}};p.next_run_at=new Date(Date.now()+5e3).toISOString(),setTimeout(t,5e3),setInterval(t,e)}exports.initCrawler=E;
package/dist/index.mjs
ADDED
@@ -0,0 +1,318 @@
import k from "electron-store";
import { BrowserWindow as x } from "electron";
import U from "turndown";
function D() {
  const e = x.getAllWindows().find((r) => r.title === "crawler-hidden-window");
  return e || new x({
    show: !1,
    webPreferences: {
      sandbox: !1
    },
    title: "crawler-hidden-window"
  });
}
const S = new U({
  headingStyle: "atx",
  // headings in # (atx) style
  codeBlockStyle: "fenced",
  // code blocks fenced with ```
  bulletListMarker: "-",
  // use - as the bullet list marker
  emDelimiter: "*",
  // use * for emphasis
  strongDelimiter: "**",
  // use ** for bold
  linkStyle: "inlined",
  // inline link format [text](url)
  linkReferenceStyle: "full",
  // full link reference style
  preformattedCode: !1,
  // do not treat code as preformatted
  blankReplacement: (e, t) => t.nodeName === "BR" ? `
` : ""
});
S.addRule("preserveLineBreaks", {
  filter: ["br"],
  replacement: () => `
`
});
S.addRule("images", {
  filter: "img",
  replacement: (e, t) => {
    const r = t.alt || "", s = t.src || t.getAttribute("src") || "", a = t.title || "";
    return a ? `![${r}](${s} "${a}")` : `![${r}](${s})`;
  }
});
function A(e) {
  return e.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<noscript[\s\S]*?<\/noscript>/gi, "").replace(/<iframe[\s\S]*?<\/iframe>/gi, "").replace(/on\w+="[^"]*"/gi, "").replace(/on\w+='[^']*'/gi, "").trim();
}
function L(e) {
  if (!e || !e.trim())
    return "";
  try {
    const t = A(e);
    if (!t)
      return "";
    let r = S.turndown(t);
    return r = r.replace(/\n{3,}/g, `

`), r = r.split(`
`).map((s) => s.trimEnd()).join(`
`), r.trim();
  } catch (t) {
    return console.error("[normalizeMarkdown] conversion failed:", t), A(e).replace(/<[^>]+>/g, "").replace(/\n{3,}/g, `

`).trim();
  }
}
function v(e) {
  const t = /* @__PURE__ */ new Date(), r = t.getFullYear(), s = t.getMonth() + 1, a = t.getDate(), o = t.getHours(), l = t.getMinutes();
  if (!e || !e.trim())
    return t.toISOString();
  const c = e.trim(), w = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/, y = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, _ = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/, T = /(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, R = /(\d{1,2})[月\-/](\d{1,2})[日]?/, O = /(\d{1,2})[:时](\d{1,2})[分]?/;
  let g = r, u = s, f = a, m = o, p = l, n = c.match(w);
  if (n)
    g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), m = parseInt(n[4], 10), p = parseInt(n[5], 10);
  else if (n = c.match(y), n)
    g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), m = parseInt(n[4], 10), p = parseInt(n[5], 10);
  else if (n = c.match(_), n)
    g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10);
  else if (n = c.match(T), n)
    u = parseInt(n[1], 10), f = parseInt(n[2], 10), m = parseInt(n[3], 10), p = parseInt(n[4], 10);
  else if (n = c.match(R), n)
    u = parseInt(n[1], 10), f = parseInt(n[2], 10);
  else if (n = c.match(O), n)
    m = parseInt(n[1], 10), p = parseInt(n[2], 10);
  else {
    const I = new Date(c);
    return Number.isNaN(I.getTime()) ? t.toISOString() : I.toISOString();
  }
  (u < 1 || u > 12) && (u = s), (f < 1 || f > 31) && (f = a), (m < 0 || m > 23) && (m = o), (p < 0 || p > 59) && (p = l);
  const h = new Date(g, u - 1, f, m, p, 0, 0);
  return h.getFullYear() !== g || h.getMonth() !== u - 1 || h.getDate() !== f ? t.toISOString() : C(h);
}
function C(e) {
  const t = (y) => y.toString().padStart(2, "0"), r = e.getFullYear(), s = t(e.getMonth() + 1), a = t(e.getDate()), o = t(e.getHours()), l = t(e.getMinutes()), c = t(e.getSeconds()), w = e.getMilliseconds().toString().padStart(3, "0");
  return `${r}-${s}-${a}T${o}:${l}:${c}.${w}`;
}
const M = k.default || k, b = new M();
async function P() {
  if (i.rules && i.rules.length > 0)
    return i.rules;
  const e = b.get("news.rules") || [];
  if (e.length > 0)
    return e;
  const t = await E();
  return t.length > 0 ? (b.set("news.rules", t), t) : [];
}
const d = {
  running: !1,
  interval_ms: 1800 * 1e3
};
let i = {
  rules: [],
  rulesApiUrl: void 0,
  pushApiUrl: void 0,
  ruleTransformer: (e) => e,
  newsItemFieldMap: void 0
}, $ = !1;
function z(e) {
  i = {
    ...i,
    ...e,
    // make sure a ruleTransformer always exists
    ruleTransformer: e.ruleTransformer || ((t) => t && typeof t == "object" && "data" in t ? t.data : t)
  }, W();
}
async function E() {
  if (!i.rulesApiUrl)
    return [];
  try {
    const e = await fetch(i.rulesApiUrl);
    if (!e.ok)
      throw new Error(`Failed to fetch rules from API: ${e.status} ${e.statusText}`);
    const t = await e.json(), r = i.ruleTransformer(t);
    return Array.isArray(r) ? r : (console.warn("[crawler] Rules API returned non-array data, using empty array instead"), []);
  } catch (e) {
    return console.error("[crawler] Failed to fetch rules from API:", e), [];
  }
}
async function H(e) {
  if (i.pushApiUrl)
    try {
      const t = N(e), r = await fetch(i.pushApiUrl, {
        method: "POST",
        headers: {
          "Content-Type": "application/json"
        },
        body: JSON.stringify(t)
      });
      if (!r.ok)
        throw new Error(`Failed to push results to API: ${r.status} ${r.statusText}`);
      console.log("[crawler] Results pushed to API successfully");
    } catch (t) {
      console.error("[crawler] Failed to push results to API:", t);
    }
}
function N(e) {
  const t = i.newsItemFieldMap;
  if (!t || Object.keys(t).length === 0)
    return e;
  const r = {}, s = Object.entries(e);
  for (const [a, o] of s) {
    const l = t[a];
    if (l === "-") continue;
    const c = typeof l == "string" ? l : a;
    r[c] = o;
  }
  return r;
}
async function F(e, t) {
  return await e.webContents.executeJavaScript(
    `
    (() => {
      const links = []
      // find all links within the configured range
      const rangeElements = document.querySelectorAll(${JSON.stringify(t.home_range_selector)})
      if (rangeElements.length === 0) {
        // if the range was not found, search the whole document
        const allLinks = document.querySelectorAll('a')
        allLinks.forEach((a) => {
          if (a.href && a.textContent && a.textContent.trim()) {
            links.push({
              href: a.href,
              text: a.textContent.trim()
            })
          }
        })
      } else {
        rangeElements.forEach((range) => {
          const rangeLinks = range.querySelectorAll('a')
          rangeLinks.forEach((a) => {
            if (a.href && a.textContent && a.textContent.trim()) {
              links.push({
                href: a.href,
                text: a.textContent.trim()
              })
            }
          })
        })
      }
      // de-duplicate and return
      const uniqueLinks = [...new Map(links.map(l => [l.href, l])).values()]
      return uniqueLinks.map(l => l.href)
    })()
    `
  );
}
async function q(e, t, r) {
  try {
    return await e.loadURL(r, { httpReferrer: t.base_url }), await e.webContents.executeJavaScript(
      `
      (() => {
        const pickText = (sel) => {
          const el = document.querySelector(sel)
          return el ? (el.textContent || '').trim() : ''
        }
        const pickContentHtml = (sel, excludes = []) => {
          const el = document.querySelector(sel)
          if (!el) return ''
          const clone = el.cloneNode(true)
          excludes.forEach((ex) => clone.querySelectorAll(ex).forEach((n) => n.remove()))

          // convert relative image URLs to absolute ones
          const images = clone.querySelectorAll('img')
          images.forEach((img) => {
            if (img.src) {
              try {
                // if src is relative, resolve it against the page URL
                const imgUrl = new URL(img.src, location.href)
                img.src = imgUrl.href
              } catch {
                // if URL parsing fails, fall back to getAttribute('src') and join manually
                const srcAttr = img.getAttribute('src')
                if (srcAttr && !srcAttr.startsWith('http://') && !srcAttr.startsWith('https://') && !srcAttr.startsWith('data:')) {
                  try {
                    const baseUrl = new URL(location.href)
                    img.src = new URL(srcAttr, baseUrl.origin + baseUrl.pathname.substring(0, baseUrl.pathname.lastIndexOf('/') + 1)).href
                  } catch {
                    // if that also fails, leave the src as-is
                  }
                }
              }
            }
          })

          return clone.innerHTML || ''
        }
        return {
          title: pickText(${JSON.stringify(t.title_selector)}),
          contentHtml: pickContentHtml(${JSON.stringify(t.content_selector)}, ${JSON.stringify(
            t.exclude_selectors || []
          )}),
          timeText: pickText(${JSON.stringify(t.time_selector)}),
          url: location.href
        }
      })()
      `
    );
  } catch (s) {
    return console.warn("[crawler] failed to extract page content", r, s), null;
  }
}
async function J(e) {
  const t = D();
  try {
    await t.loadURL(e.base_url, { httpReferrer: e.base_url });
    const r = await F(t, e);
    console.log(`[crawler] found ${r.length} links from ${e.remark || e.base_url}`);
    const s = [];
    for (const a of r) {
      const o = await q(t, e, a);
      if (!o || !o.title || !o.contentHtml) {
        console.log(`[crawler] skip empty result for ${a}`);
        continue;
      }
      const l = {
        url: o.url || a,
        title: o.title,
        content_html: o.contentHtml,
        content_markdown: L(o.contentHtml),
        published_at: v(o.timeText)
      };
      s.push(l);
      try {
        await H(l);
      } catch (c) {
        console.warn("[crawler] push single news item failed", c);
      }
    }
    return console.log(`[crawler] processed ${s.length} items from ${e.remark || e.base_url}`), {
      success: !0,
      data: s
    };
  } catch (r) {
    return console.warn("[crawler] rule failed", e.remark || e.base_url, r), {
      success: !1,
      error: r instanceof Error ? r.message : String(r)
    };
  }
}
function W() {
  if ($) return;
  $ = !0;
  const e = d.interval_ms, t = async () => {
    const r = await P();
    console.log(`[crawler] scheduled run, rules=${r.length}`), d.running = !0, d.running_source = void 0;
    try {
      for (const s of r)
        await J(s);
    } finally {
      d.running = !1, d.running_source = void 0, d.next_run_at = new Date(Date.now() + e).toISOString();
    }
  };
  d.next_run_at = new Date(Date.now() + 5e3).toISOString(), setTimeout(t, 5e3), setInterval(t, e);
}
export {
  z as initCrawler
};
package/dist/newsCrawler.d.ts
ADDED
@@ -0,0 +1,53 @@
export type CrawlerStatus = {
    running: boolean;
    running_source?: string;
    next_run_at?: string;
    interval_ms: number;
};
export type NewsRule = {
    remark: string;
    base_url: string;
    home_range_selector: string;
    title_selector: string;
    content_selector: string;
    time_selector: string;
    exclude_selectors?: string[];
    enabled?: boolean;
};
export type CrawlerConfig = {
    rules?: NewsRule[];
    rulesApiUrl?: string;
    pushApiUrl?: string;
    ruleTransformer?: (data: any) => any;
    newsItemFieldMap?: Partial<Record<keyof NewsItem, string | '-'>>;
};
export type NewsItem = {
    url: string;
    title: string;
    content_html: string;
    content_markdown: string;
    published_at: string;
};
export type CrawlResult = {
    success: boolean;
    data?: NewsItem[];
    error?: string;
};
/**
 * Get the crawler rules.
 * Prefers the rules array from the config, then the locally stored rules,
 * and finally the rules fetched from the API.
 * @returns the array of crawler rules
 */
export declare function getNewsRules(): Promise<NewsRule[]>;
/**
 * Initialize the crawler config.
 * @param config crawler config
 */
export declare function initCrawler(config: CrawlerConfig): void;
export declare function getCrawlerStatus(): CrawlerStatus;
/**
 * Crawl news for a single rule.
 * @param rule crawler rule
 * @returns the crawl result
 */
export declare function crawlRule(rule: NewsRule): Promise<CrawlResult>;
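Per these declarations, a crawl run resolves to `{ success: true, data }` or `{ success: false, error }`. A sketch of consuming such a result (hypothetical; note that the package index re-exports only `initCrawler`):

```javascript
// Handle a Promise<CrawlResult> per the declared shape.
async function logResult(resultPromise) {
  const result = await resultPromise
  if (result.success) {
    console.log(`crawled ${result.data?.length ?? 0} items`)
  } else {
    console.error(`crawl failed: ${result.error}`)
  }
}
```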
package/package.json
ADDED
@@ -0,0 +1,58 @@
{
  "name": "@howuse/electron-crawler",
  "version": "0.1.0",
  "description": "An Electron-based crawler toolkit for scraping news and stock details",
  "keywords": [
    "electron",
    "crawler",
    "news",
    "spider"
  ],
  "author": "无名氏",
  "license": "MIT",
  "repository": {
    "type": "git",
    "url": "https://github.com/your-username/your-repo.git"
  },
  "bugs": {
    "url": "https://github.com/your-username/your-repo/issues"
  },
  "homepage": "https://github.com/your-username/your-repo#readme",
  "main": "dist/index.cjs",
  "module": "dist/index.js",
  "types": "dist/index.d.ts",
  "files": [
    "dist/**/*",
    "README.md",
    "LICENSE"
  ],
  "exports": {
    ".": {
      "types": "./dist/index.d.ts",
      "import": "./dist/index.js",
      "require": "./dist/index.cjs"
    }
  },
  "scripts": {
    "build": "vite build",
    "dev": "vite build --watch",
    "prepublishOnly": "npm run build"
  },
  "dependencies": {
    "electron": "^39.2.6",
    "electron-store": "^9.0.0",
    "turndown": "^7.2.2"
  },
  "devDependencies": {
    "@types/node": "^22.19.1",
    "typescript": "^5.9.3",
    "vite": "^5.0.0",
    "vite-plugin-dts": "^4.0.0"
  },
  "peerDependencies": {
    "electron": "^39.2.6"
  },
  "engines": {
    "node": ">=18.0.0"
  }
}