@howuse/electron-crawler 0.3.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -0
- package/dist/index.js +11 -11
- package/dist/index.mjs +135 -118
- package/dist/newsCrawler.d.ts +14 -0
- package/package.json +1 -1
package/README.md
CHANGED
@@ -63,6 +63,36 @@ initCrawler({
 })
 ```
 
+### Using the push filter
+
+```javascript
+import { initCrawler } from '@howuse/electron-crawler'
+
+initCrawler({
+  rules: [/* ... */],
+  pushApiUrl: 'https://api.example.com/news/push',
+  // only push news whose title contains certain keywords
+  isPush: (item) => {
+    if (item.title.includes('重要') || item.title.includes('紧急')) {
+      return true // allow the push
+    }
+    return false // skip the push
+  }
+})
+```
+
+### Development mode
+
+```javascript
+import { initCrawler } from '@howuse/electron-crawler'
+
+initCrawler({
+  rules: [/* ... */],
+  pushApiUrl: 'https://api.example.com/news/push',
+  devMode: true // development mode: skip the local cache and fetch the latest rules from the API on every run
+})
+```
+
 ### Configuration options
 
 | Option | Type | Description |
@@ -71,6 +101,8 @@ initCrawler({
 | `rulesApiUrl` | `string` | URL of the rules API |
 | `pushApiUrl` | `string` | URL of the result push API (required to enable pushing) |
 | `ruleTransformer` | `(data: any) => any` | Rule transform function |
+| `devMode` | `boolean` | Whether to run in development mode. Development mode: the local cache is skipped; in-memory rules take priority, then rules are fetched straight from the API. Production mode: a local cache with a 5-hour expiry is used |
+| `isPush` | `(item: NewsItem) => boolean \| null \| undefined` | Pre-push predicate. Pushing is allowed when it returns `true`, `undefined`, or `null`; returning `false` skips the push. Useful for filtering out news items that should not be pushed |
 | `newsItemFieldMap` | `Partial<Record<keyof NewsItem, string \| '-'>>` | Field mapping for pushed payloads; a value of '-' drops that field |
 
 ### NewsRule structure
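The `newsItemFieldMap` row above is terse; as an illustration (not taken from the package README — the renamed keys are hypothetical), the following config shows both the '-' drop marker and a rename, matching the mapping behavior of the bundled field-map helper visible in the dist diffs below:

```javascript
import { initCrawler } from '@howuse/electron-crawler'

initCrawler({
  rules: [/* ... */],
  pushApiUrl: 'https://api.example.com/news/push',
  newsItemFieldMap: {
    content_html: '-',          // '-' drops the raw HTML from the pushed payload
    content_markdown: 'body',   // hypothetical rename: pushed as "body"
    published_at: 'publishedAt' // hypothetical rename to camelCase
  }
})
```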
package/dist/index.js
CHANGED
@@ -1,16 +1,16 @@
-"use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const
-`:""});
-`});
+"use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const A=require("electron-store"),x=require("electron"),O=require("turndown");function P(){const t=x.BrowserWindow.getAllWindows().find(r=>r.title==="crawler-hidden-window");return t||new x.BrowserWindow({show:!1,webPreferences:{sandbox:!1},title:"crawler-hidden-window"})}const I=new O({headingStyle:"atx",codeBlockStyle:"fenced",bulletListMarker:"-",emDelimiter:"*",strongDelimiter:"**",linkStyle:"inlined",linkReferenceStyle:"full",preformattedCode:!1,blankReplacement:(t,e)=>e.nodeName==="BR"?`
+`:""});I.addRule("preserveLineBreaks",{filter:["br"],replacement:()=>`
+`});I.addRule("images",{filter:"img",replacement:(t,e)=>{const r=e.alt||"",s=e.src||e.getAttribute("src")||"",o=e.title||"";return o?``:``}});function b(t){return t.replace(/<script[\s\S]*?<\/script>/gi,"").replace(/<style[\s\S]*?<\/style>/gi,"").replace(/<noscript[\s\S]*?<\/noscript>/gi,"").replace(/<iframe[\s\S]*?<\/iframe>/gi,"").replace(/on\w+="[^"]*"/gi,"").replace(/on\w+='[^']*'/gi,"").trim()}function v(t){if(!t||!t.trim())return"";try{const e=b(t);if(!e)return"";let r=I.turndown(e);return r=r.replace(/\n{3,}/g,`
 
 `),r=r.split(`
 `).map(s=>s.trimEnd()).join(`
-`),r.trim()}catch(
+`),r.trim()}catch(e){return console.error("[normalizeMarkdown] 转换失败:",e),b(t).replace(/<[^>]+>/g,"").replace(/\n{3,}/g,`
 
-`).trim()}}function
+`).trim()}}function D(t){const e=new Date,r=e.getFullYear(),s=e.getMonth()+1,o=e.getDate(),a=e.getHours(),i=e.getMinutes();if(!t||!t.trim())return e.toISOString();const c=t.trim(),g=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/,S=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,R=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/,U=/(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,T=/(\d{1,2})[月\-/](\d{1,2})[日]?/,M=/(\d{1,2})[:时](\d{1,2})[分]?/;let h=r,u=s,f=o,d=a,p=i,n=c.match(g);if(n)h=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),p=parseInt(n[5],10);else if(n=c.match(S),n)h=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),p=parseInt(n[5],10);else if(n=c.match(R),n)h=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10);else if(n=c.match(U),n)u=parseInt(n[1],10),f=parseInt(n[2],10),d=parseInt(n[3],10),p=parseInt(n[4],10);else if(n=c.match(T),n)u=parseInt(n[1],10),f=parseInt(n[2],10);else if(n=c.match(M),n)d=parseInt(n[1],10),p=parseInt(n[2],10);else{const k=new Date(c);return Number.isNaN(k.getTime())?e.toISOString():k.toISOString()}(u<1||u>12)&&(u=s),(f<1||f>31)&&(f=o),(d<0||d>23)&&(d=a),(p<0||p>59)&&(p=i);const y=new Date(h,u-1,f,d,p,0,0);return y.getFullYear()!==h||y.getMonth()!==u-1||y.getDate()!==f?e.toISOString():L(y)}function L(t){const e=S=>S.toString().padStart(2,"0"),r=t.getFullYear(),s=e(t.getMonth()+1),o=e(t.getDate()),a=e(t.getHours()),i=e(t.getMinutes()),c=e(t.getSeconds()),g=t.getMilliseconds().toString().padStart(3,"0");return`${r}-${s}-${o}T${a}:${i}:${c}.${g}`}const C=A.default||A,w=new C;async function E(){if(l.rules&&l.rules.length>0)return l.rules;if(l.devMode)return await _();const e=w.get("news.rules");let r=[];if(Array.isArray(e))r=e;else if(e&&Array.isArray(e.rules)){const o=e.updatedAt?new Date(e.updatedAt).getTime():0,a=Date.now();o>0&&a-o<=18e6?r=e.rules:w.delete("news.rules")}if(r.length>0)return r;const s=await _();return s.length>0?(l.devMode||w.set("news.rules",{rules:s,updatedAt:new Date().toISOString()}),s):[]}const m={running:!1,interval_ms:1800*1e3};let l={rules:[],rulesApiUrl:void 0,pushApiUrl:void 0,devMode:!1,ruleTransformer:t=>t,isPush:void 0,newsItemFieldMap:void 0},$=!1;function H(t){l={...l,...t,ruleTransformer:t.ruleTransformer||(e=>e&&typeof e=="object"&&"data"in e?e.data:e)},j()}async function _(){if(!l.rulesApiUrl)return[];try{const t=await fetch(l.rulesApiUrl);if(!t.ok)throw new Error(`Failed to fetch rules from API: ${t.status} ${t.statusText}`);const e=await t.json(),r=l.ruleTransformer(e);return Array.isArray(r)?r:(console.warn("[crawler] Rules API returned non-array data, using empty array instead"),[])}catch(t){return console.error("[crawler] Failed to fetch rules from API:",t),[]}}async function N(t){if(!l.pushApiUrl)return;const e=w.get("news.pushedUrls")||[];if(e.includes(t.url)){console.log(`[crawler] URL already pushed, skipping: ${t.url}`);return}try{const r=F(t),s=await fetch(l.pushApiUrl,{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify(r)});if(!s.ok)throw new Error(`Failed to push results to API: ${s.status} ${s.statusText}`);const o=[...e,t.url];w.set("news.pushedUrls",o),console.log("[crawler] Results pushed to API successfully")}catch(r){console.error("[crawler] Failed to push results to API:",r)}}function F(t){const e=l.newsItemFieldMap;if(!e||Object.keys(e).length===0)return t;const r={},s=Object.entries(t);for(const[o,a]of s){const i=e[o];if(i==="-")continue;const c=typeof i=="string"?i:o;r[c]=a}return r}async function q(t,e){return await t.webContents.executeJavaScript(`
 (() => {
   const links = []
   // find all links within the configured scope
-  const rangeElements = document.querySelectorAll(${JSON.stringify(
+  const rangeElements = document.querySelectorAll(${JSON.stringify(e.home_range_selector)})
   if (rangeElements.length === 0) {
     // if no scope elements are found, search the whole document
     const allLinks = document.querySelectorAll('a')
@@ -39,7 +39,7 @@
   const uniqueLinks = [...new Map(links.map(l => [l.href, l])).values()]
   return uniqueLinks.map(l => l.href)
 })()
-`)}async function W(e,
+`)}async function W(t,e,r){try{return await t.loadURL(r,{httpReferrer:e.base_url}),await t.webContents.executeJavaScript(`
 (() => {
   const pickText = (sel) => {
     const el = document.querySelector(sel)
@@ -77,10 +77,10 @@
     return clone.innerHTML || ''
   }
   return {
-    title: pickText(${JSON.stringify(
-    contentHtml: pickContentHtml(${JSON.stringify(
-    timeText: pickText(${JSON.stringify(
+    title: pickText(${JSON.stringify(e.title_selector)}),
+    contentHtml: pickContentHtml(${JSON.stringify(e.content_selector)}, ${JSON.stringify(e.exclude_selectors||[])}),
+    timeText: pickText(${JSON.stringify(e.time_selector)}),
     url: location.href
   }
 })()
-`)}catch(s){return console.warn("[crawler] failed to extract page content",r,s),null}}async function J(
+`)}catch(s){return console.warn("[crawler] failed to extract page content",r,s),null}}async function J(t){const e=P();try{await e.loadURL(t.base_url,{httpReferrer:t.base_url});const r=await q(e,t);console.log(`[crawler] found ${r.length} links from ${t.remark||t.base_url}`);const s=[];for(const o of r){const a=await W(e,t,o);if(!a||!a.title||!a.contentHtml){console.log(`[crawler] skip empty result for ${o}`);continue}const i={url:a.url||o,title:a.title,content_html:a.contentHtml,content_markdown:v(a.contentHtml),published_at:D(a.timeText)};if(s.push(i),(l.isPush?l.isPush(i):!0)===!1){console.log(`[crawler] skip push due to isPush filter: ${i.url}`);continue}try{await N(i)}catch(g){console.warn("[crawler] push single news item failed",g)}}return console.log(`[crawler] processed ${s.length} items from ${t.remark||t.base_url}`),{success:!0,data:s}}catch(r){return console.warn("[crawler] rule failed",t.remark||t.base_url,r),{success:!1,error:r instanceof Error?r.message:String(r)}}}function j(){if($)return;$=!0;const t=m.interval_ms,e=async()=>{const r=await E();console.log(`[crawler] scheduled run, rules=${r.length}`),m.running=!0,m.running_source=void 0;try{for(const s of r)await J(s)}finally{m.running=!1,m.running_source=void 0,m.next_run_at=new Date(Date.now()+t).toISOString()}};m.next_run_at=new Date(Date.now()+5e3).toISOString(),setTimeout(e,5e3),setInterval(e,t)}exports.initCrawler=H;
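The new caching behavior is hard to read in minified form. Below is an illustrative de-minification of the rule-loading function `E()` above; the descriptive names (`getRules`, `config`, `store`, `fetchRulesFromApi`) are ours, not the package's:

```javascript
// Sketch of E(): in-memory rules first, then (outside devMode) a 5-hour store cache, then the API.
async function getRules() {
  if (config.rules && config.rules.length > 0) return config.rules // in-memory rules win
  if (config.devMode) return fetchRulesFromApi()                   // dev mode bypasses the cache
  const cached = store.get('news.rules')                           // electron-store entry
  if (cached && Array.isArray(cached.rules)) {
    const updatedAt = cached.updatedAt ? new Date(cached.updatedAt).getTime() : 0
    if (updatedAt > 0 && Date.now() - updatedAt <= 18e6) return cached.rules // 18e6 ms = 5 hours
    store.delete('news.rules')                                     // expired cache is dropped
  }
  const fresh = await fetchRulesFromApi()
  if (fresh.length > 0 && !config.devMode) {
    store.set('news.rules', { rules: fresh, updatedAt: new Date().toISOString() })
  }
  return fresh
}
```

The 30-minute crawl interval (`interval_ms: 1800 * 1e3`) is unchanged; only rule fetching gained the cache and the `devMode` bypass.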
package/dist/index.mjs
CHANGED
@@ -1,9 +1,9 @@
-import
-import { BrowserWindow as
-import
-function
-  const
-  return
+import A from "electron-store";
+import { BrowserWindow as x } from "electron";
+import O from "turndown";
+function P() {
+  const t = x.getAllWindows().find((r) => r.title === "crawler-hidden-window");
+  return t || new x({
     show: !1,
     webPreferences: {
       sandbox: !1
@@ -11,7 +11,7 @@ function O() {
     title: "crawler-hidden-window"
   });
 }
-const
+const I = new O({
   headingStyle: "atx",
   // use #-style headings
   codeBlockStyle: "fenced",
@@ -28,125 +28,139 @@ const k = new L({
   // full link reference format
   preformattedCode: !1,
   // don't use preformatted code
-  blankReplacement: (
+  blankReplacement: (t, e) => e.nodeName === "BR" ? `
 ` : ""
 });
-
+I.addRule("preserveLineBreaks", {
   filter: ["br"],
   replacement: () => `
 `
 });
-
+I.addRule("images", {
   filter: "img",
-  replacement: (
-    const r =
+  replacement: (t, e) => {
+    const r = e.alt || "", s = e.src || e.getAttribute("src") || "", o = e.title || "";
     return o ? `` : ``;
   }
 });
-function $(
-  return
+function $(t) {
+  return t.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<noscript[\s\S]*?<\/noscript>/gi, "").replace(/<iframe[\s\S]*?<\/iframe>/gi, "").replace(/on\w+="[^"]*"/gi, "").replace(/on\w+='[^']*'/gi, "").trim();
 }
-function
-  if (!
+function v(t) {
+  if (!t || !t.trim())
     return "";
   try {
-    const
-    if (!
+    const e = $(t);
+    if (!e)
       return "";
-    let r =
+    let r = I.turndown(e);
     return r = r.replace(/\n{3,}/g, `
 
 `), r = r.split(`
 `).map((s) => s.trimEnd()).join(`
 `), r.trim();
-  } catch (
-    return console.error("[normalizeMarkdown] 转换失败:",
+  } catch (e) {
+    return console.error("[normalizeMarkdown] 转换失败:", e), $(t).replace(/<[^>]+>/g, "").replace(/\n{3,}/g, `
 
 `).trim();
   }
 }
-function
-  const
-  if (!
-    return
-  const
-  let
+function D(t) {
+  const e = /* @__PURE__ */ new Date(), r = e.getFullYear(), s = e.getMonth() + 1, o = e.getDate(), a = e.getHours(), i = e.getMinutes();
+  if (!t || !t.trim())
+    return e.toISOString();
+  const c = t.trim(), g = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/, S = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, R = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/, U = /(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, T = /(\d{1,2})[月\-/](\d{1,2})[日]?/, M = /(\d{1,2})[:时](\d{1,2})[分]?/;
+  let h = r, u = s, f = o, d = a, p = i, n = c.match(g);
   if (n)
-
-  else if (n =
-
-  else if (n =
-
-  else if (n =
-    u = parseInt(n[1], 10), f = parseInt(n[2], 10),
-  else if (n =
+    h = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), d = parseInt(n[4], 10), p = parseInt(n[5], 10);
+  else if (n = c.match(S), n)
+    h = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), d = parseInt(n[4], 10), p = parseInt(n[5], 10);
+  else if (n = c.match(R), n)
+    h = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10);
+  else if (n = c.match(U), n)
+    u = parseInt(n[1], 10), f = parseInt(n[2], 10), d = parseInt(n[3], 10), p = parseInt(n[4], 10);
+  else if (n = c.match(T), n)
     u = parseInt(n[1], 10), f = parseInt(n[2], 10);
-  else if (n =
-
+  else if (n = c.match(M), n)
+    d = parseInt(n[1], 10), p = parseInt(n[2], 10);
   else {
-    const
-    return Number.isNaN(
+    const k = new Date(c);
+    return Number.isNaN(k.getTime()) ? e.toISOString() : k.toISOString();
   }
-  (u < 1 || u > 12) && (u = s), (f < 1 || f > 31) && (f = o), (
-  const
-  return
+  (u < 1 || u > 12) && (u = s), (f < 1 || f > 31) && (f = o), (d < 0 || d > 23) && (d = a), (p < 0 || p > 59) && (p = i);
+  const y = new Date(h, u - 1, f, d, p, 0, 0);
+  return y.getFullYear() !== h || y.getMonth() !== u - 1 || y.getDate() !== f ? e.toISOString() : L(y);
 }
-function
-  const
-  return `${r}-${s}-${o}T${a}:${
+function L(t) {
+  const e = (S) => S.toString().padStart(2, "0"), r = t.getFullYear(), s = e(t.getMonth() + 1), o = e(t.getDate()), a = e(t.getHours()), i = e(t.getMinutes()), c = e(t.getSeconds()), g = t.getMilliseconds().toString().padStart(3, "0");
+  return `${r}-${s}-${o}T${a}:${i}:${c}.${g}`;
 }
-const
-async function
-  if (
-    return
-
-
-
-
-
+const C = A.default || A, w = new C();
+async function E() {
+  if (l.rules && l.rules.length > 0)
+    return l.rules;
+  if (l.devMode)
+    return await b();
+  const e = w.get("news.rules");
+  let r = [];
+  if (Array.isArray(e))
+    r = e;
+  else if (e && Array.isArray(e.rules)) {
+    const o = e.updatedAt ? new Date(e.updatedAt).getTime() : 0, a = Date.now();
+    o > 0 && a - o <= 18e6 ? r = e.rules : w.delete("news.rules");
+  }
+  if (r.length > 0)
+    return r;
+  const s = await b();
+  return s.length > 0 ? (l.devMode || w.set("news.rules", {
+    rules: s,
+    updatedAt: (/* @__PURE__ */ new Date()).toISOString()
+  }), s) : [];
 }
-const
+const m = {
   running: !1,
   interval_ms: 1800 * 1e3
 };
-let
+let l = {
   rules: [],
   rulesApiUrl: void 0,
   pushApiUrl: void 0,
-
+  devMode: !1,
+  ruleTransformer: (t) => t,
+  isPush: void 0,
   newsItemFieldMap: void 0
-},
-function z(
-
-  ...
-  ...
+}, _ = !1;
+function z(t) {
+  l = {
+    ...l,
+    ...t,
     // ensure ruleTransformer always exists
-    ruleTransformer:
+    ruleTransformer: t.ruleTransformer || ((e) => e && typeof e == "object" && "data" in e ? e.data : e)
   }, W();
 }
-async function
-  if (!
+async function b() {
+  if (!l.rulesApiUrl)
     return [];
   try {
-    const
-    if (!
-      throw new Error(`Failed to fetch rules from API: ${
-    const
+    const t = await fetch(l.rulesApiUrl);
+    if (!t.ok)
+      throw new Error(`Failed to fetch rules from API: ${t.status} ${t.statusText}`);
+    const e = await t.json(), r = l.ruleTransformer(e);
     return Array.isArray(r) ? r : (console.warn("[crawler] Rules API returned non-array data, using empty array instead"), []);
-  } catch (
-    return console.error("[crawler] Failed to fetch rules from API:",
+  } catch (t) {
+    return console.error("[crawler] Failed to fetch rules from API:", t), [];
   }
 }
-async function H(
-  if (!
+async function H(t) {
+  if (!l.pushApiUrl)
     return;
-  const
-  if (
-    console.log(`[crawler] URL already pushed, skipping: ${
+  const e = w.get("news.pushedUrls") || [];
+  if (e.includes(t.url)) {
+    console.log(`[crawler] URL already pushed, skipping: ${t.url}`);
     return;
   }
   try {
-    const r = N(
+    const r = N(t), s = await fetch(l.pushApiUrl, {
       method: "POST",
       headers: {
        "Content-Type": "application/json"
@@ -155,32 +169,32 @@ async function H(e) {
    });
    if (!s.ok)
      throw new Error(`Failed to push results to API: ${s.status} ${s.statusText}`);
-    const o = [...
+    const o = [...e, t.url];
    w.set("news.pushedUrls", o), console.log("[crawler] Results pushed to API successfully");
  } catch (r) {
    console.error("[crawler] Failed to push results to API:", r);
  }
}
-function N(
-  const
-  if (!
-    return
-  const r = {}, s = Object.entries(
+function N(t) {
+  const e = l.newsItemFieldMap;
+  if (!e || Object.keys(e).length === 0)
+    return t;
+  const r = {}, s = Object.entries(t);
  for (const [o, a] of s) {
-    const
-    if (
-    const
-    r[
+    const i = e[o];
+    if (i === "-") continue;
+    const c = typeof i == "string" ? i : o;
+    r[c] = a;
  }
  return r;
}
-async function F(
-  return await
+async function F(t, e) {
+  return await t.webContents.executeJavaScript(
    `
 (() => {
   const links = []
   // find all links within the configured scope
-  const rangeElements = document.querySelectorAll(${JSON.stringify(
+  const rangeElements = document.querySelectorAll(${JSON.stringify(e.home_range_selector)})
   if (rangeElements.length === 0) {
     // if no scope elements are found, search the whole document
     const allLinks = document.querySelectorAll('a')
@@ -212,9 +226,9 @@ async function F(e, t) {
 `
   );
 }
-async function q(
+async function q(t, e, r) {
   try {
-    return await
+    return await t.loadURL(r, { httpReferrer: e.base_url }), await t.webContents.executeJavaScript(
       `
 (() => {
   const pickText = (sel) => {
@@ -253,11 +267,11 @@ async function q(e, t, r) {
     return clone.innerHTML || ''
   }
   return {
-    title: pickText(${JSON.stringify(
-    contentHtml: pickContentHtml(${JSON.stringify(
-
+    title: pickText(${JSON.stringify(e.title_selector)}),
+    contentHtml: pickContentHtml(${JSON.stringify(e.content_selector)}, ${JSON.stringify(
+      e.exclude_selectors || []
     )}),
-    timeText: pickText(${JSON.stringify(
+    timeText: pickText(${JSON.stringify(e.time_selector)}),
     url: location.href
   }
 })()
@@ -267,58 +281,61 @@ async function q(e, t, r) {
     return console.warn("[crawler] failed to extract page content", r, s), null;
   }
 }
-async function J(
-  const
+async function J(t) {
+  const e = P();
   try {
-    await
-    const r = await F(
-    console.log(`[crawler] found ${r.length} links from ${
+    await e.loadURL(t.base_url, { httpReferrer: t.base_url });
+    const r = await F(e, t);
+    console.log(`[crawler] found ${r.length} links from ${t.remark || t.base_url}`);
    const s = [];
    for (const o of r) {
-      const a = await q(
+      const a = await q(e, t, o);
      if (!a || !a.title || !a.contentHtml) {
        console.log(`[crawler] skip empty result for ${o}`);
        continue;
      }
-      const
+      const i = {
        url: a.url || o,
        title: a.title,
        content_html: a.contentHtml,
-        content_markdown:
-        published_at:
+        content_markdown: v(a.contentHtml),
+        published_at: D(a.timeText)
      };
-      s.push(
+      if (s.push(i), (l.isPush ? l.isPush(i) : !0) === !1) {
+        console.log(`[crawler] skip push due to isPush filter: ${i.url}`);
+        continue;
+      }
      try {
-        await H(
-      } catch (
-        console.warn("[crawler] push single news item failed",
+        await H(i);
+      } catch (g) {
+        console.warn("[crawler] push single news item failed", g);
      }
    }
-    return console.log(`[crawler] processed ${s.length} items from ${
+    return console.log(`[crawler] processed ${s.length} items from ${t.remark || t.base_url}`), {
      success: !0,
      data: s
    };
  } catch (r) {
-    return console.warn("[crawler] rule failed",
+    return console.warn("[crawler] rule failed", t.remark || t.base_url, r), {
      success: !1,
      error: r instanceof Error ? r.message : String(r)
    };
  }
}
 function W() {
-  if (
-
-  const
-    const r = await
-    console.log(`[crawler] scheduled run, rules=${r.length}`),
+  if (_) return;
+  _ = !0;
+  const t = m.interval_ms, e = async () => {
+    const r = await E();
+    console.log(`[crawler] scheduled run, rules=${r.length}`), m.running = !0, m.running_source = void 0;
    try {
      for (const s of r)
        await J(s);
    } finally {
-
+      m.running = !1, m.running_source = void 0, m.next_run_at = new Date(Date.now() + t).toISOString();
    }
  };
-
+  m.next_run_at = new Date(Date.now() + 5e3).toISOString(), setTimeout(e, 5e3), setInterval(e, t);
 }
 export {
   z as initCrawler
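One behavioral detail visible in `J()` above: an item vetoed by `isPush` is still collected into the returned `data` array; only the network push is skipped. The compact gate `(l.isPush ? l.isPush(i) : !0) === !1` expands to roughly the following (illustrative names, not shipped code):

```javascript
// Only a strict `false` vetoes the push; `true`, `undefined` and `null` all allow it.
const verdict = config.isPush ? config.isPush(item) : true
if (verdict === false) {
  console.log(`[crawler] skip push due to isPush filter: ${item.url}`)
} else {
  await pushNewsItem(item) // corresponds to H(i) in this bundle
}
```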
package/dist/newsCrawler.d.ts
CHANGED
@@ -19,6 +19,20 @@ export type CrawlerConfig = {
     rulesApiUrl?: string;
     pushApiUrl?: string;
     ruleTransformer?: (data: any) => any;
+    /**
+     * Whether to run in development mode
+     * - Development mode: the local cache is skipped; in-memory rules take priority, then rules are fetched directly from the API
+     * - Production mode: the local cache is used (with a 5-hour expiry)
+     */
+    devMode?: boolean;
+    /**
+     * Pre-push predicate
+     * - Returns true, undefined, or null: the item may be pushed
+     * - Returns false: the item is not pushed
+     * @param item the news item
+     * @returns whether pushing is allowed
+     */
+    isPush?: (item: NewsItem) => boolean | null | undefined;
     newsItemFieldMap?: Partial<Record<keyof NewsItem, string | '-'>>;
 };
 export type NewsItem = {
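Taken together, the new typings admit a config that exercises every documented option. A hypothetical end-to-end setup (placeholder URLs; the envelope shape fed to `ruleTransformer` is assumed) might look like:

```javascript
import { initCrawler } from '@howuse/electron-crawler'

initCrawler({
  rulesApiUrl: 'https://api.example.com/news/rules',
  pushApiUrl: 'https://api.example.com/news/push',
  devMode: process.env.NODE_ENV !== 'production', // bypass the 5-hour rule cache while developing
  ruleTransformer: (res) => res.data,             // unwrap an assumed { data: [...] } envelope
  isPush: (item) => item.title.length > 0,        // veto items with an empty title
  newsItemFieldMap: { content_html: '-' }         // drop raw HTML from pushed payloads
})
```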