@howuse/electron-crawler 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +32 -0
- package/dist/index.js +4 -4
- package/dist/index.mjs +61 -57
- package/dist/newsCrawler.d.ts +8 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -63,6 +63,36 @@ initCrawler({
|
|
|
63
63
|
})
|
|
64
64
|
```
|
|
65
65
|
|
|
66
|
+
### 使用推送过滤函数
|
|
67
|
+
|
|
68
|
+
```javascript
|
|
69
|
+
import { initCrawler } from '@howuse/electron-crawler'
|
|
70
|
+
|
|
71
|
+
initCrawler({
|
|
72
|
+
rules: [/* ... */],
|
|
73
|
+
pushApiUrl: 'https://api.example.com/news/push',
|
|
74
|
+
// 只推送包含特定关键词的新闻
|
|
75
|
+
isPush: (item) => {
|
|
76
|
+
if (item.title.includes('重要') || item.title.includes('紧急')) {
|
|
77
|
+
return true // 允许推送
|
|
78
|
+
}
|
|
79
|
+
return false // 不推送
|
|
80
|
+
}
|
|
81
|
+
})
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
### 开发模式
|
|
85
|
+
|
|
86
|
+
```javascript
|
|
87
|
+
import { initCrawler } from '@howuse/electron-crawler'
|
|
88
|
+
|
|
89
|
+
initCrawler({
|
|
90
|
+
rules: [/* ... */],
|
|
91
|
+
pushApiUrl: 'https://api.example.com/news/push',
|
|
92
|
+
devMode: true // 开发模式:不使用本地缓存;优先使用传入的 rules,未传入 rules 时每次都从 rulesApiUrl 拉取最新规则
|
|
93
|
+
})
|
|
94
|
+
```
|
|
95
|
+
|
|
66
96
|
### 配置选项
|
|
67
97
|
|
|
68
98
|
| 选项 | 类型 | 描述 |
|
|
@@ -71,6 +101,8 @@ initCrawler({
|
|
|
71
101
|
| `rulesApiUrl` | `string` | 规则API接口URL |
|
|
72
102
|
| `pushApiUrl` | `string` | 结果推送API接口URL(必填以启用推送) |
|
|
73
103
|
| `ruleTransformer` | `(data: any) => any` | 规则转换函数 |
|
|
104
|
+
| `devMode` | `boolean` | 是否处于开发模式。开发模式:不使用本地缓存,每次都优先使用内存 rules,其次直接从 API 拉取;生产模式:会使用本地缓存(带 5 小时过期时间) |
|
|
105
|
+
| `isPush` | `(item: NewsItem) => boolean \| null \| undefined` | 推送前判断函数。返回 `true`、`undefined` 或 `null` 时允许推送,返回 `false` 时不推送。可用于过滤不需要推送的新闻项 |
|
|
74
106
|
| `newsItemFieldMap` | `Partial<Record<keyof NewsItem, string \| '-'>>` | 推送字段映射,值为 '-' 表示忽略该字段 |
|
|
75
107
|
|
|
76
108
|
### NewsRule 结构
|
package/dist/index.js
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
"use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const A=require("electron-store"),x=require("electron"),O=require("turndown");function
|
|
1
|
+
"use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const A=require("electron-store"),x=require("electron"),O=require("turndown");function P(){const t=x.BrowserWindow.getAllWindows().find(r=>r.title==="crawler-hidden-window");return t||new x.BrowserWindow({show:!1,webPreferences:{sandbox:!1},title:"crawler-hidden-window"})}const I=new O({headingStyle:"atx",codeBlockStyle:"fenced",bulletListMarker:"-",emDelimiter:"*",strongDelimiter:"**",linkStyle:"inlined",linkReferenceStyle:"full",preformattedCode:!1,blankReplacement:(t,e)=>e.nodeName==="BR"?`
|
|
2
2
|
`:""});I.addRule("preserveLineBreaks",{filter:["br"],replacement:()=>`
|
|
3
|
-
`});I.addRule("images",{filter:"img",replacement:(t,e)=>{const r=e.alt||"",s=e.src||e.getAttribute("src")||"",o=e.title||"";return o?``:``}});function b(t){return t.replace(/<script[\s\S]*?<\/script>/gi,"").replace(/<style[\s\S]*?<\/style>/gi,"").replace(/<noscript[\s\S]*?<\/noscript>/gi,"").replace(/<iframe[\s\S]*?<\/iframe>/gi,"").replace(/on\w+="[^"]*"/gi,"").replace(/on\w+='[^']*'/gi,"").trim()}function
|
|
3
|
+
`});I.addRule("images",{filter:"img",replacement:(t,e)=>{const r=e.alt||"",s=e.src||e.getAttribute("src")||"",o=e.title||"";return o?``:``}});function b(t){return t.replace(/<script[\s\S]*?<\/script>/gi,"").replace(/<style[\s\S]*?<\/style>/gi,"").replace(/<noscript[\s\S]*?<\/noscript>/gi,"").replace(/<iframe[\s\S]*?<\/iframe>/gi,"").replace(/on\w+="[^"]*"/gi,"").replace(/on\w+='[^']*'/gi,"").trim()}function v(t){if(!t||!t.trim())return"";try{const e=b(t);if(!e)return"";let r=I.turndown(e);return r=r.replace(/\n{3,}/g,`
|
|
4
4
|
|
|
5
5
|
`),r=r.split(`
|
|
6
6
|
`).map(s=>s.trimEnd()).join(`
|
|
7
7
|
`),r.trim()}catch(e){return console.error("[normalizeMarkdown] 转换失败:",e),b(t).replace(/<[^>]+>/g,"").replace(/\n{3,}/g,`
|
|
8
8
|
|
|
9
|
-
`).trim()}}function
|
|
9
|
+
`).trim()}}function D(t){const e=new Date,r=e.getFullYear(),s=e.getMonth()+1,o=e.getDate(),a=e.getHours(),i=e.getMinutes();if(!t||!t.trim())return e.toISOString();const c=t.trim(),g=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/,S=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,R=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/,U=/(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,T=/(\d{1,2})[月\-/](\d{1,2})[日]?/,M=/(\d{1,2})[:时](\d{1,2})[分]?/;let h=r,u=s,f=o,d=a,p=i,n=c.match(g);if(n)h=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),p=parseInt(n[5],10);else if(n=c.match(S),n)h=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),p=parseInt(n[5],10);else if(n=c.match(R),n)h=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10);else if(n=c.match(U),n)u=parseInt(n[1],10),f=parseInt(n[2],10),d=parseInt(n[3],10),p=parseInt(n[4],10);else if(n=c.match(T),n)u=parseInt(n[1],10),f=parseInt(n[2],10);else if(n=c.match(M),n)d=parseInt(n[1],10),p=parseInt(n[2],10);else{const k=new Date(c);return Number.isNaN(k.getTime())?e.toISOString():k.toISOString()}(u<1||u>12)&&(u=s),(f<1||f>31)&&(f=o),(d<0||d>23)&&(d=a),(p<0||p>59)&&(p=i);const y=new Date(h,u-1,f,d,p,0,0);return y.getFullYear()!==h||y.getMonth()!==u-1||y.getDate()!==f?e.toISOString():L(y)}function L(t){const e=S=>S.toString().padStart(2,"0"),r=t.getFullYear(),s=e(t.getMonth()+1),o=e(t.getDate()),a=e(t.getHours()),i=e(t.getMinutes()),c=e(t.getSeconds()),g=t.getMilliseconds().toString().padStart(3,"0");return`${r}-${s}-${o}T${a}:${i}:${c}.${g}`}const C=A.default||A,w=new C;async function E(){if(l.rules&&l.rules.length>0)return l.rules;if(l.devMode)return await _();const e=w.get("news.rules");let r=[];if(Array.isArray(e))r=e;else if(e&&Array.isArray(e.rules)){const o=e.updatedAt?new Date(e.updatedAt).getTime():0,a=Date.now();o>0&&a-o<=18e6?r=e.rules:w.delete("news.rules")}if(r.length>0)return r;const s=await 
_();return s.length>0?(l.devMode||w.set("news.rules",{rules:s,updatedAt:new Date().toISOString()}),s):[]}const m={running:!1,interval_ms:1800*1e3};let l={rules:[],rulesApiUrl:void 0,pushApiUrl:void 0,devMode:!1,ruleTransformer:t=>t,isPush:void 0,newsItemFieldMap:void 0},$=!1;function H(t){l={...l,...t,ruleTransformer:t.ruleTransformer||(e=>e&&typeof e=="object"&&"data"in e?e.data:e)},j()}async function _(){if(!l.rulesApiUrl)return[];try{const t=await fetch(l.rulesApiUrl);if(!t.ok)throw new Error(`Failed to fetch rules from API: ${t.status} ${t.statusText}`);const e=await t.json(),r=l.ruleTransformer(e);return Array.isArray(r)?r:(console.warn("[crawler] Rules API returned non-array data, using empty array instead"),[])}catch(t){return console.error("[crawler] Failed to fetch rules from API:",t),[]}}async function N(t){if(!l.pushApiUrl)return;const e=w.get("news.pushedUrls")||[];if(e.includes(t.url)){console.log(`[crawler] URL already pushed, skipping: ${t.url}`);return}try{const r=F(t),s=await fetch(l.pushApiUrl,{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify(r)});if(!s.ok)throw new Error(`Failed to push results to API: ${s.status} ${s.statusText}`);const o=[...e,t.url];w.set("news.pushedUrls",o),console.log("[crawler] Results pushed to API successfully")}catch(r){console.error("[crawler] Failed to push results to API:",r)}}function F(t){const e=l.newsItemFieldMap;if(!e||Object.keys(e).length===0)return t;const r={},s=Object.entries(t);for(const[o,a]of s){const i=e[o];if(i==="-")continue;const c=typeof i=="string"?i:o;r[c]=a}return r}async function q(t,e){return await t.webContents.executeJavaScript(`
|
|
10
10
|
(() => {
|
|
11
11
|
const links = []
|
|
12
12
|
// 在指定范围内查找所有链接
|
|
@@ -83,4 +83,4 @@
|
|
|
83
83
|
url: location.href
|
|
84
84
|
}
|
|
85
85
|
})()
|
|
86
|
-
`)}catch(s){return console.warn("[crawler] failed to extract page content",r,s),null}}async function J(t){const e=
|
|
86
|
+
`)}catch(s){return console.warn("[crawler] failed to extract page content",r,s),null}}async function J(t){const e=P();try{await e.loadURL(t.base_url,{httpReferrer:t.base_url});const r=await q(e,t);console.log(`[crawler] found ${r.length} links from ${t.remark||t.base_url}`);const s=[];for(const o of r){const a=await W(e,t,o);if(!a||!a.title||!a.contentHtml){console.log(`[crawler] skip empty result for ${o}`);continue}const i={url:a.url||o,title:a.title,content_html:a.contentHtml,content_markdown:v(a.contentHtml),published_at:D(a.timeText)};if(s.push(i),(l.isPush?l.isPush(i):!0)===!1){console.log(`[crawler] skip push due to isPush filter: ${i.url}`);continue}try{await N(i)}catch(g){console.warn("[crawler] push single news item failed",g)}}return console.log(`[crawler] processed ${s.length} items from ${t.remark||t.base_url}`),{success:!0,data:s}}catch(r){return console.warn("[crawler] rule failed",t.remark||t.base_url,r),{success:!1,error:r instanceof Error?r.message:String(r)}}}function j(){if($)return;$=!0;const t=m.interval_ms,e=async()=>{const r=await E();console.log(`[crawler] scheduled run, rules=${r.length}`),m.running=!0,m.running_source=void 0;try{for(const s of r)await J(s)}finally{m.running=!1,m.running_source=void 0,m.next_run_at=new Date(Date.now()+t).toISOString()}};m.next_run_at=new Date(Date.now()+5e3).toISOString(),setTimeout(e,5e3),setInterval(e,t)}exports.initCrawler=H;
|
package/dist/index.mjs
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import A from "electron-store";
|
|
2
2
|
import { BrowserWindow as x } from "electron";
|
|
3
3
|
import O from "turndown";
|
|
4
|
-
function
|
|
4
|
+
function P() {
|
|
5
5
|
const t = x.getAllWindows().find((r) => r.title === "crawler-hidden-window");
|
|
6
6
|
return t || new x({
|
|
7
7
|
show: !1,
|
|
@@ -43,14 +43,14 @@ I.addRule("images", {
|
|
|
43
43
|
return o ? `` : ``;
|
|
44
44
|
}
|
|
45
45
|
});
|
|
46
|
-
function
|
|
46
|
+
function $(t) {
|
|
47
47
|
return t.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<noscript[\s\S]*?<\/noscript>/gi, "").replace(/<iframe[\s\S]*?<\/iframe>/gi, "").replace(/on\w+="[^"]*"/gi, "").replace(/on\w+='[^']*'/gi, "").trim();
|
|
48
48
|
}
|
|
49
49
|
function v(t) {
|
|
50
50
|
if (!t || !t.trim())
|
|
51
51
|
return "";
|
|
52
52
|
try {
|
|
53
|
-
const e =
|
|
53
|
+
const e = $(t);
|
|
54
54
|
if (!e)
|
|
55
55
|
return "";
|
|
56
56
|
let r = I.turndown(e);
|
|
@@ -60,46 +60,46 @@ function v(t) {
|
|
|
60
60
|
`).map((s) => s.trimEnd()).join(`
|
|
61
61
|
`), r.trim();
|
|
62
62
|
} catch (e) {
|
|
63
|
-
return console.error("[normalizeMarkdown] 转换失败:", e),
|
|
63
|
+
return console.error("[normalizeMarkdown] 转换失败:", e), $(t).replace(/<[^>]+>/g, "").replace(/\n{3,}/g, `
|
|
64
64
|
|
|
65
65
|
`).trim();
|
|
66
66
|
}
|
|
67
67
|
}
|
|
68
|
-
function
|
|
68
|
+
function D(t) {
|
|
69
69
|
const e = /* @__PURE__ */ new Date(), r = e.getFullYear(), s = e.getMonth() + 1, o = e.getDate(), a = e.getHours(), i = e.getMinutes();
|
|
70
70
|
if (!t || !t.trim())
|
|
71
71
|
return e.toISOString();
|
|
72
|
-
const
|
|
73
|
-
let
|
|
72
|
+
const c = t.trim(), g = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/, S = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, R = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/, U = /(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, T = /(\d{1,2})[月\-/](\d{1,2})[日]?/, M = /(\d{1,2})[:时](\d{1,2})[分]?/;
|
|
73
|
+
let h = r, u = s, f = o, d = a, p = i, n = c.match(g);
|
|
74
74
|
if (n)
|
|
75
|
-
|
|
76
|
-
else if (n =
|
|
77
|
-
|
|
78
|
-
else if (n =
|
|
79
|
-
|
|
80
|
-
else if (n =
|
|
81
|
-
u = parseInt(n[1], 10), f = parseInt(n[2], 10), d = parseInt(n[3], 10),
|
|
82
|
-
else if (n =
|
|
75
|
+
h = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), d = parseInt(n[4], 10), p = parseInt(n[5], 10);
|
|
76
|
+
else if (n = c.match(S), n)
|
|
77
|
+
h = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), d = parseInt(n[4], 10), p = parseInt(n[5], 10);
|
|
78
|
+
else if (n = c.match(R), n)
|
|
79
|
+
h = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10);
|
|
80
|
+
else if (n = c.match(U), n)
|
|
81
|
+
u = parseInt(n[1], 10), f = parseInt(n[2], 10), d = parseInt(n[3], 10), p = parseInt(n[4], 10);
|
|
82
|
+
else if (n = c.match(T), n)
|
|
83
83
|
u = parseInt(n[1], 10), f = parseInt(n[2], 10);
|
|
84
|
-
else if (n =
|
|
85
|
-
d = parseInt(n[1], 10),
|
|
84
|
+
else if (n = c.match(M), n)
|
|
85
|
+
d = parseInt(n[1], 10), p = parseInt(n[2], 10);
|
|
86
86
|
else {
|
|
87
|
-
const k = new Date(
|
|
87
|
+
const k = new Date(c);
|
|
88
88
|
return Number.isNaN(k.getTime()) ? e.toISOString() : k.toISOString();
|
|
89
89
|
}
|
|
90
|
-
(u < 1 || u > 12) && (u = s), (f < 1 || f > 31) && (f = o), (d < 0 || d > 23) && (d = a), (
|
|
91
|
-
const
|
|
92
|
-
return
|
|
90
|
+
(u < 1 || u > 12) && (u = s), (f < 1 || f > 31) && (f = o), (d < 0 || d > 23) && (d = a), (p < 0 || p > 59) && (p = i);
|
|
91
|
+
const y = new Date(h, u - 1, f, d, p, 0, 0);
|
|
92
|
+
return y.getFullYear() !== h || y.getMonth() !== u - 1 || y.getDate() !== f ? e.toISOString() : L(y);
|
|
93
93
|
}
|
|
94
|
-
function
|
|
95
|
-
const e = (S) => S.toString().padStart(2, "0"), r = t.getFullYear(), s = e(t.getMonth() + 1), o = e(t.getDate()), a = e(t.getHours()), i = e(t.getMinutes()),
|
|
96
|
-
return `${r}-${s}-${o}T${a}:${i}:${
|
|
94
|
+
function L(t) {
|
|
95
|
+
const e = (S) => S.toString().padStart(2, "0"), r = t.getFullYear(), s = e(t.getMonth() + 1), o = e(t.getDate()), a = e(t.getHours()), i = e(t.getMinutes()), c = e(t.getSeconds()), g = t.getMilliseconds().toString().padStart(3, "0");
|
|
96
|
+
return `${r}-${s}-${o}T${a}:${i}:${c}.${g}`;
|
|
97
97
|
}
|
|
98
|
-
const
|
|
99
|
-
async function
|
|
100
|
-
if (
|
|
101
|
-
return
|
|
102
|
-
if (
|
|
98
|
+
const C = A.default || A, w = new C();
|
|
99
|
+
async function E() {
|
|
100
|
+
if (l.rules && l.rules.length > 0)
|
|
101
|
+
return l.rules;
|
|
102
|
+
if (l.devMode)
|
|
103
103
|
return await b();
|
|
104
104
|
const e = w.get("news.rules");
|
|
105
105
|
let r = [];
|
|
@@ -112,46 +112,47 @@ async function H() {
|
|
|
112
112
|
if (r.length > 0)
|
|
113
113
|
return r;
|
|
114
114
|
const s = await b();
|
|
115
|
-
return s.length > 0 ? (
|
|
115
|
+
return s.length > 0 ? (l.devMode || w.set("news.rules", {
|
|
116
116
|
rules: s,
|
|
117
117
|
updatedAt: (/* @__PURE__ */ new Date()).toISOString()
|
|
118
118
|
}), s) : [];
|
|
119
119
|
}
|
|
120
|
-
const
|
|
120
|
+
const m = {
|
|
121
121
|
running: !1,
|
|
122
122
|
interval_ms: 1800 * 1e3
|
|
123
123
|
};
|
|
124
|
-
let
|
|
124
|
+
let l = {
|
|
125
125
|
rules: [],
|
|
126
126
|
rulesApiUrl: void 0,
|
|
127
127
|
pushApiUrl: void 0,
|
|
128
128
|
devMode: !1,
|
|
129
129
|
ruleTransformer: (t) => t,
|
|
130
|
+
isPush: void 0,
|
|
130
131
|
newsItemFieldMap: void 0
|
|
131
|
-
},
|
|
132
|
+
}, _ = !1;
|
|
132
133
|
function z(t) {
|
|
133
|
-
|
|
134
|
-
...
|
|
134
|
+
l = {
|
|
135
|
+
...l,
|
|
135
136
|
...t,
|
|
136
137
|
// 确保ruleTransformer始终存在
|
|
137
138
|
ruleTransformer: t.ruleTransformer || ((e) => e && typeof e == "object" && "data" in e ? e.data : e)
|
|
138
139
|
}, W();
|
|
139
140
|
}
|
|
140
141
|
async function b() {
|
|
141
|
-
if (!
|
|
142
|
+
if (!l.rulesApiUrl)
|
|
142
143
|
return [];
|
|
143
144
|
try {
|
|
144
|
-
const t = await fetch(
|
|
145
|
+
const t = await fetch(l.rulesApiUrl);
|
|
145
146
|
if (!t.ok)
|
|
146
147
|
throw new Error(`Failed to fetch rules from API: ${t.status} ${t.statusText}`);
|
|
147
|
-
const e = await t.json(), r =
|
|
148
|
+
const e = await t.json(), r = l.ruleTransformer(e);
|
|
148
149
|
return Array.isArray(r) ? r : (console.warn("[crawler] Rules API returned non-array data, using empty array instead"), []);
|
|
149
150
|
} catch (t) {
|
|
150
151
|
return console.error("[crawler] Failed to fetch rules from API:", t), [];
|
|
151
152
|
}
|
|
152
153
|
}
|
|
153
|
-
async function
|
|
154
|
-
if (!
|
|
154
|
+
async function H(t) {
|
|
155
|
+
if (!l.pushApiUrl)
|
|
155
156
|
return;
|
|
156
157
|
const e = w.get("news.pushedUrls") || [];
|
|
157
158
|
if (e.includes(t.url)) {
|
|
@@ -159,7 +160,7 @@ async function P(t) {
|
|
|
159
160
|
return;
|
|
160
161
|
}
|
|
161
162
|
try {
|
|
162
|
-
const r = N(t), s = await fetch(
|
|
163
|
+
const r = N(t), s = await fetch(l.pushApiUrl, {
|
|
163
164
|
method: "POST",
|
|
164
165
|
headers: {
|
|
165
166
|
"Content-Type": "application/json"
|
|
@@ -175,15 +176,15 @@ async function P(t) {
|
|
|
175
176
|
}
|
|
176
177
|
}
|
|
177
178
|
function N(t) {
|
|
178
|
-
const e =
|
|
179
|
+
const e = l.newsItemFieldMap;
|
|
179
180
|
if (!e || Object.keys(e).length === 0)
|
|
180
181
|
return t;
|
|
181
182
|
const r = {}, s = Object.entries(t);
|
|
182
183
|
for (const [o, a] of s) {
|
|
183
184
|
const i = e[o];
|
|
184
185
|
if (i === "-") continue;
|
|
185
|
-
const
|
|
186
|
-
r[
|
|
186
|
+
const c = typeof i == "string" ? i : o;
|
|
187
|
+
r[c] = a;
|
|
187
188
|
}
|
|
188
189
|
return r;
|
|
189
190
|
}
|
|
@@ -281,7 +282,7 @@ async function q(t, e, r) {
|
|
|
281
282
|
}
|
|
282
283
|
}
|
|
283
284
|
async function J(t) {
|
|
284
|
-
const e =
|
|
285
|
+
const e = P();
|
|
285
286
|
try {
|
|
286
287
|
await e.loadURL(t.base_url, { httpReferrer: t.base_url });
|
|
287
288
|
const r = await F(e, t);
|
|
@@ -298,13 +299,16 @@ async function J(t) {
|
|
|
298
299
|
title: a.title,
|
|
299
300
|
content_html: a.contentHtml,
|
|
300
301
|
content_markdown: v(a.contentHtml),
|
|
301
|
-
published_at:
|
|
302
|
+
published_at: D(a.timeText)
|
|
302
303
|
};
|
|
303
|
-
s.push(i)
|
|
304
|
+
if (s.push(i), (l.isPush ? l.isPush(i) : !0) === !1) {
|
|
305
|
+
console.log(`[crawler] skip push due to isPush filter: ${i.url}`);
|
|
306
|
+
continue;
|
|
307
|
+
}
|
|
304
308
|
try {
|
|
305
|
-
await
|
|
306
|
-
} catch (
|
|
307
|
-
console.warn("[crawler] push single news item failed",
|
|
309
|
+
await H(i);
|
|
310
|
+
} catch (g) {
|
|
311
|
+
console.warn("[crawler] push single news item failed", g);
|
|
308
312
|
}
|
|
309
313
|
}
|
|
310
314
|
return console.log(`[crawler] processed ${s.length} items from ${t.remark || t.base_url}`), {
|
|
@@ -319,19 +323,19 @@ async function J(t) {
|
|
|
319
323
|
}
|
|
320
324
|
}
|
|
321
325
|
function W() {
|
|
322
|
-
if (
|
|
323
|
-
|
|
324
|
-
const t =
|
|
325
|
-
const r = await
|
|
326
|
-
console.log(`[crawler] scheduled run, rules=${r.length}`),
|
|
326
|
+
if (_) return;
|
|
327
|
+
_ = !0;
|
|
328
|
+
const t = m.interval_ms, e = async () => {
|
|
329
|
+
const r = await E();
|
|
330
|
+
console.log(`[crawler] scheduled run, rules=${r.length}`), m.running = !0, m.running_source = void 0;
|
|
327
331
|
try {
|
|
328
332
|
for (const s of r)
|
|
329
333
|
await J(s);
|
|
330
334
|
} finally {
|
|
331
|
-
|
|
335
|
+
m.running = !1, m.running_source = void 0, m.next_run_at = new Date(Date.now() + t).toISOString();
|
|
332
336
|
}
|
|
333
337
|
};
|
|
334
|
-
|
|
338
|
+
m.next_run_at = new Date(Date.now() + 5e3).toISOString(), setTimeout(e, 5e3), setInterval(e, t);
|
|
335
339
|
}
|
|
336
340
|
export {
|
|
337
341
|
z as initCrawler
|
package/dist/newsCrawler.d.ts
CHANGED
|
@@ -25,6 +25,14 @@ export type CrawlerConfig = {
|
|
|
25
25
|
* - 生产模式:会使用本地缓存(带 5 小时过期时间)
|
|
26
26
|
*/
|
|
27
27
|
devMode?: boolean;
|
|
28
|
+
/**
|
|
29
|
+
* 推送前判断函数
|
|
30
|
+
* - 返回 true、undefined 或 null:可以推送
|
|
31
|
+
* - 返回 false:不推送
|
|
32
|
+
* @param item 新闻项
|
|
33
|
+
* @returns 是否允许推送
|
|
34
|
+
*/
|
|
35
|
+
isPush?: (item: NewsItem) => boolean | null | undefined;
|
|
28
36
|
newsItemFieldMap?: Partial<Record<keyof NewsItem, string | '-'>>;
|
|
29
37
|
};
|
|
30
38
|
export type NewsItem = {
|