@howuse/electron-crawler 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +11 -11
- package/dist/index.mjs +125 -112
- package/dist/newsCrawler.d.ts +6 -0
- package/package.json +1 -1
package/dist/index.js
CHANGED
@@ -1,16 +1,16 @@
- "use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const
- `:""});
- `});
+
"use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const A=require("electron-store"),x=require("electron"),O=require("turndown");function v(){const t=x.BrowserWindow.getAllWindows().find(r=>r.title==="crawler-hidden-window");return t||new x.BrowserWindow({show:!1,webPreferences:{sandbox:!1},title:"crawler-hidden-window"})}const I=new O({headingStyle:"atx",codeBlockStyle:"fenced",bulletListMarker:"-",emDelimiter:"*",strongDelimiter:"**",linkStyle:"inlined",linkReferenceStyle:"full",preformattedCode:!1,blankReplacement:(t,e)=>e.nodeName==="BR"?`
+ `:""});I.addRule("preserveLineBreaks",{filter:["br"],replacement:()=>`
+
`});I.addRule("images",{filter:"img",replacement:(t,e)=>{const r=e.alt||"",s=e.src||e.getAttribute("src")||"",o=e.title||"";return o?``:``}});function b(t){return t.replace(/<script[\s\S]*?<\/script>/gi,"").replace(/<style[\s\S]*?<\/style>/gi,"").replace(/<noscript[\s\S]*?<\/noscript>/gi,"").replace(/<iframe[\s\S]*?<\/iframe>/gi,"").replace(/on\w+="[^"]*"/gi,"").replace(/on\w+='[^']*'/gi,"").trim()}function D(t){if(!t||!t.trim())return"";try{const e=b(t);if(!e)return"";let r=I.turndown(e);return r=r.replace(/\n{3,}/g,`

`),r=r.split(`
`).map(s=>s.trimEnd()).join(`
- `),r.trim()}catch(
+ `),r.trim()}catch(e){return console.error("[normalizeMarkdown] 转换失败:",e),b(t).replace(/<[^>]+>/g,"").replace(/\n{3,}/g,`

- `).trim()}}function
+
`).trim()}}function L(t){const e=new Date,r=e.getFullYear(),s=e.getMonth()+1,o=e.getDate(),a=e.getHours(),i=e.getMinutes();if(!t||!t.trim())return e.toISOString();const l=t.trim(),y=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/,S=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,R=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/,U=/(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,T=/(\d{1,2})[月\-/](\d{1,2})[日]?/,M=/(\d{1,2})[:时](\d{1,2})[分]?/;let g=r,u=s,f=o,d=a,p=i,n=l.match(y);if(n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),p=parseInt(n[5],10);else if(n=l.match(S),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),p=parseInt(n[5],10);else if(n=l.match(R),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10);else if(n=l.match(U),n)u=parseInt(n[1],10),f=parseInt(n[2],10),d=parseInt(n[3],10),p=parseInt(n[4],10);else if(n=l.match(T),n)u=parseInt(n[1],10),f=parseInt(n[2],10);else if(n=l.match(M),n)d=parseInt(n[1],10),p=parseInt(n[2],10);else{const k=new Date(l);return Number.isNaN(k.getTime())?e.toISOString():k.toISOString()}(u<1||u>12)&&(u=s),(f<1||f>31)&&(f=o),(d<0||d>23)&&(d=a),(p<0||p>59)&&(p=i);const h=new Date(g,u-1,f,d,p,0,0);return h.getFullYear()!==g||h.getMonth()!==u-1||h.getDate()!==f?e.toISOString():C(h)}function C(t){const e=S=>S.toString().padStart(2,"0"),r=t.getFullYear(),s=e(t.getMonth()+1),o=e(t.getDate()),a=e(t.getHours()),i=e(t.getMinutes()),l=e(t.getSeconds()),y=t.getMilliseconds().toString().padStart(3,"0");return`${r}-${s}-${o}T${a}:${i}:${l}.${y}`}const P=A.default||A,w=new P;async function E(){if(c.rules&&c.rules.length>0)return c.rules;if(c.devMode)return await $();const e=w.get("news.rules");let r=[];if(Array.isArray(e))r=e;else if(e&&Array.isArray(e.rules)){const o=e.updatedAt?new Date(e.updatedAt).getTime():0,a=Date.now();o>0&&a-o<=18e6?r=e.rules:w.delete("news.rules")}if(r.length>0)return r;const s=await $();return s.length>0?(c.devMode||w.set("news.rules",{rules:s,updatedAt:new Date().toISOString()}),s):[]}const m={running:!1,interval_ms:1800*1e3};let c={rules:[],rulesApiUrl:void 0,pushApiUrl:void 0,devMode:!1,ruleTransformer:t=>t,newsItemFieldMap:void 0},_=!1;function H(t){c={...c,...t,ruleTransformer:t.ruleTransformer||(e=>e&&typeof e=="object"&&"data"in e?e.data:e)},j()}async function $(){if(!c.rulesApiUrl)return[];try{const t=await fetch(c.rulesApiUrl);if(!t.ok)throw new Error(`Failed to fetch rules from API: ${t.status} ${t.statusText}`);const e=await t.json(),r=c.ruleTransformer(e);return Array.isArray(r)?r:(console.warn("[crawler] Rules API returned non-array data, using empty array instead"),[])}catch(t){return console.error("[crawler] Failed to fetch rules from API:",t),[]}}async function N(t){if(!c.pushApiUrl)return;const e=w.get("news.pushedUrls")||[];if(e.includes(t.url)){console.log(`[crawler] URL already pushed, skipping: ${t.url}`);return}try{const r=F(t),s=await fetch(c.pushApiUrl,{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify(r)});if(!s.ok)throw new Error(`Failed to push results to API: ${s.status} ${s.statusText}`);const o=[...e,t.url];w.set("news.pushedUrls",o),console.log("[crawler] Results pushed to API successfully")}catch(r){console.error("[crawler] Failed to push results to API:",r)}}function F(t){const e=c.newsItemFieldMap;if(!e||Object.keys(e).length===0)return t;const r={},s=Object.entries(t);for(const[o,a]of s){const i=e[o];if(i==="-")continue;const l=typeof 
i=="string"?i:o;r[l]=a}return r}async function q(t,e){return await t.webContents.executeJavaScript(`
(() => {
const links = []
// 在指定范围内查找所有链接
- const rangeElements = document.querySelectorAll(${JSON.stringify(
+ const rangeElements = document.querySelectorAll(${JSON.stringify(e.home_range_selector)})
if (rangeElements.length === 0) {
// 如果没有找到范围,则在整个文档中查找
const allLinks = document.querySelectorAll('a')
@@ -39,7 +39,7 @@
const uniqueLinks = [...new Map(links.map(l => [l.href, l])).values()]
return uniqueLinks.map(l => l.href)
})()
- `)}async function W(e,
+ `)}async function W(t,e,r){try{return await t.loadURL(r,{httpReferrer:e.base_url}),await t.webContents.executeJavaScript(`
(() => {
const pickText = (sel) => {
const el = document.querySelector(sel)
@@ -77,10 +77,10 @@
return clone.innerHTML || ''
}
return {
- title: pickText(${JSON.stringify(
- contentHtml: pickContentHtml(${JSON.stringify(
- timeText: pickText(${JSON.stringify(
+ title: pickText(${JSON.stringify(e.title_selector)}),
+ contentHtml: pickContentHtml(${JSON.stringify(e.content_selector)}, ${JSON.stringify(e.exclude_selectors||[])}),
+ timeText: pickText(${JSON.stringify(e.time_selector)}),
url: location.href
}
})()
- `)}catch(s){return console.warn("[crawler] failed to extract page content",r,s),null}}async function J(
+
`)}catch(s){return console.warn("[crawler] failed to extract page content",r,s),null}}async function J(t){const e=v();try{await e.loadURL(t.base_url,{httpReferrer:t.base_url});const r=await q(e,t);console.log(`[crawler] found ${r.length} links from ${t.remark||t.base_url}`);const s=[];for(const o of r){const a=await W(e,t,o);if(!a||!a.title||!a.contentHtml){console.log(`[crawler] skip empty result for ${o}`);continue}const i={url:a.url||o,title:a.title,content_html:a.contentHtml,content_markdown:D(a.contentHtml),published_at:L(a.timeText)};s.push(i);try{await N(i)}catch(l){console.warn("[crawler] push single news item failed",l)}}return console.log(`[crawler] processed ${s.length} items from ${t.remark||t.base_url}`),{success:!0,data:s}}catch(r){return console.warn("[crawler] rule failed",t.remark||t.base_url,r),{success:!1,error:r instanceof Error?r.message:String(r)}}}function j(){if(_)return;_=!0;const t=m.interval_ms,e=async()=>{const r=await E();console.log(`[crawler] scheduled run, rules=${r.length}`),m.running=!0,m.running_source=void 0;try{for(const s of r)await J(s)}finally{m.running=!1,m.running_source=void 0,m.next_run_at=new Date(Date.now()+t).toISOString()}};m.next_run_at=new Date(Date.now()+5e3).toISOString(),setTimeout(e,5e3),setInterval(e,t)}exports.initCrawler=H;
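For orientation while reading the minified bundle above: the extraction helpers now receive a shared hidden BrowserWindow (looked up by the title "crawler-hidden-window" and created only when missing) together with the rule being crawled, and every selector they reference comes from that rule object. Below is a minimal sketch of one rule, assuming only the field names that appear in the bundle (base_url, home_range_selector, title_selector, content_selector, exclude_selectors, time_selector, remark); all values are illustrative, not part of the package.

const exampleRule = {
  remark: "Example source",               // label used in the crawler's log messages
  base_url: "https://example.com/news",   // listing page to load; also sent as httpReferrer
  home_range_selector: ".news-list",      // containers scanned for <a> links on the listing page
  title_selector: "h1.title",             // article title on the detail page
  content_selector: ".article-body",      // article HTML, converted to Markdown via turndown
  exclude_selectors: [".ad", ".related"], // selectors removed from the captured content
  time_selector: ".publish-time"          // text passed to the date parser for published_at
};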
package/dist/index.mjs
CHANGED
@@ -1,9 +1,9 @@
- import
- import { BrowserWindow as
- import
- function
- const
- return
+ import A from "electron-store";
+ import { BrowserWindow as x } from "electron";
+ import O from "turndown";
+ function D() {
+ const t = x.getAllWindows().find((r) => r.title === "crawler-hidden-window");
+ return t || new x({
show: !1,
webPreferences: {
sandbox: !1
@@ -11,7 +11,7 @@ function O() {
title: "crawler-hidden-window"
});
}
- const
+ const I = new O({
headingStyle: "atx",
// 使用 # 格式的标题
codeBlockStyle: "fenced",
@@ -28,125 +28,138 @@ const k = new L({
// 完整的链接引用格式
preformattedCode: !1,
// 不使用预格式化代码
- blankReplacement: (
+ blankReplacement: (t, e) => e.nodeName === "BR" ? `
` : ""
});
-
+ I.addRule("preserveLineBreaks", {
filter: ["br"],
replacement: () => `
`
});
-
+ I.addRule("images", {
filter: "img",
- replacement: (
- const r =
+ replacement: (t, e) => {
+ const r = e.alt || "", s = e.src || e.getAttribute("src") || "", o = e.title || "";
return o ? `` : ``;
}
});
- function
- return
+ function _(t) {
+
return t.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<noscript[\s\S]*?<\/noscript>/gi, "").replace(/<iframe[\s\S]*?<\/iframe>/gi, "").replace(/on\w+="[^"]*"/gi, "").replace(/on\w+='[^']*'/gi, "").trim();
}
- function
- if (!
+ function v(t) {
+ if (!t || !t.trim())
return "";
try {
- const
- if (!
+ const e = _(t);
+ if (!e)
return "";
- let r =
+ let r = I.turndown(e);
return r = r.replace(/\n{3,}/g, `

`), r = r.split(`
`).map((s) => s.trimEnd()).join(`
`), r.trim();
- } catch (
- return console.error("[normalizeMarkdown] 转换失败:",
+ } catch (e) {
+
return console.error("[normalizeMarkdown] 转换失败:", e), _(t).replace(/<[^>]+>/g, "").replace(/\n{3,}/g, `

`).trim();
}
}
- function
- const
- if (!
- return
- const l =
- let g = r, u = s, f = o,
+ function L(t) {
+
const e = /* @__PURE__ */ new Date(), r = e.getFullYear(), s = e.getMonth() + 1, o = e.getDate(), a = e.getHours(), i = e.getMinutes();
+ if (!t || !t.trim())
+ return e.toISOString();
+
const l = t.trim(), y = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/, S = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, R = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/, U = /(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, T = /(\d{1,2})[月\-/](\d{1,2})[日]?/, M = /(\d{1,2})[:时](\d{1,2})[分]?/;
+ let g = r, u = s, f = o, d = a, m = i, n = l.match(y);
if (n)
- g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10),
+ g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), d = parseInt(n[4], 10), m = parseInt(n[5], 10);
else if (n = l.match(S), n)
- g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10),
- else if (n = l.match(
+ g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), d = parseInt(n[4], 10), m = parseInt(n[5], 10);
+ else if (n = l.match(R), n)
g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10);
- else if (n = l.match(
- u = parseInt(n[1], 10), f = parseInt(n[2], 10),
+ else if (n = l.match(U), n)
+ u = parseInt(n[1], 10), f = parseInt(n[2], 10), d = parseInt(n[3], 10), m = parseInt(n[4], 10);
else if (n = l.match(T), n)
u = parseInt(n[1], 10), f = parseInt(n[2], 10);
- else if (n = l.match(
-
+ else if (n = l.match(M), n)
+ d = parseInt(n[1], 10), m = parseInt(n[2], 10);
else {
- const
- return Number.isNaN(
+ const k = new Date(l);
+ return Number.isNaN(k.getTime()) ? e.toISOString() : k.toISOString();
}
- (u < 1 || u > 12) && (u = s), (f < 1 || f > 31) && (f = o), (
- const h = new Date(g, u - 1, f,
- return h.getFullYear() !== g || h.getMonth() !== u - 1 || h.getDate() !== f ?
+ (u < 1 || u > 12) && (u = s), (f < 1 || f > 31) && (f = o), (d < 0 || d > 23) && (d = a), (m < 0 || m > 59) && (m = i);
+ const h = new Date(g, u - 1, f, d, m, 0, 0);
+ return h.getFullYear() !== g || h.getMonth() !== u - 1 || h.getDate() !== f ? e.toISOString() : C(h);
}
- function C(
- const
- return `${r}-${s}-${o}T${a}:${
+ function C(t) {
+
const e = (S) => S.toString().padStart(2, "0"), r = t.getFullYear(), s = e(t.getMonth() + 1), o = e(t.getDate()), a = e(t.getHours()), i = e(t.getMinutes()), l = e(t.getSeconds()), y = t.getMilliseconds().toString().padStart(3, "0");
+ return `${r}-${s}-${o}T${a}:${i}:${l}.${y}`;
}
- const
- async function
- if (
- return
-
-
-
-
-
+ const E = A.default || A, w = new E();
+ async function H() {
+ if (c.rules && c.rules.length > 0)
+ return c.rules;
+ if (c.devMode)
+ return await b();
+ const e = w.get("news.rules");
+ let r = [];
+ if (Array.isArray(e))
+ r = e;
+ else if (e && Array.isArray(e.rules)) {
+ const o = e.updatedAt ? new Date(e.updatedAt).getTime() : 0, a = Date.now();
+ o > 0 && a - o <= 18e6 ? r = e.rules : w.delete("news.rules");
+ }
+ if (r.length > 0)
+ return r;
+ const s = await b();
+ return s.length > 0 ? (c.devMode || w.set("news.rules", {
+ rules: s,
+ updatedAt: (/* @__PURE__ */ new Date()).toISOString()
+ }), s) : [];
}
- const
+ const p = {
running: !1,
interval_ms: 1800 * 1e3
};
- let
+ let c = {
rules: [],
rulesApiUrl: void 0,
pushApiUrl: void 0,
-
+ devMode: !1,
+ ruleTransformer: (t) => t,
newsItemFieldMap: void 0
- },
- function z(
-
- ...
- ...
+ }, $ = !1;
+ function z(t) {
+ c = {
+ ...c,
+ ...t,
// 确保ruleTransformer始终存在
- ruleTransformer:
+ ruleTransformer: t.ruleTransformer || ((e) => e && typeof e == "object" && "data" in e ? e.data : e)
}, W();
}
- async function
- if (!
+ async function b() {
+ if (!c.rulesApiUrl)
return [];
try {
- const
- if (!
- throw new Error(`Failed to fetch rules from API: ${
- const
+ const t = await fetch(c.rulesApiUrl);
+ if (!t.ok)
+ throw new Error(`Failed to fetch rules from API: ${t.status} ${t.statusText}`);
+ const e = await t.json(), r = c.ruleTransformer(e);
return Array.isArray(r) ? r : (console.warn("[crawler] Rules API returned non-array data, using empty array instead"), []);
- } catch (
- return console.error("[crawler] Failed to fetch rules from API:",
+ } catch (t) {
+ return console.error("[crawler] Failed to fetch rules from API:", t), [];
}
}
- async function
- if (!
+ async function P(t) {
+ if (!c.pushApiUrl)
return;
- const
- if (
- console.log(`[crawler] URL already pushed, skipping: ${
+ const e = w.get("news.pushedUrls") || [];
+ if (e.includes(t.url)) {
+ console.log(`[crawler] URL already pushed, skipping: ${t.url}`);
return;
}
try {
- const r = N(
+ const r = N(t), s = await fetch(c.pushApiUrl, {
method: "POST",
headers: {
"Content-Type": "application/json"
@@ -155,32 +168,32 @@ async function H(e) {
});
if (!s.ok)
throw new Error(`Failed to push results to API: ${s.status} ${s.statusText}`);
- const o = [...
+ const o = [...e, t.url];
w.set("news.pushedUrls", o), console.log("[crawler] Results pushed to API successfully");
} catch (r) {
console.error("[crawler] Failed to push results to API:", r);
}
}
- function N(
- const
- if (!
- return
- const r = {}, s = Object.entries(
+ function N(t) {
+ const e = c.newsItemFieldMap;
+ if (!e || Object.keys(e).length === 0)
+ return t;
+ const r = {}, s = Object.entries(t);
for (const [o, a] of s) {
- const
- if (
- const l = typeof
+ const i = e[o];
+ if (i === "-") continue;
+ const l = typeof i == "string" ? i : o;
r[l] = a;
}
return r;
}
- async function F(
- return await
+ async function F(t, e) {
+ return await t.webContents.executeJavaScript(
`
(() => {
const links = []
// 在指定范围内查找所有链接
- const rangeElements = document.querySelectorAll(${JSON.stringify(
+ const rangeElements = document.querySelectorAll(${JSON.stringify(e.home_range_selector)})
if (rangeElements.length === 0) {
// 如果没有找到范围,则在整个文档中查找
const allLinks = document.querySelectorAll('a')
@@ -212,9 +225,9 @@ async function F(e, t) {
`
);
}
- async function q(
+ async function q(t, e, r) {
try {
- return await
+ return await t.loadURL(r, { httpReferrer: e.base_url }), await t.webContents.executeJavaScript(
`
(() => {
const pickText = (sel) => {
@@ -253,11 +266,11 @@ async function q(e, t, r) {
return clone.innerHTML || ''
}
return {
- title: pickText(${JSON.stringify(
- contentHtml: pickContentHtml(${JSON.stringify(
-
+ title: pickText(${JSON.stringify(e.title_selector)}),
+ contentHtml: pickContentHtml(${JSON.stringify(e.content_selector)}, ${JSON.stringify(
+ e.exclude_selectors || []
)}),
- timeText: pickText(${JSON.stringify(
+ timeText: pickText(${JSON.stringify(e.time_selector)}),
url: location.href
}
})()
@@ -267,58 +280,58 @@ async function q(e, t, r) {
return console.warn("[crawler] failed to extract page content", r, s), null;
}
}
- async function J(
- const
+ async function J(t) {
+ const e = D();
try {
- await
- const r = await F(
- console.log(`[crawler] found ${r.length} links from ${
+ await e.loadURL(t.base_url, { httpReferrer: t.base_url });
+ const r = await F(e, t);
+ console.log(`[crawler] found ${r.length} links from ${t.remark || t.base_url}`);
const s = [];
for (const o of r) {
- const a = await q(
+ const a = await q(e, t, o);
if (!a || !a.title || !a.contentHtml) {
console.log(`[crawler] skip empty result for ${o}`);
continue;
}
- const
+ const i = {
url: a.url || o,
title: a.title,
content_html: a.contentHtml,
- content_markdown:
- published_at:
+ content_markdown: v(a.contentHtml),
+ published_at: L(a.timeText)
};
- s.push(
+ s.push(i);
try {
- await
+ await P(i);
} catch (l) {
console.warn("[crawler] push single news item failed", l);
}
}
- return console.log(`[crawler] processed ${s.length} items from ${
+ return console.log(`[crawler] processed ${s.length} items from ${t.remark || t.base_url}`), {
success: !0,
data: s
};
} catch (r) {
- return console.warn("[crawler] rule failed",
+ return console.warn("[crawler] rule failed", t.remark || t.base_url, r), {
success: !1,
error: r instanceof Error ? r.message : String(r)
};
}
}
function W() {
- if (
-
- const
- const r = await
- console.log(`[crawler] scheduled run, rules=${r.length}`),
+ if ($) return;
+ $ = !0;
+ const t = p.interval_ms, e = async () => {
+ const r = await H();
+ console.log(`[crawler] scheduled run, rules=${r.length}`), p.running = !0, p.running_source = void 0;
try {
for (const s of r)
await J(s);
} finally {
-
+ p.running = !1, p.running_source = void 0, p.next_run_at = new Date(Date.now() + t).toISOString();
}
};
-
+ p.next_run_at = new Date(Date.now() + 5e3).toISOString(), setTimeout(e, 5e3), setInterval(e, t);
}
export {
z as initCrawler
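Two configuration details are easier to read in this pretty-printed build than in index.js: when no ruleTransformer is supplied, the initializer installs a default that unwraps a { data: [...] } envelope returned by the rules API, and N() applies newsItemFieldMap to each item before it is POSTed, where a string value renames a field and "-" drops it. A hedged usage sketch follows; the endpoint URLs and target field names are hypothetical, not taken from the package.

import { initCrawler } from "@howuse/electron-crawler";

initCrawler({
  rulesApiUrl: "https://api.example.com/crawler/rules", // hypothetical; an array or a { data: [...] } envelope both work
  pushApiUrl: "https://api.example.com/crawler/news",   // hypothetical; each news item is POSTed here once per URL
  newsItemFieldMap: {
    content_html: "-",           // "-" drops the raw HTML from the pushed payload
    content_markdown: "content"  // renames content_markdown to "content" in the pushed payload
  }
});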
package/dist/newsCrawler.d.ts
CHANGED
@@ -19,6 +19,12 @@ export type CrawlerConfig = {
rulesApiUrl?: string;
pushApiUrl?: string;
ruleTransformer?: (data: any) => any;
+ /**
+ * 是否处于开发模式
+ * - 开发模式:不使用本地缓存,每次都优先使用内存 rules,其次直接从 API 拉取
+ * - 生产模式:会使用本地缓存(带 5 小时过期时间)
+ */
+ devMode?: boolean;
newsItemFieldMap?: Partial<Record<keyof NewsItem, string | '-'>>;
};
export type NewsItem = {
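The devMode flag documented above is the main public API addition in 0.4.0. Roughly translated, the new doc comment says: in development mode the local cache is not used and rules are taken from the in-memory config first, otherwise fetched directly from the API; in production mode rules are cached locally with a five-hour expiry, which matches the 18e6 ms check in both bundles. A minimal usage sketch, again with a hypothetical endpoint:

import { initCrawler } from "@howuse/electron-crawler";

initCrawler({
  rulesApiUrl: "https://api.example.com/crawler/rules", // hypothetical endpoint
  devMode: process.env.NODE_ENV !== "production"        // skip the cached rules while developing
});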