@howuse/electron-crawler 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -63,6 +63,36 @@ initCrawler({
63
63
  })
64
64
  ```
65
65
 
66
+ ### 使用推送过滤函数
67
+
68
+ ```javascript
69
+ import { initCrawler } from '@howuse/electron-crawler'
70
+
71
+ initCrawler({
72
+ rules: [/* ... */],
73
+ pushApiUrl: 'https://api.example.com/news/push',
74
+ // 只推送包含特定关键词的新闻
75
+ isPush: (item) => {
76
+ if (item.title.includes('重要') || item.title.includes('紧急')) {
77
+ return true // 允许推送
78
+ }
79
+ return false // 不推送
80
+ }
81
+ })
82
+ ```
83
+
84
+ ### 开发模式
85
+
86
+ ```javascript
87
+ import { initCrawler } from '@howuse/electron-crawler'
88
+
89
+ initCrawler({
90
+ rules: [/* ... */],
91
+ pushApiUrl: 'https://api.example.com/news/push',
92
+ devMode: true // 开发模式:不使用本地缓存;优先使用传入的 rules,rules 为空时每次都从 rulesApiUrl 拉取最新规则
93
+ })
94
+ ```
95
+
66
96
  ### 配置选项
67
97
 
68
98
  | 选项 | 类型 | 描述 |
@@ -71,6 +101,8 @@ initCrawler({
71
101
  | `rulesApiUrl` | `string` | 规则API接口URL |
72
102
  | `pushApiUrl` | `string` | 结果推送API接口URL(必填以启用推送) |
73
103
  | `ruleTransformer` | `(data: any) => any` | 规则转换函数 |
104
+ | `devMode` | `boolean` | 是否处于开发模式。开发模式:不使用本地缓存,每次都优先使用内存 rules,其次直接从 API 拉取;生产模式:会使用本地缓存(带 5 小时过期时间) |
105
+ | `isPush` | `(item: NewsItem) => boolean \| null \| undefined` | 推送前判断函数。返回 `true`、`undefined` 或 `null` 时允许推送,返回 `false` 时不推送。可用于过滤不需要推送的新闻项 |
74
106
  | `newsItemFieldMap` | `Partial<Record<keyof NewsItem, string \| '-'>>` | 推送字段映射,值为 '-' 表示忽略该字段 |
75
107
 
76
108
  ### NewsRule 结构
package/dist/index.js CHANGED
@@ -1,16 +1,16 @@
1
- "use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const x=require("electron-store"),A=require("electron"),O=require("turndown");function L(){const e=A.BrowserWindow.getAllWindows().find(r=>r.title==="crawler-hidden-window");return e||new A.BrowserWindow({show:!1,webPreferences:{sandbox:!1},title:"crawler-hidden-window"})}const k=new O({headingStyle:"atx",codeBlockStyle:"fenced",bulletListMarker:"-",emDelimiter:"*",strongDelimiter:"**",linkStyle:"inlined",linkReferenceStyle:"full",preformattedCode:!1,blankReplacement:(e,t)=>t.nodeName==="BR"?`
2
- `:""});k.addRule("preserveLineBreaks",{filter:["br"],replacement:()=>`
3
- `});k.addRule("images",{filter:"img",replacement:(e,t)=>{const r=t.alt||"",s=t.src||t.getAttribute("src")||"",o=t.title||"";return o?`![${r}](${s} "${o}")`:`![${r}](${s})`}});function b(e){return e.replace(/<script[\s\S]*?<\/script>/gi,"").replace(/<style[\s\S]*?<\/style>/gi,"").replace(/<noscript[\s\S]*?<\/noscript>/gi,"").replace(/<iframe[\s\S]*?<\/iframe>/gi,"").replace(/on\w+="[^"]*"/gi,"").replace(/on\w+='[^']*'/gi,"").trim()}function v(e){if(!e||!e.trim())return"";try{const t=b(e);if(!t)return"";let r=k.turndown(t);return r=r.replace(/\n{3,}/g,`
1
+ "use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const A=require("electron-store"),x=require("electron"),O=require("turndown");function P(){const t=x.BrowserWindow.getAllWindows().find(r=>r.title==="crawler-hidden-window");return t||new x.BrowserWindow({show:!1,webPreferences:{sandbox:!1},title:"crawler-hidden-window"})}const I=new O({headingStyle:"atx",codeBlockStyle:"fenced",bulletListMarker:"-",emDelimiter:"*",strongDelimiter:"**",linkStyle:"inlined",linkReferenceStyle:"full",preformattedCode:!1,blankReplacement:(t,e)=>e.nodeName==="BR"?`
2
+ `:""});I.addRule("preserveLineBreaks",{filter:["br"],replacement:()=>`
3
+ `});I.addRule("images",{filter:"img",replacement:(t,e)=>{const r=e.alt||"",s=e.src||e.getAttribute("src")||"",o=e.title||"";return o?`![${r}](${s} "${o}")`:`![${r}](${s})`}});function b(t){return t.replace(/<script[\s\S]*?<\/script>/gi,"").replace(/<style[\s\S]*?<\/style>/gi,"").replace(/<noscript[\s\S]*?<\/noscript>/gi,"").replace(/<iframe[\s\S]*?<\/iframe>/gi,"").replace(/on\w+="[^"]*"/gi,"").replace(/on\w+='[^']*'/gi,"").trim()}function v(t){if(!t||!t.trim())return"";try{const e=b(t);if(!e)return"";let r=I.turndown(e);return r=r.replace(/\n{3,}/g,`
4
4
 
5
5
  `),r=r.split(`
6
6
  `).map(s=>s.trimEnd()).join(`
7
- `),r.trim()}catch(t){return console.error("[normalizeMarkdown] 转换失败:",t),b(e).replace(/<[^>]+>/g,"").replace(/\n{3,}/g,`
7
+ `),r.trim()}catch(e){return console.error("[normalizeMarkdown] 转换失败:",e),b(t).replace(/<[^>]+>/g,"").replace(/\n{3,}/g,`
8
8
 
9
- `).trim()}}function C(e){const t=new Date,r=t.getFullYear(),s=t.getMonth()+1,o=t.getDate(),a=t.getHours(),c=t.getMinutes();if(!e||!e.trim())return t.toISOString();const l=e.trim(),y=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/,S=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,T=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/,U=/(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,_=/(\d{1,2})[月\-/](\d{1,2})[日]?/,R=/(\d{1,2})[:时](\d{1,2})[分]?/;let g=r,u=s,f=o,d=a,m=c,n=l.match(y);if(n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),m=parseInt(n[5],10);else if(n=l.match(S),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),m=parseInt(n[5],10);else if(n=l.match(T),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10);else if(n=l.match(U),n)u=parseInt(n[1],10),f=parseInt(n[2],10),d=parseInt(n[3],10),m=parseInt(n[4],10);else if(n=l.match(_),n)u=parseInt(n[1],10),f=parseInt(n[2],10);else if(n=l.match(R),n)d=parseInt(n[1],10),m=parseInt(n[2],10);else{const I=new Date(l);return Number.isNaN(I.getTime())?t.toISOString():I.toISOString()}(u<1||u>12)&&(u=s),(f<1||f>31)&&(f=o),(d<0||d>23)&&(d=a),(m<0||m>59)&&(m=c);const h=new Date(g,u-1,f,d,m,0,0);return h.getFullYear()!==g||h.getMonth()!==u-1||h.getDate()!==f?t.toISOString():D(h)}function D(e){const t=S=>S.toString().padStart(2,"0"),r=e.getFullYear(),s=t(e.getMonth()+1),o=t(e.getDate()),a=t(e.getHours()),c=t(e.getMinutes()),l=t(e.getSeconds()),y=e.getMilliseconds().toString().padStart(3,"0");return`${r}-${s}-${o}T${a}:${c}:${l}.${y}`}const M=x.default||x,w=new M;async function P(){if(i.rules&&i.rules.length>0)return i.rules;const e=w.get("news.rules")||[];if(e.length>0)return e;const t=await H();return t.length>0?(w.set("news.rules",t),t):[]}const p={running:!1,interval_ms:1800*1e3};let i={rules:[],rulesApiUrl:void 0,pushApiUrl:void 0,ruleTransformer:e=>e,newsItemFieldMap:void 0},$=!1;function 
E(e){i={...i,...e,ruleTransformer:e.ruleTransformer||(t=>t&&typeof t=="object"&&"data"in t?t.data:t)},j()}async function H(){if(!i.rulesApiUrl)return[];try{const e=await fetch(i.rulesApiUrl);if(!e.ok)throw new Error(`Failed to fetch rules from API: ${e.status} ${e.statusText}`);const t=await e.json(),r=i.ruleTransformer(t);return Array.isArray(r)?r:(console.warn("[crawler] Rules API returned non-array data, using empty array instead"),[])}catch(e){return console.error("[crawler] Failed to fetch rules from API:",e),[]}}async function N(e){if(!i.pushApiUrl)return;const t=w.get("news.pushedUrls")||[];if(t.includes(e.url)){console.log(`[crawler] URL already pushed, skipping: ${e.url}`);return}try{const r=q(e),s=await fetch(i.pushApiUrl,{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify(r)});if(!s.ok)throw new Error(`Failed to push results to API: ${s.status} ${s.statusText}`);const o=[...t,e.url];w.set("news.pushedUrls",o),console.log("[crawler] Results pushed to API successfully")}catch(r){console.error("[crawler] Failed to push results to API:",r)}}function q(e){const t=i.newsItemFieldMap;if(!t||Object.keys(t).length===0)return e;const r={},s=Object.entries(e);for(const[o,a]of s){const c=t[o];if(c==="-")continue;const l=typeof c=="string"?c:o;r[l]=a}return r}async function F(e,t){return await e.webContents.executeJavaScript(`
9
+ `).trim()}}function D(t){const e=new Date,r=e.getFullYear(),s=e.getMonth()+1,o=e.getDate(),a=e.getHours(),i=e.getMinutes();if(!t||!t.trim())return e.toISOString();const c=t.trim(),g=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/,S=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,R=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/,U=/(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,T=/(\d{1,2})[月\-/](\d{1,2})[日]?/,M=/(\d{1,2})[:时](\d{1,2})[分]?/;let h=r,u=s,f=o,d=a,p=i,n=c.match(g);if(n)h=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),p=parseInt(n[5],10);else if(n=c.match(S),n)h=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),p=parseInt(n[5],10);else if(n=c.match(R),n)h=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10);else if(n=c.match(U),n)u=parseInt(n[1],10),f=parseInt(n[2],10),d=parseInt(n[3],10),p=parseInt(n[4],10);else if(n=c.match(T),n)u=parseInt(n[1],10),f=parseInt(n[2],10);else if(n=c.match(M),n)d=parseInt(n[1],10),p=parseInt(n[2],10);else{const k=new Date(c);return Number.isNaN(k.getTime())?e.toISOString():k.toISOString()}(u<1||u>12)&&(u=s),(f<1||f>31)&&(f=o),(d<0||d>23)&&(d=a),(p<0||p>59)&&(p=i);const y=new Date(h,u-1,f,d,p,0,0);return y.getFullYear()!==h||y.getMonth()!==u-1||y.getDate()!==f?e.toISOString():L(y)}function L(t){const e=S=>S.toString().padStart(2,"0"),r=t.getFullYear(),s=e(t.getMonth()+1),o=e(t.getDate()),a=e(t.getHours()),i=e(t.getMinutes()),c=e(t.getSeconds()),g=t.getMilliseconds().toString().padStart(3,"0");return`${r}-${s}-${o}T${a}:${i}:${c}.${g}`}const C=A.default||A,w=new C;async function E(){if(l.rules&&l.rules.length>0)return l.rules;if(l.devMode)return await _();const e=w.get("news.rules");let r=[];if(Array.isArray(e))r=e;else if(e&&Array.isArray(e.rules)){const o=e.updatedAt?new Date(e.updatedAt).getTime():0,a=Date.now();o>0&&a-o<=18e6?r=e.rules:w.delete("news.rules")}if(r.length>0)return r;const s=await 
_();return s.length>0?(l.devMode||w.set("news.rules",{rules:s,updatedAt:new Date().toISOString()}),s):[]}const m={running:!1,interval_ms:1800*1e3};let l={rules:[],rulesApiUrl:void 0,pushApiUrl:void 0,devMode:!1,ruleTransformer:t=>t,isPush:void 0,newsItemFieldMap:void 0},$=!1;function H(t){l={...l,...t,ruleTransformer:t.ruleTransformer||(e=>e&&typeof e=="object"&&"data"in e?e.data:e)},j()}async function _(){if(!l.rulesApiUrl)return[];try{const t=await fetch(l.rulesApiUrl);if(!t.ok)throw new Error(`Failed to fetch rules from API: ${t.status} ${t.statusText}`);const e=await t.json(),r=l.ruleTransformer(e);return Array.isArray(r)?r:(console.warn("[crawler] Rules API returned non-array data, using empty array instead"),[])}catch(t){return console.error("[crawler] Failed to fetch rules from API:",t),[]}}async function N(t){if(!l.pushApiUrl)return;const e=w.get("news.pushedUrls")||[];if(e.includes(t.url)){console.log(`[crawler] URL already pushed, skipping: ${t.url}`);return}try{const r=F(t),s=await fetch(l.pushApiUrl,{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify(r)});if(!s.ok)throw new Error(`Failed to push results to API: ${s.status} ${s.statusText}`);const o=[...e,t.url];w.set("news.pushedUrls",o),console.log("[crawler] Results pushed to API successfully")}catch(r){console.error("[crawler] Failed to push results to API:",r)}}function F(t){const e=l.newsItemFieldMap;if(!e||Object.keys(e).length===0)return t;const r={},s=Object.entries(t);for(const[o,a]of s){const i=e[o];if(i==="-")continue;const c=typeof i=="string"?i:o;r[c]=a}return r}async function q(t,e){return await t.webContents.executeJavaScript(`
10
10
  (() => {
11
11
  const links = []
12
12
  // 在指定范围内查找所有链接
13
- const rangeElements = document.querySelectorAll(${JSON.stringify(t.home_range_selector)})
13
+ const rangeElements = document.querySelectorAll(${JSON.stringify(e.home_range_selector)})
14
14
  if (rangeElements.length === 0) {
15
15
  // 如果没有找到范围,则在整个文档中查找
16
16
  const allLinks = document.querySelectorAll('a')
@@ -39,7 +39,7 @@
39
39
  const uniqueLinks = [...new Map(links.map(l => [l.href, l])).values()]
40
40
  return uniqueLinks.map(l => l.href)
41
41
  })()
42
- `)}async function W(e,t,r){try{return await e.loadURL(r,{httpReferrer:t.base_url}),await e.webContents.executeJavaScript(`
42
+ `)}async function W(t,e,r){try{return await t.loadURL(r,{httpReferrer:e.base_url}),await t.webContents.executeJavaScript(`
43
43
  (() => {
44
44
  const pickText = (sel) => {
45
45
  const el = document.querySelector(sel)
@@ -77,10 +77,10 @@
77
77
  return clone.innerHTML || ''
78
78
  }
79
79
  return {
80
- title: pickText(${JSON.stringify(t.title_selector)}),
81
- contentHtml: pickContentHtml(${JSON.stringify(t.content_selector)}, ${JSON.stringify(t.exclude_selectors||[])}),
82
- timeText: pickText(${JSON.stringify(t.time_selector)}),
80
+ title: pickText(${JSON.stringify(e.title_selector)}),
81
+ contentHtml: pickContentHtml(${JSON.stringify(e.content_selector)}, ${JSON.stringify(e.exclude_selectors||[])}),
82
+ timeText: pickText(${JSON.stringify(e.time_selector)}),
83
83
  url: location.href
84
84
  }
85
85
  })()
86
- `)}catch(s){return console.warn("[crawler] failed to extract page content",r,s),null}}async function J(e){const t=L();try{await t.loadURL(e.base_url,{httpReferrer:e.base_url});const r=await F(t,e);console.log(`[crawler] found ${r.length} links from ${e.remark||e.base_url}`);const s=[];for(const o of r){const a=await W(t,e,o);if(!a||!a.title||!a.contentHtml){console.log(`[crawler] skip empty result for ${o}`);continue}const c={url:a.url||o,title:a.title,content_html:a.contentHtml,content_markdown:v(a.contentHtml),published_at:C(a.timeText)};s.push(c);try{await N(c)}catch(l){console.warn("[crawler] push single news item failed",l)}}return console.log(`[crawler] processed ${s.length} items from ${e.remark||e.base_url}`),{success:!0,data:s}}catch(r){return console.warn("[crawler] rule failed",e.remark||e.base_url,r),{success:!1,error:r instanceof Error?r.message:String(r)}}}function j(){if($)return;$=!0;const e=p.interval_ms,t=async()=>{const r=await P();console.log(`[crawler] scheduled run, rules=${r.length}`),p.running=!0,p.running_source=void 0;try{for(const s of r)await J(s)}finally{p.running=!1,p.running_source=void 0,p.next_run_at=new Date(Date.now()+e).toISOString()}};p.next_run_at=new Date(Date.now()+5e3).toISOString(),setTimeout(t,5e3),setInterval(t,e)}exports.initCrawler=E;
86
+ `)}catch(s){return console.warn("[crawler] failed to extract page content",r,s),null}}async function J(t){const e=P();try{await e.loadURL(t.base_url,{httpReferrer:t.base_url});const r=await q(e,t);console.log(`[crawler] found ${r.length} links from ${t.remark||t.base_url}`);const s=[];for(const o of r){const a=await W(e,t,o);if(!a||!a.title||!a.contentHtml){console.log(`[crawler] skip empty result for ${o}`);continue}const i={url:a.url||o,title:a.title,content_html:a.contentHtml,content_markdown:v(a.contentHtml),published_at:D(a.timeText)};if(s.push(i),(l.isPush?l.isPush(i):!0)===!1){console.log(`[crawler] skip push due to isPush filter: ${i.url}`);continue}try{await N(i)}catch(g){console.warn("[crawler] push single news item failed",g)}}return console.log(`[crawler] processed ${s.length} items from ${t.remark||t.base_url}`),{success:!0,data:s}}catch(r){return console.warn("[crawler] rule failed",t.remark||t.base_url,r),{success:!1,error:r instanceof Error?r.message:String(r)}}}function j(){if($)return;$=!0;const t=m.interval_ms,e=async()=>{const r=await E();console.log(`[crawler] scheduled run, rules=${r.length}`),m.running=!0,m.running_source=void 0;try{for(const s of r)await J(s)}finally{m.running=!1,m.running_source=void 0,m.next_run_at=new Date(Date.now()+t).toISOString()}};m.next_run_at=new Date(Date.now()+5e3).toISOString(),setTimeout(e,5e3),setInterval(e,t)}exports.initCrawler=H;
package/dist/index.mjs CHANGED
@@ -1,9 +1,9 @@
1
- import x from "electron-store";
2
- import { BrowserWindow as A } from "electron";
3
- import L from "turndown";
4
- function O() {
5
- const e = A.getAllWindows().find((r) => r.title === "crawler-hidden-window");
6
- return e || new A({
1
+ import A from "electron-store";
2
+ import { BrowserWindow as x } from "electron";
3
+ import O from "turndown";
4
+ function P() {
5
+ const t = x.getAllWindows().find((r) => r.title === "crawler-hidden-window");
6
+ return t || new x({
7
7
  show: !1,
8
8
  webPreferences: {
9
9
  sandbox: !1
@@ -11,7 +11,7 @@ function O() {
11
11
  title: "crawler-hidden-window"
12
12
  });
13
13
  }
14
- const k = new L({
14
+ const I = new O({
15
15
  headingStyle: "atx",
16
16
  // 使用 # 格式的标题
17
17
  codeBlockStyle: "fenced",
@@ -28,125 +28,139 @@ const k = new L({
28
28
  // 完整的链接引用格式
29
29
  preformattedCode: !1,
30
30
  // 不使用预格式化代码
31
- blankReplacement: (e, t) => t.nodeName === "BR" ? `
31
+ blankReplacement: (t, e) => e.nodeName === "BR" ? `
32
32
  ` : ""
33
33
  });
34
- k.addRule("preserveLineBreaks", {
34
+ I.addRule("preserveLineBreaks", {
35
35
  filter: ["br"],
36
36
  replacement: () => `
37
37
  `
38
38
  });
39
- k.addRule("images", {
39
+ I.addRule("images", {
40
40
  filter: "img",
41
- replacement: (e, t) => {
42
- const r = t.alt || "", s = t.src || t.getAttribute("src") || "", o = t.title || "";
41
+ replacement: (t, e) => {
42
+ const r = e.alt || "", s = e.src || e.getAttribute("src") || "", o = e.title || "";
43
43
  return o ? `![${r}](${s} "${o}")` : `![${r}](${s})`;
44
44
  }
45
45
  });
46
- function $(e) {
47
- return e.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<noscript[\s\S]*?<\/noscript>/gi, "").replace(/<iframe[\s\S]*?<\/iframe>/gi, "").replace(/on\w+="[^"]*"/gi, "").replace(/on\w+='[^']*'/gi, "").trim();
46
+ function $(t) {
47
+ return t.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<noscript[\s\S]*?<\/noscript>/gi, "").replace(/<iframe[\s\S]*?<\/iframe>/gi, "").replace(/on\w+="[^"]*"/gi, "").replace(/on\w+='[^']*'/gi, "").trim();
48
48
  }
49
- function D(e) {
50
- if (!e || !e.trim())
49
+ function v(t) {
50
+ if (!t || !t.trim())
51
51
  return "";
52
52
  try {
53
- const t = $(e);
54
- if (!t)
53
+ const e = $(t);
54
+ if (!e)
55
55
  return "";
56
- let r = k.turndown(t);
56
+ let r = I.turndown(e);
57
57
  return r = r.replace(/\n{3,}/g, `
58
58
 
59
59
  `), r = r.split(`
60
60
  `).map((s) => s.trimEnd()).join(`
61
61
  `), r.trim();
62
- } catch (t) {
63
- return console.error("[normalizeMarkdown] 转换失败:", t), $(e).replace(/<[^>]+>/g, "").replace(/\n{3,}/g, `
62
+ } catch (e) {
63
+ return console.error("[normalizeMarkdown] 转换失败:", e), $(t).replace(/<[^>]+>/g, "").replace(/\n{3,}/g, `
64
64
 
65
65
  `).trim();
66
66
  }
67
67
  }
68
- function v(e) {
69
- const t = /* @__PURE__ */ new Date(), r = t.getFullYear(), s = t.getMonth() + 1, o = t.getDate(), a = t.getHours(), c = t.getMinutes();
70
- if (!e || !e.trim())
71
- return t.toISOString();
72
- const l = e.trim(), y = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/, S = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, U = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/, _ = /(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, T = /(\d{1,2})[月\-/](\d{1,2})[日]?/, R = /(\d{1,2})[:时](\d{1,2})[分]?/;
73
- let g = r, u = s, f = o, m = a, p = c, n = l.match(y);
68
+ function D(t) {
69
+ const e = /* @__PURE__ */ new Date(), r = e.getFullYear(), s = e.getMonth() + 1, o = e.getDate(), a = e.getHours(), i = e.getMinutes();
70
+ if (!t || !t.trim())
71
+ return e.toISOString();
72
+ const c = t.trim(), g = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/, S = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, R = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/, U = /(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, T = /(\d{1,2})[月\-/](\d{1,2})[日]?/, M = /(\d{1,2})[:时](\d{1,2})[分]?/;
73
+ let h = r, u = s, f = o, d = a, p = i, n = c.match(g);
74
74
  if (n)
75
- g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), m = parseInt(n[4], 10), p = parseInt(n[5], 10);
76
- else if (n = l.match(S), n)
77
- g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), m = parseInt(n[4], 10), p = parseInt(n[5], 10);
78
- else if (n = l.match(U), n)
79
- g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10);
80
- else if (n = l.match(_), n)
81
- u = parseInt(n[1], 10), f = parseInt(n[2], 10), m = parseInt(n[3], 10), p = parseInt(n[4], 10);
82
- else if (n = l.match(T), n)
75
+ h = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), d = parseInt(n[4], 10), p = parseInt(n[5], 10);
76
+ else if (n = c.match(S), n)
77
+ h = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), d = parseInt(n[4], 10), p = parseInt(n[5], 10);
78
+ else if (n = c.match(R), n)
79
+ h = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10);
80
+ else if (n = c.match(U), n)
81
+ u = parseInt(n[1], 10), f = parseInt(n[2], 10), d = parseInt(n[3], 10), p = parseInt(n[4], 10);
82
+ else if (n = c.match(T), n)
83
83
  u = parseInt(n[1], 10), f = parseInt(n[2], 10);
84
- else if (n = l.match(R), n)
85
- m = parseInt(n[1], 10), p = parseInt(n[2], 10);
84
+ else if (n = c.match(M), n)
85
+ d = parseInt(n[1], 10), p = parseInt(n[2], 10);
86
86
  else {
87
- const I = new Date(l);
88
- return Number.isNaN(I.getTime()) ? t.toISOString() : I.toISOString();
87
+ const k = new Date(c);
88
+ return Number.isNaN(k.getTime()) ? e.toISOString() : k.toISOString();
89
89
  }
90
- (u < 1 || u > 12) && (u = s), (f < 1 || f > 31) && (f = o), (m < 0 || m > 23) && (m = a), (p < 0 || p > 59) && (p = c);
91
- const h = new Date(g, u - 1, f, m, p, 0, 0);
92
- return h.getFullYear() !== g || h.getMonth() !== u - 1 || h.getDate() !== f ? t.toISOString() : C(h);
90
+ (u < 1 || u > 12) && (u = s), (f < 1 || f > 31) && (f = o), (d < 0 || d > 23) && (d = a), (p < 0 || p > 59) && (p = i);
91
+ const y = new Date(h, u - 1, f, d, p, 0, 0);
92
+ return y.getFullYear() !== h || y.getMonth() !== u - 1 || y.getDate() !== f ? e.toISOString() : L(y);
93
93
  }
94
- function C(e) {
95
- const t = (S) => S.toString().padStart(2, "0"), r = e.getFullYear(), s = t(e.getMonth() + 1), o = t(e.getDate()), a = t(e.getHours()), c = t(e.getMinutes()), l = t(e.getSeconds()), y = e.getMilliseconds().toString().padStart(3, "0");
96
- return `${r}-${s}-${o}T${a}:${c}:${l}.${y}`;
94
+ function L(t) {
95
+ const e = (S) => S.toString().padStart(2, "0"), r = t.getFullYear(), s = e(t.getMonth() + 1), o = e(t.getDate()), a = e(t.getHours()), i = e(t.getMinutes()), c = e(t.getSeconds()), g = t.getMilliseconds().toString().padStart(3, "0");
96
+ return `${r}-${s}-${o}T${a}:${i}:${c}.${g}`;
97
97
  }
98
- const M = x.default || x, w = new M();
99
- async function P() {
100
- if (i.rules && i.rules.length > 0)
101
- return i.rules;
102
- const e = w.get("news.rules") || [];
103
- if (e.length > 0)
104
- return e;
105
- const t = await E();
106
- return t.length > 0 ? (w.set("news.rules", t), t) : [];
98
+ const C = A.default || A, w = new C();
99
+ async function E() {
100
+ if (l.rules && l.rules.length > 0)
101
+ return l.rules;
102
+ if (l.devMode)
103
+ return await b();
104
+ const e = w.get("news.rules");
105
+ let r = [];
106
+ if (Array.isArray(e))
107
+ r = e;
108
+ else if (e && Array.isArray(e.rules)) {
109
+ const o = e.updatedAt ? new Date(e.updatedAt).getTime() : 0, a = Date.now();
110
+ o > 0 && a - o <= 18e6 ? r = e.rules : w.delete("news.rules");
111
+ }
112
+ if (r.length > 0)
113
+ return r;
114
+ const s = await b();
115
+ return s.length > 0 ? (l.devMode || w.set("news.rules", {
116
+ rules: s,
117
+ updatedAt: (/* @__PURE__ */ new Date()).toISOString()
118
+ }), s) : [];
107
119
  }
108
- const d = {
120
+ const m = {
109
121
  running: !1,
110
122
  interval_ms: 1800 * 1e3
111
123
  };
112
- let i = {
124
+ let l = {
113
125
  rules: [],
114
126
  rulesApiUrl: void 0,
115
127
  pushApiUrl: void 0,
116
- ruleTransformer: (e) => e,
128
+ devMode: !1,
129
+ ruleTransformer: (t) => t,
130
+ isPush: void 0,
117
131
  newsItemFieldMap: void 0
118
- }, b = !1;
119
- function z(e) {
120
- i = {
121
- ...i,
122
- ...e,
132
+ }, _ = !1;
133
+ function z(t) {
134
+ l = {
135
+ ...l,
136
+ ...t,
123
137
  // 确保ruleTransformer始终存在
124
- ruleTransformer: e.ruleTransformer || ((t) => t && typeof t == "object" && "data" in t ? t.data : t)
138
+ ruleTransformer: t.ruleTransformer || ((e) => e && typeof e == "object" && "data" in e ? e.data : e)
125
139
  }, W();
126
140
  }
127
- async function E() {
128
- if (!i.rulesApiUrl)
141
+ async function b() {
142
+ if (!l.rulesApiUrl)
129
143
  return [];
130
144
  try {
131
- const e = await fetch(i.rulesApiUrl);
132
- if (!e.ok)
133
- throw new Error(`Failed to fetch rules from API: ${e.status} ${e.statusText}`);
134
- const t = await e.json(), r = i.ruleTransformer(t);
145
+ const t = await fetch(l.rulesApiUrl);
146
+ if (!t.ok)
147
+ throw new Error(`Failed to fetch rules from API: ${t.status} ${t.statusText}`);
148
+ const e = await t.json(), r = l.ruleTransformer(e);
135
149
  return Array.isArray(r) ? r : (console.warn("[crawler] Rules API returned non-array data, using empty array instead"), []);
136
- } catch (e) {
137
- return console.error("[crawler] Failed to fetch rules from API:", e), [];
150
+ } catch (t) {
151
+ return console.error("[crawler] Failed to fetch rules from API:", t), [];
138
152
  }
139
153
  }
140
- async function H(e) {
141
- if (!i.pushApiUrl)
154
+ async function H(t) {
155
+ if (!l.pushApiUrl)
142
156
  return;
143
- const t = w.get("news.pushedUrls") || [];
144
- if (t.includes(e.url)) {
145
- console.log(`[crawler] URL already pushed, skipping: ${e.url}`);
157
+ const e = w.get("news.pushedUrls") || [];
158
+ if (e.includes(t.url)) {
159
+ console.log(`[crawler] URL already pushed, skipping: ${t.url}`);
146
160
  return;
147
161
  }
148
162
  try {
149
- const r = N(e), s = await fetch(i.pushApiUrl, {
163
+ const r = N(t), s = await fetch(l.pushApiUrl, {
150
164
  method: "POST",
151
165
  headers: {
152
166
  "Content-Type": "application/json"
@@ -155,32 +169,32 @@ async function H(e) {
155
169
  });
156
170
  if (!s.ok)
157
171
  throw new Error(`Failed to push results to API: ${s.status} ${s.statusText}`);
158
- const o = [...t, e.url];
172
+ const o = [...e, t.url];
159
173
  w.set("news.pushedUrls", o), console.log("[crawler] Results pushed to API successfully");
160
174
  } catch (r) {
161
175
  console.error("[crawler] Failed to push results to API:", r);
162
176
  }
163
177
  }
164
- function N(e) {
165
- const t = i.newsItemFieldMap;
166
- if (!t || Object.keys(t).length === 0)
167
- return e;
168
- const r = {}, s = Object.entries(e);
178
+ function N(t) {
179
+ const e = l.newsItemFieldMap;
180
+ if (!e || Object.keys(e).length === 0)
181
+ return t;
182
+ const r = {}, s = Object.entries(t);
169
183
  for (const [o, a] of s) {
170
- const c = t[o];
171
- if (c === "-") continue;
172
- const l = typeof c == "string" ? c : o;
173
- r[l] = a;
184
+ const i = e[o];
185
+ if (i === "-") continue;
186
+ const c = typeof i == "string" ? i : o;
187
+ r[c] = a;
174
188
  }
175
189
  return r;
176
190
  }
177
- async function F(e, t) {
178
- return await e.webContents.executeJavaScript(
191
+ async function F(t, e) {
192
+ return await t.webContents.executeJavaScript(
179
193
  `
180
194
  (() => {
181
195
  const links = []
182
196
  // 在指定范围内查找所有链接
183
- const rangeElements = document.querySelectorAll(${JSON.stringify(t.home_range_selector)})
197
+ const rangeElements = document.querySelectorAll(${JSON.stringify(e.home_range_selector)})
184
198
  if (rangeElements.length === 0) {
185
199
  // 如果没有找到范围,则在整个文档中查找
186
200
  const allLinks = document.querySelectorAll('a')
@@ -212,9 +226,9 @@ async function F(e, t) {
212
226
  `
213
227
  );
214
228
  }
215
- async function q(e, t, r) {
229
+ async function q(t, e, r) {
216
230
  try {
217
- return await e.loadURL(r, { httpReferrer: t.base_url }), await e.webContents.executeJavaScript(
231
+ return await t.loadURL(r, { httpReferrer: e.base_url }), await t.webContents.executeJavaScript(
218
232
  `
219
233
  (() => {
220
234
  const pickText = (sel) => {
@@ -253,11 +267,11 @@ async function q(e, t, r) {
253
267
  return clone.innerHTML || ''
254
268
  }
255
269
  return {
256
- title: pickText(${JSON.stringify(t.title_selector)}),
257
- contentHtml: pickContentHtml(${JSON.stringify(t.content_selector)}, ${JSON.stringify(
258
- t.exclude_selectors || []
270
+ title: pickText(${JSON.stringify(e.title_selector)}),
271
+ contentHtml: pickContentHtml(${JSON.stringify(e.content_selector)}, ${JSON.stringify(
272
+ e.exclude_selectors || []
259
273
  )}),
260
- timeText: pickText(${JSON.stringify(t.time_selector)}),
274
+ timeText: pickText(${JSON.stringify(e.time_selector)}),
261
275
  url: location.href
262
276
  }
263
277
  })()
@@ -267,58 +281,61 @@ async function q(e, t, r) {
267
281
  return console.warn("[crawler] failed to extract page content", r, s), null;
268
282
  }
269
283
  }
270
- async function J(e) {
271
- const t = O();
284
+ async function J(t) {
285
+ const e = P();
272
286
  try {
273
- await t.loadURL(e.base_url, { httpReferrer: e.base_url });
274
- const r = await F(t, e);
275
- console.log(`[crawler] found ${r.length} links from ${e.remark || e.base_url}`);
287
+ await e.loadURL(t.base_url, { httpReferrer: t.base_url });
288
+ const r = await F(e, t);
289
+ console.log(`[crawler] found ${r.length} links from ${t.remark || t.base_url}`);
276
290
  const s = [];
277
291
  for (const o of r) {
278
- const a = await q(t, e, o);
292
+ const a = await q(e, t, o);
279
293
  if (!a || !a.title || !a.contentHtml) {
280
294
  console.log(`[crawler] skip empty result for ${o}`);
281
295
  continue;
282
296
  }
283
- const c = {
297
+ const i = {
284
298
  url: a.url || o,
285
299
  title: a.title,
286
300
  content_html: a.contentHtml,
287
- content_markdown: D(a.contentHtml),
288
- published_at: v(a.timeText)
301
+ content_markdown: v(a.contentHtml),
302
+ published_at: D(a.timeText)
289
303
  };
290
- s.push(c);
304
+ if (s.push(i), (l.isPush ? l.isPush(i) : !0) === !1) {
305
+ console.log(`[crawler] skip push due to isPush filter: ${i.url}`);
306
+ continue;
307
+ }
291
308
  try {
292
- await H(c);
293
- } catch (l) {
294
- console.warn("[crawler] push single news item failed", l);
309
+ await H(i);
310
+ } catch (g) {
311
+ console.warn("[crawler] push single news item failed", g);
295
312
  }
296
313
  }
297
- return console.log(`[crawler] processed ${s.length} items from ${e.remark || e.base_url}`), {
314
+ return console.log(`[crawler] processed ${s.length} items from ${t.remark || t.base_url}`), {
298
315
  success: !0,
299
316
  data: s
300
317
  };
301
318
  } catch (r) {
302
- return console.warn("[crawler] rule failed", e.remark || e.base_url, r), {
319
+ return console.warn("[crawler] rule failed", t.remark || t.base_url, r), {
303
320
  success: !1,
304
321
  error: r instanceof Error ? r.message : String(r)
305
322
  };
306
323
  }
307
324
  }
308
325
  function W() {
309
- if (b) return;
310
- b = !0;
311
- const e = d.interval_ms, t = async () => {
312
- const r = await P();
313
- console.log(`[crawler] scheduled run, rules=${r.length}`), d.running = !0, d.running_source = void 0;
326
+ if (_) return;
327
+ _ = !0;
328
+ const t = m.interval_ms, e = async () => {
329
+ const r = await E();
330
+ console.log(`[crawler] scheduled run, rules=${r.length}`), m.running = !0, m.running_source = void 0;
314
331
  try {
315
332
  for (const s of r)
316
333
  await J(s);
317
334
  } finally {
318
- d.running = !1, d.running_source = void 0, d.next_run_at = new Date(Date.now() + e).toISOString();
335
+ m.running = !1, m.running_source = void 0, m.next_run_at = new Date(Date.now() + t).toISOString();
319
336
  }
320
337
  };
321
- d.next_run_at = new Date(Date.now() + 5e3).toISOString(), setTimeout(t, 5e3), setInterval(t, e);
338
+ m.next_run_at = new Date(Date.now() + 5e3).toISOString(), setTimeout(e, 5e3), setInterval(e, t);
322
339
  }
323
340
  export {
324
341
  z as initCrawler
@@ -19,6 +19,20 @@ export type CrawlerConfig = {
19
19
  rulesApiUrl?: string;
20
20
  pushApiUrl?: string;
21
21
  ruleTransformer?: (data: any) => any;
22
+ /**
23
+ * 是否处于开发模式
24
+ * - 开发模式:不使用本地缓存,每次都优先使用内存 rules,其次直接从 API 拉取
25
+ * - 生产模式:会使用本地缓存(带 5 小时过期时间)
26
+ */
27
+ devMode?: boolean;
28
+ /**
29
+ * 推送前判断函数
30
+ * - 返回 true、undefined 或 null:可以推送
31
+ * - 返回 false:不推送
32
+ * @param item 新闻项
33
+ * @returns 是否允许推送
34
+ */
35
+ isPush?: (item: NewsItem) => boolean | null | undefined;
22
36
  newsItemFieldMap?: Partial<Record<keyof NewsItem, string | '-'>>;
23
37
  };
24
38
  export type NewsItem = {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@howuse/electron-crawler",
3
- "version": "0.3.0",
3
+ "version": "0.5.0",
4
4
  "description": "基于Electron的爬虫工具包,用于爬取新闻和股票详情",
5
5
  "keywords": [
6
6
  "electron",