@howuse/electron-crawler 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
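The functional headline of 0.4.0: `CrawlerConfig` gains a `devMode` flag, and the persisted rules cache moves from a bare array under the `news.rules` key to a `{ rules, updatedAt }` envelope that expires after five hours (`18e6` ms). A minimal wiring sketch from an Electron main process (assumed usage; the endpoint URLs are placeholders, not part of the package):

```ts
import { app } from "electron";
import { initCrawler } from "@howuse/electron-crawler";

app.whenReady().then(() => {
  initCrawler({
    rulesApiUrl: "https://example.com/api/crawler/rules", // hypothetical endpoint
    pushApiUrl: "https://example.com/api/news/push", // hypothetical endpoint
    // New in 0.4.0: bypass the 5-hour on-disk rules cache while developing.
    devMode: !app.isPackaged,
  });
});
```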
package/dist/index.js CHANGED
@@ -1,16 +1,16 @@
- "use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const x=require("electron-store"),A=require("electron"),O=require("turndown");function L(){const e=A.BrowserWindow.getAllWindows().find(r=>r.title==="crawler-hidden-window");return e||new A.BrowserWindow({show:!1,webPreferences:{sandbox:!1},title:"crawler-hidden-window"})}const k=new O({headingStyle:"atx",codeBlockStyle:"fenced",bulletListMarker:"-",emDelimiter:"*",strongDelimiter:"**",linkStyle:"inlined",linkReferenceStyle:"full",preformattedCode:!1,blankReplacement:(e,t)=>t.nodeName==="BR"?`
- `:""});k.addRule("preserveLineBreaks",{filter:["br"],replacement:()=>`
- `});k.addRule("images",{filter:"img",replacement:(e,t)=>{const r=t.alt||"",s=t.src||t.getAttribute("src")||"",o=t.title||"";return o?`![${r}](${s} "${o}")`:`![${r}](${s})`}});function b(e){return e.replace(/<script[\s\S]*?<\/script>/gi,"").replace(/<style[\s\S]*?<\/style>/gi,"").replace(/<noscript[\s\S]*?<\/noscript>/gi,"").replace(/<iframe[\s\S]*?<\/iframe>/gi,"").replace(/on\w+="[^"]*"/gi,"").replace(/on\w+='[^']*'/gi,"").trim()}function v(e){if(!e||!e.trim())return"";try{const t=b(e);if(!t)return"";let r=k.turndown(t);return r=r.replace(/\n{3,}/g,`
+ "use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const A=require("electron-store"),x=require("electron"),O=require("turndown");function v(){const t=x.BrowserWindow.getAllWindows().find(r=>r.title==="crawler-hidden-window");return t||new x.BrowserWindow({show:!1,webPreferences:{sandbox:!1},title:"crawler-hidden-window"})}const I=new O({headingStyle:"atx",codeBlockStyle:"fenced",bulletListMarker:"-",emDelimiter:"*",strongDelimiter:"**",linkStyle:"inlined",linkReferenceStyle:"full",preformattedCode:!1,blankReplacement:(t,e)=>e.nodeName==="BR"?`
+ `:""});I.addRule("preserveLineBreaks",{filter:["br"],replacement:()=>`
+ `});I.addRule("images",{filter:"img",replacement:(t,e)=>{const r=e.alt||"",s=e.src||e.getAttribute("src")||"",o=e.title||"";return o?`![${r}](${s} "${o}")`:`![${r}](${s})`}});function b(t){return t.replace(/<script[\s\S]*?<\/script>/gi,"").replace(/<style[\s\S]*?<\/style>/gi,"").replace(/<noscript[\s\S]*?<\/noscript>/gi,"").replace(/<iframe[\s\S]*?<\/iframe>/gi,"").replace(/on\w+="[^"]*"/gi,"").replace(/on\w+='[^']*'/gi,"").trim()}function D(t){if(!t||!t.trim())return"";try{const e=b(t);if(!e)return"";let r=I.turndown(e);return r=r.replace(/\n{3,}/g,`
 
  `),r=r.split(`
  `).map(s=>s.trimEnd()).join(`
- `),r.trim()}catch(t){return console.error("[normalizeMarkdown] 转换失败:",t),b(e).replace(/<[^>]+>/g,"").replace(/\n{3,}/g,`
+ `),r.trim()}catch(e){return console.error("[normalizeMarkdown] 转换失败:",e),b(t).replace(/<[^>]+>/g,"").replace(/\n{3,}/g,`
 
- `).trim()}}function C(e){const t=new Date,r=t.getFullYear(),s=t.getMonth()+1,o=t.getDate(),a=t.getHours(),c=t.getMinutes();if(!e||!e.trim())return t.toISOString();const l=e.trim(),y=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/,S=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,T=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/,U=/(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,_=/(\d{1,2})[月\-/](\d{1,2})[日]?/,R=/(\d{1,2})[:时](\d{1,2})[分]?/;let g=r,u=s,f=o,d=a,m=c,n=l.match(y);if(n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),m=parseInt(n[5],10);else if(n=l.match(S),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),m=parseInt(n[5],10);else if(n=l.match(T),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10);else if(n=l.match(U),n)u=parseInt(n[1],10),f=parseInt(n[2],10),d=parseInt(n[3],10),m=parseInt(n[4],10);else if(n=l.match(_),n)u=parseInt(n[1],10),f=parseInt(n[2],10);else if(n=l.match(R),n)d=parseInt(n[1],10),m=parseInt(n[2],10);else{const I=new Date(l);return Number.isNaN(I.getTime())?t.toISOString():I.toISOString()}(u<1||u>12)&&(u=s),(f<1||f>31)&&(f=o),(d<0||d>23)&&(d=a),(m<0||m>59)&&(m=c);const h=new Date(g,u-1,f,d,m,0,0);return h.getFullYear()!==g||h.getMonth()!==u-1||h.getDate()!==f?t.toISOString():D(h)}function D(e){const t=S=>S.toString().padStart(2,"0"),r=e.getFullYear(),s=t(e.getMonth()+1),o=t(e.getDate()),a=t(e.getHours()),c=t(e.getMinutes()),l=t(e.getSeconds()),y=e.getMilliseconds().toString().padStart(3,"0");return`${r}-${s}-${o}T${a}:${c}:${l}.${y}`}const M=x.default||x,w=new M;async function P(){if(i.rules&&i.rules.length>0)return i.rules;const e=w.get("news.rules")||[];if(e.length>0)return e;const t=await H();return t.length>0?(w.set("news.rules",t),t):[]}const p={running:!1,interval_ms:1800*1e3};let i={rules:[],rulesApiUrl:void 0,pushApiUrl:void 0,ruleTransformer:e=>e,newsItemFieldMap:void 0},$=!1;function E(e){i={...i,...e,ruleTransformer:e.ruleTransformer||(t=>t&&typeof t=="object"&&"data"in t?t.data:t)},j()}async function H(){if(!i.rulesApiUrl)return[];try{const e=await fetch(i.rulesApiUrl);if(!e.ok)throw new Error(`Failed to fetch rules from API: ${e.status} ${e.statusText}`);const t=await e.json(),r=i.ruleTransformer(t);return Array.isArray(r)?r:(console.warn("[crawler] Rules API returned non-array data, using empty array instead"),[])}catch(e){return console.error("[crawler] Failed to fetch rules from API:",e),[]}}async function N(e){if(!i.pushApiUrl)return;const t=w.get("news.pushedUrls")||[];if(t.includes(e.url)){console.log(`[crawler] URL already pushed, skipping: ${e.url}`);return}try{const r=q(e),s=await fetch(i.pushApiUrl,{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify(r)});if(!s.ok)throw new Error(`Failed to push results to API: ${s.status} ${s.statusText}`);const o=[...t,e.url];w.set("news.pushedUrls",o),console.log("[crawler] Results pushed to API successfully")}catch(r){console.error("[crawler] Failed to push results to API:",r)}}function q(e){const t=i.newsItemFieldMap;if(!t||Object.keys(t).length===0)return e;const r={},s=Object.entries(e);for(const[o,a]of s){const c=t[o];if(c==="-")continue;const l=typeof c=="string"?c:o;r[l]=a}return r}async function F(e,t){return await e.webContents.executeJavaScript(`
+ `).trim()}}function L(t){const e=new Date,r=e.getFullYear(),s=e.getMonth()+1,o=e.getDate(),a=e.getHours(),i=e.getMinutes();if(!t||!t.trim())return e.toISOString();const l=t.trim(),y=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/,S=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,R=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/,U=/(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,T=/(\d{1,2})[月\-/](\d{1,2})[日]?/,M=/(\d{1,2})[:时](\d{1,2})[分]?/;let g=r,u=s,f=o,d=a,p=i,n=l.match(y);if(n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),p=parseInt(n[5],10);else if(n=l.match(S),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),p=parseInt(n[5],10);else if(n=l.match(R),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10);else if(n=l.match(U),n)u=parseInt(n[1],10),f=parseInt(n[2],10),d=parseInt(n[3],10),p=parseInt(n[4],10);else if(n=l.match(T),n)u=parseInt(n[1],10),f=parseInt(n[2],10);else if(n=l.match(M),n)d=parseInt(n[1],10),p=parseInt(n[2],10);else{const k=new Date(l);return Number.isNaN(k.getTime())?e.toISOString():k.toISOString()}(u<1||u>12)&&(u=s),(f<1||f>31)&&(f=o),(d<0||d>23)&&(d=a),(p<0||p>59)&&(p=i);const h=new Date(g,u-1,f,d,p,0,0);return h.getFullYear()!==g||h.getMonth()!==u-1||h.getDate()!==f?e.toISOString():C(h)}function C(t){const e=S=>S.toString().padStart(2,"0"),r=t.getFullYear(),s=e(t.getMonth()+1),o=e(t.getDate()),a=e(t.getHours()),i=e(t.getMinutes()),l=e(t.getSeconds()),y=t.getMilliseconds().toString().padStart(3,"0");return`${r}-${s}-${o}T${a}:${i}:${l}.${y}`}const P=A.default||A,w=new P;async function E(){if(c.rules&&c.rules.length>0)return c.rules;if(c.devMode)return await $();const e=w.get("news.rules");let r=[];if(Array.isArray(e))r=e;else if(e&&Array.isArray(e.rules)){const o=e.updatedAt?new Date(e.updatedAt).getTime():0,a=Date.now();o>0&&a-o<=18e6?r=e.rules:w.delete("news.rules")}if(r.length>0)return r;const s=await $();return s.length>0?(c.devMode||w.set("news.rules",{rules:s,updatedAt:new Date().toISOString()}),s):[]}const m={running:!1,interval_ms:1800*1e3};let c={rules:[],rulesApiUrl:void 0,pushApiUrl:void 0,devMode:!1,ruleTransformer:t=>t,newsItemFieldMap:void 0},_=!1;function H(t){c={...c,...t,ruleTransformer:t.ruleTransformer||(e=>e&&typeof e=="object"&&"data"in e?e.data:e)},j()}async function $(){if(!c.rulesApiUrl)return[];try{const t=await fetch(c.rulesApiUrl);if(!t.ok)throw new Error(`Failed to fetch rules from API: ${t.status} ${t.statusText}`);const e=await t.json(),r=c.ruleTransformer(e);return Array.isArray(r)?r:(console.warn("[crawler] Rules API returned non-array data, using empty array instead"),[])}catch(t){return console.error("[crawler] Failed to fetch rules from API:",t),[]}}async function N(t){if(!c.pushApiUrl)return;const e=w.get("news.pushedUrls")||[];if(e.includes(t.url)){console.log(`[crawler] URL already pushed, skipping: ${t.url}`);return}try{const r=F(t),s=await fetch(c.pushApiUrl,{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify(r)});if(!s.ok)throw new Error(`Failed to push results to API: ${s.status} ${s.statusText}`);const o=[...e,t.url];w.set("news.pushedUrls",o),console.log("[crawler] Results pushed to API successfully")}catch(r){console.error("[crawler] Failed to push results to API:",r)}}function F(t){const e=c.newsItemFieldMap;if(!e||Object.keys(e).length===0)return t;const r={},s=Object.entries(t);for(const[o,a]of s){const i=e[o];if(i==="-")continue;const l=typeof i=="string"?i:o;r[l]=a}return r}async function q(t,e){return await t.webContents.executeJavaScript(`
  (() => {
  const links = []
  // Find all links within the specified range
- const rangeElements = document.querySelectorAll(${JSON.stringify(t.home_range_selector)})
+ const rangeElements = document.querySelectorAll(${JSON.stringify(e.home_range_selector)})
  if (rangeElements.length === 0) {
  // If no range was found, search the entire document
  const allLinks = document.querySelectorAll('a')
@@ -39,7 +39,7 @@
  const uniqueLinks = [...new Map(links.map(l => [l.href, l])).values()]
  return uniqueLinks.map(l => l.href)
  })()
- `)}async function W(e,t,r){try{return await e.loadURL(r,{httpReferrer:t.base_url}),await e.webContents.executeJavaScript(`
+ `)}async function W(t,e,r){try{return await t.loadURL(r,{httpReferrer:e.base_url}),await t.webContents.executeJavaScript(`
  (() => {
  const pickText = (sel) => {
  const el = document.querySelector(sel)
@@ -77,10 +77,10 @@
  return clone.innerHTML || ''
  }
  return {
- title: pickText(${JSON.stringify(t.title_selector)}),
- contentHtml: pickContentHtml(${JSON.stringify(t.content_selector)}, ${JSON.stringify(t.exclude_selectors||[])}),
- timeText: pickText(${JSON.stringify(t.time_selector)}),
+ title: pickText(${JSON.stringify(e.title_selector)}),
+ contentHtml: pickContentHtml(${JSON.stringify(e.content_selector)}, ${JSON.stringify(e.exclude_selectors||[])}),
+ timeText: pickText(${JSON.stringify(e.time_selector)}),
  url: location.href
  }
  })()
- `)}catch(s){return console.warn("[crawler] failed to extract page content",r,s),null}}async function J(e){const t=L();try{await t.loadURL(e.base_url,{httpReferrer:e.base_url});const r=await F(t,e);console.log(`[crawler] found ${r.length} links from ${e.remark||e.base_url}`);const s=[];for(const o of r){const a=await W(t,e,o);if(!a||!a.title||!a.contentHtml){console.log(`[crawler] skip empty result for ${o}`);continue}const c={url:a.url||o,title:a.title,content_html:a.contentHtml,content_markdown:v(a.contentHtml),published_at:C(a.timeText)};s.push(c);try{await N(c)}catch(l){console.warn("[crawler] push single news item failed",l)}}return console.log(`[crawler] processed ${s.length} items from ${e.remark||e.base_url}`),{success:!0,data:s}}catch(r){return console.warn("[crawler] rule failed",e.remark||e.base_url,r),{success:!1,error:r instanceof Error?r.message:String(r)}}}function j(){if($)return;$=!0;const e=p.interval_ms,t=async()=>{const r=await P();console.log(`[crawler] scheduled run, rules=${r.length}`),p.running=!0,p.running_source=void 0;try{for(const s of r)await J(s)}finally{p.running=!1,p.running_source=void 0,p.next_run_at=new Date(Date.now()+e).toISOString()}};p.next_run_at=new Date(Date.now()+5e3).toISOString(),setTimeout(t,5e3),setInterval(t,e)}exports.initCrawler=E;
+ `)}catch(s){return console.warn("[crawler] failed to extract page content",r,s),null}}async function J(t){const e=v();try{await e.loadURL(t.base_url,{httpReferrer:t.base_url});const r=await q(e,t);console.log(`[crawler] found ${r.length} links from ${t.remark||t.base_url}`);const s=[];for(const o of r){const a=await W(e,t,o);if(!a||!a.title||!a.contentHtml){console.log(`[crawler] skip empty result for ${o}`);continue}const i={url:a.url||o,title:a.title,content_html:a.contentHtml,content_markdown:D(a.contentHtml),published_at:L(a.timeText)};s.push(i);try{await N(i)}catch(l){console.warn("[crawler] push single news item failed",l)}}return console.log(`[crawler] processed ${s.length} items from ${t.remark||t.base_url}`),{success:!0,data:s}}catch(r){return console.warn("[crawler] rule failed",t.remark||t.base_url,r),{success:!1,error:r instanceof Error?r.message:String(r)}}}function j(){if(_)return;_=!0;const t=m.interval_ms,e=async()=>{const r=await E();console.log(`[crawler] scheduled run, rules=${r.length}`),m.running=!0,m.running_source=void 0;try{for(const s of r)await J(s)}finally{m.running=!1,m.running_source=void 0,m.next_run_at=new Date(Date.now()+t).toISOString()}};m.next_run_at=new Date(Date.now()+5e3).toISOString(),setTimeout(e,5e3),setInterval(e,t)}exports.initCrawler=H;
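The cache change is hard to read in the minified bundle above, so here is a de-minified sketch of the new rules lookup. This is a paraphrase with invented names, not the shipped code:

```ts
import Store from "electron-store";

// Shapes assumed from the diff; the package does not export these internals.
type Rule = { base_url: string; [key: string]: unknown };
const store = new Store();
declare const config: { rules: Rule[]; devMode?: boolean };
declare function fetchRulesFromApi(): Promise<Rule[]>;

const CACHE_TTL_MS = 18e6; // 5 hours, matching the minified constant

async function getRules(): Promise<Rule[]> {
  // In-memory rules always win.
  if (config.rules && config.rules.length > 0) return config.rules;
  // devMode bypasses the on-disk cache entirely.
  if (config.devMode) return fetchRulesFromApi();

  const cached = store.get("news.rules") as
    | Rule[] // legacy 0.3.0 format: a bare array, accepted as-is
    | { rules: Rule[]; updatedAt?: string }
    | undefined;
  let rules: Rule[] = [];
  if (Array.isArray(cached)) {
    rules = cached;
  } else if (cached && Array.isArray(cached.rules)) {
    const savedAt = cached.updatedAt ? new Date(cached.updatedAt).getTime() : 0;
    if (savedAt > 0 && Date.now() - savedAt <= CACHE_TTL_MS) rules = cached.rules;
    else store.delete("news.rules"); // stale or unstamped envelope: drop it
  }
  if (rules.length > 0) return rules;

  const fresh = await fetchRulesFromApi();
  if (fresh.length > 0 && !config.devMode) {
    store.set("news.rules", { rules: fresh, updatedAt: new Date().toISOString() });
  }
  return fresh; // empty array if the API returned nothing
}
```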
package/dist/index.mjs CHANGED
@@ -1,9 +1,9 @@
- import x from "electron-store";
- import { BrowserWindow as A } from "electron";
- import L from "turndown";
- function O() {
- const e = A.getAllWindows().find((r) => r.title === "crawler-hidden-window");
- return e || new A({
+ import A from "electron-store";
+ import { BrowserWindow as x } from "electron";
+ import O from "turndown";
+ function D() {
+ const t = x.getAllWindows().find((r) => r.title === "crawler-hidden-window");
+ return t || new x({
  show: !1,
  webPreferences: {
  sandbox: !1
@@ -11,7 +11,7 @@ function O() {
  title: "crawler-hidden-window"
  });
  }
- const k = new L({
+ const I = new O({
  headingStyle: "atx",
  // use #-style (ATX) headings
  codeBlockStyle: "fenced",
@@ -28,125 +28,138 @@ const k = new L({
  // full link reference format
  preformattedCode: !1,
  // do not use preformatted code
- blankReplacement: (e, t) => t.nodeName === "BR" ? `
+ blankReplacement: (t, e) => e.nodeName === "BR" ? `
  ` : ""
  });
- k.addRule("preserveLineBreaks", {
+ I.addRule("preserveLineBreaks", {
  filter: ["br"],
  replacement: () => `
  `
  });
- k.addRule("images", {
+ I.addRule("images", {
  filter: "img",
- replacement: (e, t) => {
- const r = t.alt || "", s = t.src || t.getAttribute("src") || "", o = t.title || "";
+ replacement: (t, e) => {
+ const r = e.alt || "", s = e.src || e.getAttribute("src") || "", o = e.title || "";
  return o ? `![${r}](${s} "${o}")` : `![${r}](${s})`;
  }
  });
- function $(e) {
- return e.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<noscript[\s\S]*?<\/noscript>/gi, "").replace(/<iframe[\s\S]*?<\/iframe>/gi, "").replace(/on\w+="[^"]*"/gi, "").replace(/on\w+='[^']*'/gi, "").trim();
+ function _(t) {
+ return t.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<noscript[\s\S]*?<\/noscript>/gi, "").replace(/<iframe[\s\S]*?<\/iframe>/gi, "").replace(/on\w+="[^"]*"/gi, "").replace(/on\w+='[^']*'/gi, "").trim();
  }
- function D(e) {
- if (!e || !e.trim())
+ function v(t) {
+ if (!t || !t.trim())
  return "";
  try {
- const t = $(e);
- if (!t)
+ const e = _(t);
+ if (!e)
  return "";
- let r = k.turndown(t);
+ let r = I.turndown(e);
  return r = r.replace(/\n{3,}/g, `
 
  `), r = r.split(`
  `).map((s) => s.trimEnd()).join(`
  `), r.trim();
- } catch (t) {
- return console.error("[normalizeMarkdown] 转换失败:", t), $(e).replace(/<[^>]+>/g, "").replace(/\n{3,}/g, `
+ } catch (e) {
+ return console.error("[normalizeMarkdown] 转换失败:", e), _(t).replace(/<[^>]+>/g, "").replace(/\n{3,}/g, `
 
  `).trim();
  }
  }
- function v(e) {
- const t = /* @__PURE__ */ new Date(), r = t.getFullYear(), s = t.getMonth() + 1, o = t.getDate(), a = t.getHours(), c = t.getMinutes();
- if (!e || !e.trim())
- return t.toISOString();
- const l = e.trim(), y = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/, S = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, U = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/, _ = /(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, T = /(\d{1,2})[月\-/](\d{1,2})[日]?/, R = /(\d{1,2})[:时](\d{1,2})[分]?/;
- let g = r, u = s, f = o, m = a, p = c, n = l.match(y);
+ function L(t) {
+ const e = /* @__PURE__ */ new Date(), r = e.getFullYear(), s = e.getMonth() + 1, o = e.getDate(), a = e.getHours(), i = e.getMinutes();
+ if (!t || !t.trim())
+ return e.toISOString();
+ const l = t.trim(), y = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/, S = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, R = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/, U = /(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, T = /(\d{1,2})[月\-/](\d{1,2})[日]?/, M = /(\d{1,2})[:时](\d{1,2})[分]?/;
+ let g = r, u = s, f = o, d = a, m = i, n = l.match(y);
  if (n)
- g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), m = parseInt(n[4], 10), p = parseInt(n[5], 10);
+ g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), d = parseInt(n[4], 10), m = parseInt(n[5], 10);
  else if (n = l.match(S), n)
- g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), m = parseInt(n[4], 10), p = parseInt(n[5], 10);
- else if (n = l.match(U), n)
+ g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), d = parseInt(n[4], 10), m = parseInt(n[5], 10);
+ else if (n = l.match(R), n)
  g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10);
- else if (n = l.match(_), n)
- u = parseInt(n[1], 10), f = parseInt(n[2], 10), m = parseInt(n[3], 10), p = parseInt(n[4], 10);
+ else if (n = l.match(U), n)
+ u = parseInt(n[1], 10), f = parseInt(n[2], 10), d = parseInt(n[3], 10), m = parseInt(n[4], 10);
  else if (n = l.match(T), n)
  u = parseInt(n[1], 10), f = parseInt(n[2], 10);
- else if (n = l.match(R), n)
- m = parseInt(n[1], 10), p = parseInt(n[2], 10);
+ else if (n = l.match(M), n)
+ d = parseInt(n[1], 10), m = parseInt(n[2], 10);
  else {
- const I = new Date(l);
- return Number.isNaN(I.getTime()) ? t.toISOString() : I.toISOString();
+ const k = new Date(l);
+ return Number.isNaN(k.getTime()) ? e.toISOString() : k.toISOString();
  }
- (u < 1 || u > 12) && (u = s), (f < 1 || f > 31) && (f = o), (m < 0 || m > 23) && (m = a), (p < 0 || p > 59) && (p = c);
- const h = new Date(g, u - 1, f, m, p, 0, 0);
- return h.getFullYear() !== g || h.getMonth() !== u - 1 || h.getDate() !== f ? t.toISOString() : C(h);
+ (u < 1 || u > 12) && (u = s), (f < 1 || f > 31) && (f = o), (d < 0 || d > 23) && (d = a), (m < 0 || m > 59) && (m = i);
+ const h = new Date(g, u - 1, f, d, m, 0, 0);
+ return h.getFullYear() !== g || h.getMonth() !== u - 1 || h.getDate() !== f ? e.toISOString() : C(h);
  }
- function C(e) {
- const t = (S) => S.toString().padStart(2, "0"), r = e.getFullYear(), s = t(e.getMonth() + 1), o = t(e.getDate()), a = t(e.getHours()), c = t(e.getMinutes()), l = t(e.getSeconds()), y = e.getMilliseconds().toString().padStart(3, "0");
- return `${r}-${s}-${o}T${a}:${c}:${l}.${y}`;
+ function C(t) {
+ const e = (S) => S.toString().padStart(2, "0"), r = t.getFullYear(), s = e(t.getMonth() + 1), o = e(t.getDate()), a = e(t.getHours()), i = e(t.getMinutes()), l = e(t.getSeconds()), y = t.getMilliseconds().toString().padStart(3, "0");
+ return `${r}-${s}-${o}T${a}:${i}:${l}.${y}`;
  }
- const M = x.default || x, w = new M();
- async function P() {
- if (i.rules && i.rules.length > 0)
- return i.rules;
- const e = w.get("news.rules") || [];
- if (e.length > 0)
- return e;
- const t = await E();
- return t.length > 0 ? (w.set("news.rules", t), t) : [];
+ const E = A.default || A, w = new E();
+ async function H() {
+ if (c.rules && c.rules.length > 0)
+ return c.rules;
+ if (c.devMode)
+ return await b();
+ const e = w.get("news.rules");
+ let r = [];
+ if (Array.isArray(e))
+ r = e;
+ else if (e && Array.isArray(e.rules)) {
+ const o = e.updatedAt ? new Date(e.updatedAt).getTime() : 0, a = Date.now();
+ o > 0 && a - o <= 18e6 ? r = e.rules : w.delete("news.rules");
+ }
+ if (r.length > 0)
+ return r;
+ const s = await b();
+ return s.length > 0 ? (c.devMode || w.set("news.rules", {
+ rules: s,
+ updatedAt: (/* @__PURE__ */ new Date()).toISOString()
+ }), s) : [];
  }
- const d = {
+ const p = {
  running: !1,
  interval_ms: 1800 * 1e3
  };
- let i = {
+ let c = {
  rules: [],
  rulesApiUrl: void 0,
  pushApiUrl: void 0,
- ruleTransformer: (e) => e,
+ devMode: !1,
+ ruleTransformer: (t) => t,
  newsItemFieldMap: void 0
- }, b = !1;
- function z(e) {
- i = {
- ...i,
- ...e,
+ }, $ = !1;
+ function z(t) {
+ c = {
+ ...c,
+ ...t,
  // ensure ruleTransformer always exists
- ruleTransformer: e.ruleTransformer || ((t) => t && typeof t == "object" && "data" in t ? t.data : t)
+ ruleTransformer: t.ruleTransformer || ((e) => e && typeof e == "object" && "data" in e ? e.data : e)
  }, W();
  }
- async function E() {
- if (!i.rulesApiUrl)
+ async function b() {
+ if (!c.rulesApiUrl)
  return [];
  try {
- const e = await fetch(i.rulesApiUrl);
- if (!e.ok)
- throw new Error(`Failed to fetch rules from API: ${e.status} ${e.statusText}`);
- const t = await e.json(), r = i.ruleTransformer(t);
+ const t = await fetch(c.rulesApiUrl);
+ if (!t.ok)
+ throw new Error(`Failed to fetch rules from API: ${t.status} ${t.statusText}`);
+ const e = await t.json(), r = c.ruleTransformer(e);
  return Array.isArray(r) ? r : (console.warn("[crawler] Rules API returned non-array data, using empty array instead"), []);
- } catch (e) {
- return console.error("[crawler] Failed to fetch rules from API:", e), [];
+ } catch (t) {
+ return console.error("[crawler] Failed to fetch rules from API:", t), [];
  }
  }
- async function H(e) {
- if (!i.pushApiUrl)
+ async function P(t) {
+ if (!c.pushApiUrl)
  return;
- const t = w.get("news.pushedUrls") || [];
- if (t.includes(e.url)) {
- console.log(`[crawler] URL already pushed, skipping: ${e.url}`);
+ const e = w.get("news.pushedUrls") || [];
+ if (e.includes(t.url)) {
+ console.log(`[crawler] URL already pushed, skipping: ${t.url}`);
  return;
  }
  try {
- const r = N(e), s = await fetch(i.pushApiUrl, {
+ const r = N(t), s = await fetch(c.pushApiUrl, {
  method: "POST",
  headers: {
  "Content-Type": "application/json"
@@ -155,32 +168,32 @@ async function H(e) {
  });
  if (!s.ok)
  throw new Error(`Failed to push results to API: ${s.status} ${s.statusText}`);
- const o = [...t, e.url];
+ const o = [...e, t.url];
  w.set("news.pushedUrls", o), console.log("[crawler] Results pushed to API successfully");
  } catch (r) {
  console.error("[crawler] Failed to push results to API:", r);
  }
  }
- function N(e) {
- const t = i.newsItemFieldMap;
- if (!t || Object.keys(t).length === 0)
- return e;
- const r = {}, s = Object.entries(e);
+ function N(t) {
+ const e = c.newsItemFieldMap;
+ if (!e || Object.keys(e).length === 0)
+ return t;
+ const r = {}, s = Object.entries(t);
  for (const [o, a] of s) {
- const c = t[o];
- if (c === "-") continue;
- const l = typeof c == "string" ? c : o;
+ const i = e[o];
+ if (i === "-") continue;
+ const l = typeof i == "string" ? i : o;
  r[l] = a;
  }
  return r;
  }
- async function F(e, t) {
- return await e.webContents.executeJavaScript(
+ async function F(t, e) {
+ return await t.webContents.executeJavaScript(
  `
  (() => {
  const links = []
  // Find all links within the specified range
- const rangeElements = document.querySelectorAll(${JSON.stringify(t.home_range_selector)})
+ const rangeElements = document.querySelectorAll(${JSON.stringify(e.home_range_selector)})
  if (rangeElements.length === 0) {
  // If no range was found, search the entire document
  const allLinks = document.querySelectorAll('a')
@@ -212,9 +225,9 @@ async function F(e, t) {
  `
  );
  }
- async function q(e, t, r) {
+ async function q(t, e, r) {
  try {
- return await e.loadURL(r, { httpReferrer: t.base_url }), await e.webContents.executeJavaScript(
+ return await t.loadURL(r, { httpReferrer: e.base_url }), await t.webContents.executeJavaScript(
  `
  (() => {
  const pickText = (sel) => {
@@ -253,11 +266,11 @@ async function q(e, t, r) {
  return clone.innerHTML || ''
  }
  return {
- title: pickText(${JSON.stringify(t.title_selector)}),
- contentHtml: pickContentHtml(${JSON.stringify(t.content_selector)}, ${JSON.stringify(
- t.exclude_selectors || []
+ title: pickText(${JSON.stringify(e.title_selector)}),
+ contentHtml: pickContentHtml(${JSON.stringify(e.content_selector)}, ${JSON.stringify(
+ e.exclude_selectors || []
  )}),
- timeText: pickText(${JSON.stringify(t.time_selector)}),
+ timeText: pickText(${JSON.stringify(e.time_selector)}),
  url: location.href
  }
  })()
@@ -267,58 +280,58 @@ async function q(e, t, r) {
  return console.warn("[crawler] failed to extract page content", r, s), null;
  }
  }
- async function J(e) {
- const t = O();
+ async function J(t) {
+ const e = D();
  try {
- await t.loadURL(e.base_url, { httpReferrer: e.base_url });
- const r = await F(t, e);
- console.log(`[crawler] found ${r.length} links from ${e.remark || e.base_url}`);
+ await e.loadURL(t.base_url, { httpReferrer: t.base_url });
+ const r = await F(e, t);
+ console.log(`[crawler] found ${r.length} links from ${t.remark || t.base_url}`);
  const s = [];
  for (const o of r) {
- const a = await q(t, e, o);
+ const a = await q(e, t, o);
  if (!a || !a.title || !a.contentHtml) {
  console.log(`[crawler] skip empty result for ${o}`);
  continue;
  }
- const c = {
+ const i = {
  url: a.url || o,
  title: a.title,
  content_html: a.contentHtml,
- content_markdown: D(a.contentHtml),
- published_at: v(a.timeText)
+ content_markdown: v(a.contentHtml),
+ published_at: L(a.timeText)
  };
- s.push(c);
+ s.push(i);
  try {
- await H(c);
+ await P(i);
  } catch (l) {
  console.warn("[crawler] push single news item failed", l);
  }
  }
- return console.log(`[crawler] processed ${s.length} items from ${e.remark || e.base_url}`), {
+ return console.log(`[crawler] processed ${s.length} items from ${t.remark || t.base_url}`), {
  success: !0,
  data: s
  };
  } catch (r) {
- return console.warn("[crawler] rule failed", e.remark || e.base_url, r), {
+ return console.warn("[crawler] rule failed", t.remark || t.base_url, r), {
  success: !1,
  error: r instanceof Error ? r.message : String(r)
  };
  }
  }
  function W() {
- if (b) return;
- b = !0;
- const e = d.interval_ms, t = async () => {
- const r = await P();
- console.log(`[crawler] scheduled run, rules=${r.length}`), d.running = !0, d.running_source = void 0;
+ if ($) return;
+ $ = !0;
+ const t = p.interval_ms, e = async () => {
+ const r = await H();
+ console.log(`[crawler] scheduled run, rules=${r.length}`), p.running = !0, p.running_source = void 0;
  try {
  for (const s of r)
  await J(s);
  } finally {
- d.running = !1, d.running_source = void 0, d.next_run_at = new Date(Date.now() + e).toISOString();
+ p.running = !1, p.running_source = void 0, p.next_run_at = new Date(Date.now() + t).toISOString();
  }
  };
- d.next_run_at = new Date(Date.now() + 5e3).toISOString(), setTimeout(t, 5e3), setInterval(t, e);
+ p.next_run_at = new Date(Date.now() + 5e3).toISOString(), setTimeout(e, 5e3), setInterval(e, t);
  }
  export {
  z as initCrawler
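One behavior worth noting from the readable bundle: `published_at` comes from the internal date normalizer (`L`/`C` above), which parses full or partial Chinese-format timestamps, fills any missing components from the current time, and serializes local time without a zone suffix. Illustrative expectations follow; `normalizePublishedAt` is a hypothetical stand-in, since the function is not exported:

```ts
declare function normalizePublishedAt(timeText: string): string;

normalizePublishedAt("2024年5月1日 08:30"); // "2024-05-01T08:30:00.000" (local time)
normalizePublishedAt("5月1日 08:30");       // current year assumed: "<year>-05-01T08:30:00.000"
normalizePublishedAt("08:30");              // today's date at 08:30
normalizePublishedAt("not a date");         // falls back to new Date().toISOString()
```

The hunk below is from the package's bundled TypeScript declarations, documenting the new option on `CrawlerConfig`: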
@@ -19,6 +19,12 @@ export type CrawlerConfig = {
  rulesApiUrl?: string;
  pushApiUrl?: string;
  ruleTransformer?: (data: any) => any;
+ /**
+ * Whether the crawler runs in development mode
+ * - Development mode: the local cache is skipped; in-memory rules are used first, otherwise rules are fetched directly from the API
+ * - Production mode: the local cache is used (with a 5-hour expiry)
+ */
+ devMode?: boolean;
  newsItemFieldMap?: Partial<Record<keyof NewsItem, string | '-'>>;
  };
  export type NewsItem = {
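The `newsItemFieldMap` contract, visible in the mapping function of both bundles: each `NewsItem` key can be renamed before an item is pushed, and mapping a key to `'-'` drops it from the payload. An illustrative configuration (the payload field names are hypothetical):

```ts
import { initCrawler } from "@howuse/electron-crawler";

initCrawler({
  pushApiUrl: "https://example.com/api/news/push", // hypothetical endpoint
  newsItemFieldMap: {
    title: "headline", // rename NewsItem.title -> payload.headline
    published_at: "publishedAt", // rename to camelCase
    content_html: "-", // "-" drops the field from the pushed payload
  },
});
```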
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@howuse/electron-crawler",
- "version": "0.3.0",
+ "version": "0.4.0",
  "description": "An Electron-based crawler toolkit for scraping news and stock details",
  "keywords": [
  "electron",