@howuse/electron-crawler 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,16 +1,16 @@
- "use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const k=require("electron-store"),x=require("electron"),U=require("turndown");function v(){const e=x.BrowserWindow.getAllWindows().find(r=>r.title==="crawler-hidden-window");return e||new x.BrowserWindow({show:!1,webPreferences:{sandbox:!1},title:"crawler-hidden-window"})}const S=new U({headingStyle:"atx",codeBlockStyle:"fenced",bulletListMarker:"-",emDelimiter:"*",strongDelimiter:"**",linkStyle:"inlined",linkReferenceStyle:"full",preformattedCode:!1,blankReplacement:(e,t)=>t.nodeName==="BR"?`
- `:""});S.addRule("preserveLineBreaks",{filter:["br"],replacement:()=>`
- `});S.addRule("images",{filter:"img",replacement:(e,t)=>{const r=t.alt||"",s=t.src||t.getAttribute("src")||"",a=t.title||"";return a?`![${r}](${s} "${a}")`:`![${r}](${s})`}});function A(e){return e.replace(/<script[\s\S]*?<\/script>/gi,"").replace(/<style[\s\S]*?<\/style>/gi,"").replace(/<noscript[\s\S]*?<\/noscript>/gi,"").replace(/<iframe[\s\S]*?<\/iframe>/gi,"").replace(/on\w+="[^"]*"/gi,"").replace(/on\w+='[^']*'/gi,"").trim()}function C(e){if(!e||!e.trim())return"";try{const t=A(e);if(!t)return"";let r=S.turndown(t);return r=r.replace(/\n{3,}/g,`
+ "use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const A=require("electron-store"),x=require("electron"),O=require("turndown");function v(){const t=x.BrowserWindow.getAllWindows().find(r=>r.title==="crawler-hidden-window");return t||new x.BrowserWindow({show:!1,webPreferences:{sandbox:!1},title:"crawler-hidden-window"})}const I=new O({headingStyle:"atx",codeBlockStyle:"fenced",bulletListMarker:"-",emDelimiter:"*",strongDelimiter:"**",linkStyle:"inlined",linkReferenceStyle:"full",preformattedCode:!1,blankReplacement:(t,e)=>e.nodeName==="BR"?`
+ `:""});I.addRule("preserveLineBreaks",{filter:["br"],replacement:()=>`
+ `});I.addRule("images",{filter:"img",replacement:(t,e)=>{const r=e.alt||"",s=e.src||e.getAttribute("src")||"",o=e.title||"";return o?`![${r}](${s} "${o}")`:`![${r}](${s})`}});function b(t){return t.replace(/<script[\s\S]*?<\/script>/gi,"").replace(/<style[\s\S]*?<\/style>/gi,"").replace(/<noscript[\s\S]*?<\/noscript>/gi,"").replace(/<iframe[\s\S]*?<\/iframe>/gi,"").replace(/on\w+="[^"]*"/gi,"").replace(/on\w+='[^']*'/gi,"").trim()}function D(t){if(!t||!t.trim())return"";try{const e=b(t);if(!e)return"";let r=I.turndown(e);return r=r.replace(/\n{3,}/g,`
 
  `),r=r.split(`
  `).map(s=>s.trimEnd()).join(`
- `),r.trim()}catch(t){return console.error("[normalizeMarkdown] 转换失败:",t),A(e).replace(/<[^>]+>/g,"").replace(/\n{3,}/g,`
+ `),r.trim()}catch(e){return console.error("[normalizeMarkdown] 转换失败:",e),b(t).replace(/<[^>]+>/g,"").replace(/\n{3,}/g,`
 
- `).trim()}}function D(e){const t=new Date,r=t.getFullYear(),s=t.getMonth()+1,a=t.getDate(),o=t.getHours(),i=t.getMinutes();if(!e||!e.trim())return t.toISOString();const c=e.trim(),w=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/,y=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,T=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/,_=/(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,R=/(\d{1,2})[月\-/](\d{1,2})[日]?/,O=/(\d{1,2})[:时](\d{1,2})[分]?/;let g=r,u=s,f=a,m=o,d=i,n=c.match(w);if(n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),m=parseInt(n[4],10),d=parseInt(n[5],10);else if(n=c.match(y),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),m=parseInt(n[4],10),d=parseInt(n[5],10);else if(n=c.match(T),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10);else if(n=c.match(_),n)u=parseInt(n[1],10),f=parseInt(n[2],10),m=parseInt(n[3],10),d=parseInt(n[4],10);else if(n=c.match(R),n)u=parseInt(n[1],10),f=parseInt(n[2],10);else if(n=c.match(O),n)m=parseInt(n[1],10),d=parseInt(n[2],10);else{const I=new Date(c);return Number.isNaN(I.getTime())?t.toISOString():I.toISOString()}(u<1||u>12)&&(u=s),(f<1||f>31)&&(f=a),(m<0||m>23)&&(m=o),(d<0||d>59)&&(d=i);const h=new Date(g,u-1,f,m,d,0,0);return h.getFullYear()!==g||h.getMonth()!==u-1||h.getDate()!==f?t.toISOString():L(h)}function L(e){const t=y=>y.toString().padStart(2,"0"),r=e.getFullYear(),s=t(e.getMonth()+1),a=t(e.getDate()),o=t(e.getHours()),i=t(e.getMinutes()),c=t(e.getSeconds()),w=e.getMilliseconds().toString().padStart(3,"0");return`${r}-${s}-${a}T${o}:${i}:${c}.${w}`}const M=k.default||k,b=new M;async function P(){if(l.rules&&l.rules.length>0)return l.rules;const e=b.get("news.rules")||[];if(e.length>0)return e;const t=await H();return t.length>0?(b.set("news.rules",t),t):[]}const p={running:!1,interval_ms:1800*1e3};let l={rules:[],rulesApiUrl:void 0,pushApiUrl:void 0,ruleTransformer:e=>e,newsItemFieldMap:void 0},$=!1;function E(e){l={...l,...e,ruleTransformer:e.ruleTransformer||(t=>t&&typeof t=="object"&&"data"in t?t.data:t)},j()}async function H(){if(!l.rulesApiUrl)return[];try{const e=await fetch(l.rulesApiUrl);if(!e.ok)throw new Error(`Failed to fetch rules from API: ${e.status} ${e.statusText}`);const t=await e.json(),r=l.ruleTransformer(t);return Array.isArray(r)?r:(console.warn("[crawler] Rules API returned non-array data, using empty array instead"),[])}catch(e){return console.error("[crawler] Failed to fetch rules from API:",e),[]}}async function N(e){if(l.pushApiUrl)try{const t=q(e),r=await fetch(l.pushApiUrl,{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify(t)});if(!r.ok)throw new Error(`Failed to push results to API: ${r.status} ${r.statusText}`);console.log("[crawler] Results pushed to API successfully")}catch(t){console.error("[crawler] Failed to push results to API:",t)}}function q(e){const t=l.newsItemFieldMap;if(!t||Object.keys(t).length===0)return e;const r={},s=Object.entries(e);for(const[a,o]of s){const i=t[a];if(i==="-")continue;const c=typeof i=="string"?i:a;r[c]=o}return r}async function F(e,t){return await e.webContents.executeJavaScript(`
+ `).trim()}}function L(t){const e=new Date,r=e.getFullYear(),s=e.getMonth()+1,o=e.getDate(),a=e.getHours(),i=e.getMinutes();if(!t||!t.trim())return e.toISOString();const l=t.trim(),y=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/,S=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,R=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/,U=/(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,T=/(\d{1,2})[月\-/](\d{1,2})[日]?/,M=/(\d{1,2})[:时](\d{1,2})[分]?/;let g=r,u=s,f=o,d=a,p=i,n=l.match(y);if(n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),p=parseInt(n[5],10);else if(n=l.match(S),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),p=parseInt(n[5],10);else if(n=l.match(R),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10);else if(n=l.match(U),n)u=parseInt(n[1],10),f=parseInt(n[2],10),d=parseInt(n[3],10),p=parseInt(n[4],10);else if(n=l.match(T),n)u=parseInt(n[1],10),f=parseInt(n[2],10);else if(n=l.match(M),n)d=parseInt(n[1],10),p=parseInt(n[2],10);else{const k=new Date(l);return Number.isNaN(k.getTime())?e.toISOString():k.toISOString()}(u<1||u>12)&&(u=s),(f<1||f>31)&&(f=o),(d<0||d>23)&&(d=a),(p<0||p>59)&&(p=i);const h=new Date(g,u-1,f,d,p,0,0);return h.getFullYear()!==g||h.getMonth()!==u-1||h.getDate()!==f?e.toISOString():C(h)}function C(t){const e=S=>S.toString().padStart(2,"0"),r=t.getFullYear(),s=e(t.getMonth()+1),o=e(t.getDate()),a=e(t.getHours()),i=e(t.getMinutes()),l=e(t.getSeconds()),y=t.getMilliseconds().toString().padStart(3,"0");return`${r}-${s}-${o}T${a}:${i}:${l}.${y}`}const P=A.default||A,w=new P;async function E(){if(c.rules&&c.rules.length>0)return c.rules;if(c.devMode)return await $();const e=w.get("news.rules");let r=[];if(Array.isArray(e))r=e;else if(e&&Array.isArray(e.rules)){const o=e.updatedAt?new Date(e.updatedAt).getTime():0,a=Date.now();o>0&&a-o<=18e6?r=e.rules:w.delete("news.rules")}if(r.length>0)return r;const s=await $();return s.length>0?(c.devMode||w.set("news.rules",{rules:s,updatedAt:new Date().toISOString()}),s):[]}const m={running:!1,interval_ms:1800*1e3};let c={rules:[],rulesApiUrl:void 0,pushApiUrl:void 0,devMode:!1,ruleTransformer:t=>t,newsItemFieldMap:void 0},_=!1;function H(t){c={...c,...t,ruleTransformer:t.ruleTransformer||(e=>e&&typeof e=="object"&&"data"in e?e.data:e)},j()}async function $(){if(!c.rulesApiUrl)return[];try{const t=await fetch(c.rulesApiUrl);if(!t.ok)throw new Error(`Failed to fetch rules from API: ${t.status} ${t.statusText}`);const e=await t.json(),r=c.ruleTransformer(e);return Array.isArray(r)?r:(console.warn("[crawler] Rules API returned non-array data, using empty array instead"),[])}catch(t){return console.error("[crawler] Failed to fetch rules from API:",t),[]}}async function N(t){if(!c.pushApiUrl)return;const e=w.get("news.pushedUrls")||[];if(e.includes(t.url)){console.log(`[crawler] URL already pushed, skipping: ${t.url}`);return}try{const r=F(t),s=await fetch(c.pushApiUrl,{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify(r)});if(!s.ok)throw new Error(`Failed to push results to API: ${s.status} ${s.statusText}`);const o=[...e,t.url];w.set("news.pushedUrls",o),console.log("[crawler] Results pushed to API successfully")}catch(r){console.error("[crawler] Failed to push results to API:",r)}}function F(t){const e=c.newsItemFieldMap;if(!e||Object.keys(e).length===0)return t;const r={},s=Object.entries(t);for(const[o,a]of s){const i=e[o];if(i==="-")continue;const l=typeof i=="string"?i:o;r[l]=a}return r}async function q(t,e){return await t.webContents.executeJavaScript(`
  (() => {
  const links = []
  // find all links within the specified range
- const rangeElements = document.querySelectorAll(${JSON.stringify(t.home_range_selector)})
+ const rangeElements = document.querySelectorAll(${JSON.stringify(e.home_range_selector)})
  if (rangeElements.length === 0) {
  // if no range is found, search the whole document
  const allLinks = document.querySelectorAll('a')
@@ -39,7 +39,7 @@
  const uniqueLinks = [...new Map(links.map(l => [l.href, l])).values()]
  return uniqueLinks.map(l => l.href)
  })()
- `)}async function W(e,t,r){try{return await e.loadURL(r,{httpReferrer:t.base_url}),await e.webContents.executeJavaScript(`
+ `)}async function W(t,e,r){try{return await t.loadURL(r,{httpReferrer:e.base_url}),await t.webContents.executeJavaScript(`
  (() => {
  const pickText = (sel) => {
  const el = document.querySelector(sel)
@@ -77,10 +77,10 @@
  return clone.innerHTML || ''
  }
  return {
- title: pickText(${JSON.stringify(t.title_selector)}),
- contentHtml: pickContentHtml(${JSON.stringify(t.content_selector)}, ${JSON.stringify(t.exclude_selectors||[])}),
- timeText: pickText(${JSON.stringify(t.time_selector)}),
+ title: pickText(${JSON.stringify(e.title_selector)}),
+ contentHtml: pickContentHtml(${JSON.stringify(e.content_selector)}, ${JSON.stringify(e.exclude_selectors||[])}),
+ timeText: pickText(${JSON.stringify(e.time_selector)}),
  url: location.href
  }
  })()
- `)}catch(s){return console.warn("[crawler] failed to extract page content",r,s),null}}async function J(e){const t=v();try{await t.loadURL(e.base_url,{httpReferrer:e.base_url});const r=await F(t,e);console.log(`[crawler] found ${r.length} links from ${e.remark||e.base_url}`);const s=[];for(const a of r){const o=await W(t,e,a);if(!o||!o.title||!o.contentHtml){console.log(`[crawler] skip empty result for ${a}`);continue}const i={url:o.url||a,title:o.title,content_html:o.contentHtml,content_markdown:C(o.contentHtml),published_at:D(o.timeText)};s.push(i);try{await N(i)}catch(c){console.warn("[crawler] push single news item failed",c)}}return console.log(`[crawler] processed ${s.length} items from ${e.remark||e.base_url}`),{success:!0,data:s}}catch(r){return console.warn("[crawler] rule failed",e.remark||e.base_url,r),{success:!1,error:r instanceof Error?r.message:String(r)}}}function j(){if($)return;$=!0;const e=p.interval_ms,t=async()=>{const r=await P();console.log(`[crawler] scheduled run, rules=${r.length}`),p.running=!0,p.running_source=void 0;try{for(const s of r)await J(s)}finally{p.running=!1,p.running_source=void 0,p.next_run_at=new Date(Date.now()+e).toISOString()}};p.next_run_at=new Date(Date.now()+5e3).toISOString(),setTimeout(t,5e3),setInterval(t,e)}exports.initCrawler=E;
+ `)}catch(s){return console.warn("[crawler] failed to extract page content",r,s),null}}async function J(t){const e=v();try{await e.loadURL(t.base_url,{httpReferrer:t.base_url});const r=await q(e,t);console.log(`[crawler] found ${r.length} links from ${t.remark||t.base_url}`);const s=[];for(const o of r){const a=await W(e,t,o);if(!a||!a.title||!a.contentHtml){console.log(`[crawler] skip empty result for ${o}`);continue}const i={url:a.url||o,title:a.title,content_html:a.contentHtml,content_markdown:D(a.contentHtml),published_at:L(a.timeText)};s.push(i);try{await N(i)}catch(l){console.warn("[crawler] push single news item failed",l)}}return console.log(`[crawler] processed ${s.length} items from ${t.remark||t.base_url}`),{success:!0,data:s}}catch(r){return console.warn("[crawler] rule failed",t.remark||t.base_url,r),{success:!1,error:r instanceof Error?r.message:String(r)}}}function j(){if(_)return;_=!0;const t=m.interval_ms,e=async()=>{const r=await E();console.log(`[crawler] scheduled run, rules=${r.length}`),m.running=!0,m.running_source=void 0;try{for(const s of r)await J(s)}finally{m.running=!1,m.running_source=void 0,m.next_run_at=new Date(Date.now()+t).toISOString()}};m.next_run_at=new Date(Date.now()+5e3).toISOString(),setTimeout(e,5e3),setInterval(e,t)}exports.initCrawler=H;
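
The minified bundle above is hard to scan; its substance appears un-minified in index.mjs below. Three behavioral changes stand out in 0.2.0 → 0.4.0: a new devMode flag, a persisted rules cache with an expiry (the constant 18e6 is 18,000,000 ms, i.e. 5 hours, matching the devMode documentation further down, while the scheduler interval 1800 * 1e3 ms is 30 minutes), and de-duplication of pushed items against a stored news.pushedUrls list. A readable sketch of the new cache check follows; the names RulesCacheEntry, KeyValueStore, CACHE_TTL_MS, and readCachedRules are illustrative, not the package's own:

```ts
// Sketch of the 0.4.0 rules-cache logic, de-minified from the diff above.
const CACHE_TTL_MS = 18e6; // 18,000,000 ms = 5 hours

type RulesCacheEntry = { rules: unknown[]; updatedAt?: string };

interface KeyValueStore {
  get(key: string): unknown;
  delete(key: string): void;
}

function readCachedRules(store: KeyValueStore): unknown[] {
  const cached = store.get("news.rules");
  if (Array.isArray(cached)) return cached; // legacy 0.2.0 shape: a bare array, no expiry
  const entry = cached as RulesCacheEntry | undefined; // 0.4.0 shape: { rules, updatedAt }
  if (entry && Array.isArray(entry.rules)) {
    const writtenAt = entry.updatedAt ? new Date(entry.updatedAt).getTime() : 0;
    if (writtenAt > 0 && Date.now() - writtenAt <= CACHE_TTL_MS) return entry.rules;
    store.delete("news.rules"); // stale entry: drop it and fall back to the rules API
  }
  return []; // caller fetches fresh rules and re-caches them (unless devMode is set)
}
```

Note that the bare-array shape written by 0.2.0 is still accepted without a TTL check, so existing installs keep working after the upgrade.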
package/dist/index.mjs CHANGED
@@ -1,9 +1,9 @@
- import k from "electron-store";
+ import A from "electron-store";
  import { BrowserWindow as x } from "electron";
- import U from "turndown";
+ import O from "turndown";
  function D() {
- const e = x.getAllWindows().find((r) => r.title === "crawler-hidden-window");
- return e || new x({
+ const t = x.getAllWindows().find((r) => r.title === "crawler-hidden-window");
+ return t || new x({
  show: !1,
  webPreferences: {
  sandbox: !1
@@ -11,7 +11,7 @@ function D() {
  title: "crawler-hidden-window"
  });
  }
- const S = new U({
+ const I = new O({
  headingStyle: "atx",
  // use #-style (ATX) headings
  codeBlockStyle: "fenced",
@@ -28,152 +28,172 @@ const S = new U({
  // full link reference style
  preformattedCode: !1,
  // do not use preformatted code
- blankReplacement: (e, t) => t.nodeName === "BR" ? `
+ blankReplacement: (t, e) => e.nodeName === "BR" ? `
  ` : ""
  });
- S.addRule("preserveLineBreaks", {
+ I.addRule("preserveLineBreaks", {
  filter: ["br"],
  replacement: () => `
  `
  });
- S.addRule("images", {
+ I.addRule("images", {
  filter: "img",
- replacement: (e, t) => {
- const r = t.alt || "", s = t.src || t.getAttribute("src") || "", a = t.title || "";
- return a ? `![${r}](${s} "${a}")` : `![${r}](${s})`;
+ replacement: (t, e) => {
+ const r = e.alt || "", s = e.src || e.getAttribute("src") || "", o = e.title || "";
+ return o ? `![${r}](${s} "${o}")` : `![${r}](${s})`;
  }
  });
- function A(e) {
- return e.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<noscript[\s\S]*?<\/noscript>/gi, "").replace(/<iframe[\s\S]*?<\/iframe>/gi, "").replace(/on\w+="[^"]*"/gi, "").replace(/on\w+='[^']*'/gi, "").trim();
+ function _(t) {
+ return t.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<noscript[\s\S]*?<\/noscript>/gi, "").replace(/<iframe[\s\S]*?<\/iframe>/gi, "").replace(/on\w+="[^"]*"/gi, "").replace(/on\w+='[^']*'/gi, "").trim();
  }
- function L(e) {
- if (!e || !e.trim())
+ function v(t) {
+ if (!t || !t.trim())
  return "";
  try {
- const t = A(e);
- if (!t)
+ const e = _(t);
+ if (!e)
  return "";
- let r = S.turndown(t);
+ let r = I.turndown(e);
  return r = r.replace(/\n{3,}/g, `
 
  `), r = r.split(`
  `).map((s) => s.trimEnd()).join(`
  `), r.trim();
- } catch (t) {
- return console.error("[normalizeMarkdown] 转换失败:", t), A(e).replace(/<[^>]+>/g, "").replace(/\n{3,}/g, `
+ } catch (e) {
+ return console.error("[normalizeMarkdown] 转换失败:", e), _(t).replace(/<[^>]+>/g, "").replace(/\n{3,}/g, `
 
  `).trim();
  }
  }
- function v(e) {
- const t = /* @__PURE__ */ new Date(), r = t.getFullYear(), s = t.getMonth() + 1, a = t.getDate(), o = t.getHours(), l = t.getMinutes();
- if (!e || !e.trim())
- return t.toISOString();
- const c = e.trim(), w = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/, y = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, _ = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/, T = /(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, R = /(\d{1,2})[月\-/](\d{1,2})[日]?/, O = /(\d{1,2})[:时](\d{1,2})[分]?/;
- let g = r, u = s, f = a, m = o, p = l, n = c.match(w);
+ function L(t) {
+ const e = /* @__PURE__ */ new Date(), r = e.getFullYear(), s = e.getMonth() + 1, o = e.getDate(), a = e.getHours(), i = e.getMinutes();
+ if (!t || !t.trim())
+ return e.toISOString();
+ const l = t.trim(), y = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/, S = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, R = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/, U = /(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, T = /(\d{1,2})[月\-/](\d{1,2})[日]?/, M = /(\d{1,2})[:时](\d{1,2})[分]?/;
+ let g = r, u = s, f = o, d = a, m = i, n = l.match(y);
  if (n)
- g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), m = parseInt(n[4], 10), p = parseInt(n[5], 10);
- else if (n = c.match(y), n)
- g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), m = parseInt(n[4], 10), p = parseInt(n[5], 10);
- else if (n = c.match(_), n)
+ g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), d = parseInt(n[4], 10), m = parseInt(n[5], 10);
+ else if (n = l.match(S), n)
+ g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), d = parseInt(n[4], 10), m = parseInt(n[5], 10);
+ else if (n = l.match(R), n)
  g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10);
- else if (n = c.match(T), n)
- u = parseInt(n[1], 10), f = parseInt(n[2], 10), m = parseInt(n[3], 10), p = parseInt(n[4], 10);
- else if (n = c.match(R), n)
+ else if (n = l.match(U), n)
+ u = parseInt(n[1], 10), f = parseInt(n[2], 10), d = parseInt(n[3], 10), m = parseInt(n[4], 10);
+ else if (n = l.match(T), n)
  u = parseInt(n[1], 10), f = parseInt(n[2], 10);
- else if (n = c.match(O), n)
- m = parseInt(n[1], 10), p = parseInt(n[2], 10);
+ else if (n = l.match(M), n)
+ d = parseInt(n[1], 10), m = parseInt(n[2], 10);
  else {
- const I = new Date(c);
- return Number.isNaN(I.getTime()) ? t.toISOString() : I.toISOString();
+ const k = new Date(l);
+ return Number.isNaN(k.getTime()) ? e.toISOString() : k.toISOString();
  }
- (u < 1 || u > 12) && (u = s), (f < 1 || f > 31) && (f = a), (m < 0 || m > 23) && (m = o), (p < 0 || p > 59) && (p = l);
- const h = new Date(g, u - 1, f, m, p, 0, 0);
- return h.getFullYear() !== g || h.getMonth() !== u - 1 || h.getDate() !== f ? t.toISOString() : C(h);
+ (u < 1 || u > 12) && (u = s), (f < 1 || f > 31) && (f = o), (d < 0 || d > 23) && (d = a), (m < 0 || m > 59) && (m = i);
+ const h = new Date(g, u - 1, f, d, m, 0, 0);
+ return h.getFullYear() !== g || h.getMonth() !== u - 1 || h.getDate() !== f ? e.toISOString() : C(h);
  }
- function C(e) {
- const t = (y) => y.toString().padStart(2, "0"), r = e.getFullYear(), s = t(e.getMonth() + 1), a = t(e.getDate()), o = t(e.getHours()), l = t(e.getMinutes()), c = t(e.getSeconds()), w = e.getMilliseconds().toString().padStart(3, "0");
- return `${r}-${s}-${a}T${o}:${l}:${c}.${w}`;
+ function C(t) {
+ const e = (S) => S.toString().padStart(2, "0"), r = t.getFullYear(), s = e(t.getMonth() + 1), o = e(t.getDate()), a = e(t.getHours()), i = e(t.getMinutes()), l = e(t.getSeconds()), y = t.getMilliseconds().toString().padStart(3, "0");
- return `${r}-${s}-${a}T${o}:${l}:${c}.${w}`;
+ return `${r}-${s}-${o}T${a}:${i}:${l}.${y}`;
  }
- const M = k.default || k, b = new M();
- async function P() {
- if (i.rules && i.rules.length > 0)
- return i.rules;
- const e = b.get("news.rules") || [];
- if (e.length > 0)
- return e;
- const t = await E();
- return t.length > 0 ? (b.set("news.rules", t), t) : [];
+ const E = A.default || A, w = new E();
+ async function H() {
+ if (c.rules && c.rules.length > 0)
+ return c.rules;
+ if (c.devMode)
+ return await b();
+ const e = w.get("news.rules");
+ let r = [];
+ if (Array.isArray(e))
+ r = e;
+ else if (e && Array.isArray(e.rules)) {
+ const o = e.updatedAt ? new Date(e.updatedAt).getTime() : 0, a = Date.now();
+ o > 0 && a - o <= 18e6 ? r = e.rules : w.delete("news.rules");
+ }
+ if (r.length > 0)
+ return r;
+ const s = await b();
+ return s.length > 0 ? (c.devMode || w.set("news.rules", {
+ rules: s,
+ updatedAt: (/* @__PURE__ */ new Date()).toISOString()
+ }), s) : [];
  }
- const d = {
+ const p = {
  running: !1,
  interval_ms: 1800 * 1e3
  };
- let i = {
+ let c = {
  rules: [],
  rulesApiUrl: void 0,
  pushApiUrl: void 0,
- ruleTransformer: (e) => e,
+ devMode: !1,
+ ruleTransformer: (t) => t,
  newsItemFieldMap: void 0
  }, $ = !1;
- function z(e) {
- i = {
- ...i,
- ...e,
+ function z(t) {
+ c = {
+ ...c,
+ ...t,
  // ensure ruleTransformer always exists
- ruleTransformer: e.ruleTransformer || ((t) => t && typeof t == "object" && "data" in t ? t.data : t)
+ ruleTransformer: t.ruleTransformer || ((e) => e && typeof e == "object" && "data" in e ? e.data : e)
  }, W();
  }
- async function E() {
- if (!i.rulesApiUrl)
+ async function b() {
+ if (!c.rulesApiUrl)
  return [];
  try {
- const e = await fetch(i.rulesApiUrl);
- if (!e.ok)
- throw new Error(`Failed to fetch rules from API: ${e.status} ${e.statusText}`);
- const t = await e.json(), r = i.ruleTransformer(t);
+ const t = await fetch(c.rulesApiUrl);
+ if (!t.ok)
+ throw new Error(`Failed to fetch rules from API: ${t.status} ${t.statusText}`);
+ const e = await t.json(), r = c.ruleTransformer(e);
  return Array.isArray(r) ? r : (console.warn("[crawler] Rules API returned non-array data, using empty array instead"), []);
- } catch (e) {
- return console.error("[crawler] Failed to fetch rules from API:", e), [];
+ } catch (t) {
+ return console.error("[crawler] Failed to fetch rules from API:", t), [];
  }
  }
- async function H(e) {
- if (i.pushApiUrl)
- try {
- const t = N(e), r = await fetch(i.pushApiUrl, {
- method: "POST",
- headers: {
- "Content-Type": "application/json"
- },
- body: JSON.stringify(t)
- });
- if (!r.ok)
- throw new Error(`Failed to push results to API: ${r.status} ${r.statusText}`);
- console.log("[crawler] Results pushed to API successfully");
- } catch (t) {
- console.error("[crawler] Failed to push results to API:", t);
- }
+ async function P(t) {
+ if (!c.pushApiUrl)
+ return;
+ const e = w.get("news.pushedUrls") || [];
+ if (e.includes(t.url)) {
+ console.log(`[crawler] URL already pushed, skipping: ${t.url}`);
+ return;
+ }
+ try {
+ const r = N(t), s = await fetch(c.pushApiUrl, {
+ method: "POST",
+ headers: {
+ "Content-Type": "application/json"
+ },
+ body: JSON.stringify(r)
+ });
+ if (!s.ok)
+ throw new Error(`Failed to push results to API: ${s.status} ${s.statusText}`);
+ const o = [...e, t.url];
+ w.set("news.pushedUrls", o), console.log("[crawler] Results pushed to API successfully");
+ } catch (r) {
+ console.error("[crawler] Failed to push results to API:", r);
+ }
  }
- function N(e) {
- const t = i.newsItemFieldMap;
- if (!t || Object.keys(t).length === 0)
- return e;
- const r = {}, s = Object.entries(e);
- for (const [a, o] of s) {
- const l = t[a];
- if (l === "-") continue;
- const c = typeof l == "string" ? l : a;
- r[c] = o;
+ function N(t) {
+ const e = c.newsItemFieldMap;
+ if (!e || Object.keys(e).length === 0)
+ return t;
+ const r = {}, s = Object.entries(t);
+ for (const [o, a] of s) {
+ const i = e[o];
+ if (i === "-") continue;
+ const l = typeof i == "string" ? i : o;
+ r[l] = a;
  }
  return r;
  }
- async function F(e, t) {
- return await e.webContents.executeJavaScript(
+ async function F(t, e) {
+ return await t.webContents.executeJavaScript(
  `
  (() => {
  const links = []
  // find all links within the specified range
- const rangeElements = document.querySelectorAll(${JSON.stringify(t.home_range_selector)})
+ const rangeElements = document.querySelectorAll(${JSON.stringify(e.home_range_selector)})
  if (rangeElements.length === 0) {
  // if no range is found, search the whole document
  const allLinks = document.querySelectorAll('a')
@@ -205,9 +225,9 @@ async function F(e, t) {
  `
  );
  }
- async function q(e, t, r) {
+ async function q(t, e, r) {
  try {
- return await e.loadURL(r, { httpReferrer: t.base_url }), await e.webContents.executeJavaScript(
+ return await t.loadURL(r, { httpReferrer: e.base_url }), await t.webContents.executeJavaScript(
  `
  (() => {
  const pickText = (sel) => {
@@ -246,11 +266,11 @@ async function q(e, t, r) {
  return clone.innerHTML || ''
  }
  return {
- title: pickText(${JSON.stringify(t.title_selector)}),
- contentHtml: pickContentHtml(${JSON.stringify(t.content_selector)}, ${JSON.stringify(
- t.exclude_selectors || []
+ title: pickText(${JSON.stringify(e.title_selector)}),
+ contentHtml: pickContentHtml(${JSON.stringify(e.content_selector)}, ${JSON.stringify(
+ e.exclude_selectors || []
  )}),
- timeText: pickText(${JSON.stringify(t.time_selector)}),
+ timeText: pickText(${JSON.stringify(e.time_selector)}),
  url: location.href
  }
  })()
@@ -260,39 +280,39 @@ async function q(e, t, r) {
  return console.warn("[crawler] failed to extract page content", r, s), null;
  }
  }
- async function J(e) {
- const t = D();
+ async function J(t) {
+ const e = D();
  try {
- await t.loadURL(e.base_url, { httpReferrer: e.base_url });
- const r = await F(t, e);
- console.log(`[crawler] found ${r.length} links from ${e.remark || e.base_url}`);
+ await e.loadURL(t.base_url, { httpReferrer: t.base_url });
+ const r = await F(e, t);
+ console.log(`[crawler] found ${r.length} links from ${t.remark || t.base_url}`);
  const s = [];
- for (const a of r) {
- const o = await q(t, e, a);
- if (!o || !o.title || !o.contentHtml) {
- console.log(`[crawler] skip empty result for ${a}`);
+ for (const o of r) {
+ const a = await q(e, t, o);
+ if (!a || !a.title || !a.contentHtml) {
+ console.log(`[crawler] skip empty result for ${o}`);
  continue;
  }
- const l = {
- url: o.url || a,
- title: o.title,
- content_html: o.contentHtml,
- content_markdown: L(o.contentHtml),
- published_at: v(o.timeText)
+ const i = {
+ url: a.url || o,
+ title: a.title,
+ content_html: a.contentHtml,
+ content_markdown: v(a.contentHtml),
+ published_at: L(a.timeText)
  };
- s.push(l);
+ s.push(i);
  try {
- await H(l);
- } catch (c) {
- console.warn("[crawler] push single news item failed", c);
+ await P(i);
+ } catch (l) {
+ console.warn("[crawler] push single news item failed", l);
  }
  }
- return console.log(`[crawler] processed ${s.length} items from ${e.remark || e.base_url}`), {
+ return console.log(`[crawler] processed ${s.length} items from ${t.remark || t.base_url}`), {
  success: !0,
  data: s
  };
  } catch (r) {
- return console.warn("[crawler] rule failed", e.remark || e.base_url, r), {
+ return console.warn("[crawler] rule failed", t.remark || t.base_url, r), {
  success: !1,
  error: r instanceof Error ? r.message : String(r)
  };
@@ -301,17 +321,17 @@ async function J(e) {
  function W() {
  if ($) return;
  $ = !0;
- const e = d.interval_ms, t = async () => {
- const r = await P();
- console.log(`[crawler] scheduled run, rules=${r.length}`), d.running = !0, d.running_source = void 0;
+ const t = p.interval_ms, e = async () => {
+ const r = await H();
+ console.log(`[crawler] scheduled run, rules=${r.length}`), p.running = !0, p.running_source = void 0;
  try {
  for (const s of r)
  await J(s);
  } finally {
- d.running = !1, d.running_source = void 0, d.next_run_at = new Date(Date.now() + e).toISOString();
+ p.running = !1, p.running_source = void 0, p.next_run_at = new Date(Date.now() + t).toISOString();
  }
  };
- d.next_run_at = new Date(Date.now() + 5e3).toISOString(), setTimeout(t, 5e3), setInterval(t, e);
+ p.next_run_at = new Date(Date.now() + 5e3).toISOString(), setTimeout(e, 5e3), setInterval(e, t);
  }
  export {
  z as initCrawler
@@ -19,6 +19,12 @@ export type CrawlerConfig = {
  rulesApiUrl?: string;
  pushApiUrl?: string;
  ruleTransformer?: (data: any) => any;
+ /**
+ * Whether to run in development mode
+ * - Development mode: the local cache is not used; in-memory rules take priority, otherwise rules are fetched directly from the API
+ * - Production mode: the local cache is used (with a 5-hour expiry)
+ */
+ devMode?: boolean;
  newsItemFieldMap?: Partial<Record<keyof NewsItem, string | '-'>>;
  };
  export type NewsItem = {
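
The declaration hunk above documents the new devMode switch. Below is a minimal consumer sketch from an Electron main process; the URLs and the field map are placeholders, while initCrawler and the CrawlerConfig fields are the ones shown in this diff:

```ts
import { app } from "electron";
import { initCrawler } from "@howuse/electron-crawler";

app.whenReady().then(() => {
  initCrawler({
    devMode: !app.isPackaged, // dev builds bypass the 5-hour on-disk rules cache
    rulesApiUrl: "https://example.com/api/crawler/rules",
    pushApiUrl: "https://example.com/api/crawler/news",
    // Optional: unwrap an API envelope; the built-in default handles { data: [...] }.
    ruleTransformer: (res) => (res && typeof res === "object" && "data" in res ? res.data : res),
    // Rename or drop NewsItem fields before pushing; "-" drops the field entirely.
    newsItemFieldMap: { content_html: "-", content_markdown: "markdown" },
  });
});
```

Once initialized, the scheduler fires after 5 seconds and then every 30 minutes (1800 * 1e3 ms); in 0.4.0 each item is checked against the persisted news.pushedUrls list before being pushed, so restarts no longer re-push articles that were already delivered.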
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@howuse/electron-crawler",
- "version": "0.2.0",
+ "version": "0.4.0",
  "description": "Electron-based crawler toolkit for scraping news and stock details",
  "keywords": [
  "electron",