@howuse/electron-crawler 0.1.0 → 0.3.0

This diff shows the published contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
package/dist/index.js CHANGED
@@ -1,12 +1,12 @@
- "use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const k=require("electron-store"),x=require("electron"),U=require("turndown");function v(){const e=x.BrowserWindow.getAllWindows().find(r=>r.title==="crawler-hidden-window");return e||new x.BrowserWindow({show:!1,webPreferences:{sandbox:!1},title:"crawler-hidden-window"})}const S=new U({headingStyle:"atx",codeBlockStyle:"fenced",bulletListMarker:"-",emDelimiter:"*",strongDelimiter:"**",linkStyle:"inlined",linkReferenceStyle:"full",preformattedCode:!1,blankReplacement:(e,t)=>t.nodeName==="BR"?`
- `:""});S.addRule("preserveLineBreaks",{filter:["br"],replacement:()=>`
- `});S.addRule("images",{filter:"img",replacement:(e,t)=>{const r=t.alt||"",s=t.src||t.getAttribute("src")||"",a=t.title||"";return a?`![${r}](${s} "${a}")`:`![${r}](${s})`}});function A(e){return e.replace(/<script[\s\S]*?<\/script>/gi,"").replace(/<style[\s\S]*?<\/style>/gi,"").replace(/<noscript[\s\S]*?<\/noscript>/gi,"").replace(/<iframe[\s\S]*?<\/iframe>/gi,"").replace(/on\w+="[^"]*"/gi,"").replace(/on\w+='[^']*'/gi,"").trim()}function C(e){if(!e||!e.trim())return"";try{const t=A(e);if(!t)return"";let r=S.turndown(t);return r=r.replace(/\n{3,}/g,`
+ "use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const x=require("electron-store"),A=require("electron"),O=require("turndown");function L(){const e=A.BrowserWindow.getAllWindows().find(r=>r.title==="crawler-hidden-window");return e||new A.BrowserWindow({show:!1,webPreferences:{sandbox:!1},title:"crawler-hidden-window"})}const k=new O({headingStyle:"atx",codeBlockStyle:"fenced",bulletListMarker:"-",emDelimiter:"*",strongDelimiter:"**",linkStyle:"inlined",linkReferenceStyle:"full",preformattedCode:!1,blankReplacement:(e,t)=>t.nodeName==="BR"?`
+ `:""});k.addRule("preserveLineBreaks",{filter:["br"],replacement:()=>`
+ `});k.addRule("images",{filter:"img",replacement:(e,t)=>{const r=t.alt||"",s=t.src||t.getAttribute("src")||"",o=t.title||"";return o?`![${r}](${s} "${o}")`:`![${r}](${s})`}});function b(e){return e.replace(/<script[\s\S]*?<\/script>/gi,"").replace(/<style[\s\S]*?<\/style>/gi,"").replace(/<noscript[\s\S]*?<\/noscript>/gi,"").replace(/<iframe[\s\S]*?<\/iframe>/gi,"").replace(/on\w+="[^"]*"/gi,"").replace(/on\w+='[^']*'/gi,"").trim()}function v(e){if(!e||!e.trim())return"";try{const t=b(e);if(!t)return"";let r=k.turndown(t);return r=r.replace(/\n{3,}/g,`
 
  `),r=r.split(`
  `).map(s=>s.trimEnd()).join(`
- `),r.trim()}catch(t){return console.error("[normalizeMarkdown] 转换失败:",t),A(e).replace(/<[^>]+>/g,"").replace(/\n{3,}/g,`
+ `),r.trim()}catch(t){return console.error("[normalizeMarkdown] 转换失败:",t),b(e).replace(/<[^>]+>/g,"").replace(/\n{3,}/g,`
 
- `).trim()}}function D(e){const t=new Date,r=t.getFullYear(),s=t.getMonth()+1,a=t.getDate(),o=t.getHours(),i=t.getMinutes();if(!e||!e.trim())return t.toISOString();const c=e.trim(),w=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/,y=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,T=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/,_=/(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,R=/(\d{1,2})[月\-/](\d{1,2})[日]?/,O=/(\d{1,2})[:时](\d{1,2})[分]?/;let g=r,u=s,f=a,m=o,d=i,n=c.match(w);if(n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),m=parseInt(n[4],10),d=parseInt(n[5],10);else if(n=c.match(y),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),m=parseInt(n[4],10),d=parseInt(n[5],10);else if(n=c.match(T),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10);else if(n=c.match(_),n)u=parseInt(n[1],10),f=parseInt(n[2],10),m=parseInt(n[3],10),d=parseInt(n[4],10);else if(n=c.match(R),n)u=parseInt(n[1],10),f=parseInt(n[2],10);else if(n=c.match(O),n)m=parseInt(n[1],10),d=parseInt(n[2],10);else{const I=new Date(c);return Number.isNaN(I.getTime())?t.toISOString():I.toISOString()}(u<1||u>12)&&(u=s),(f<1||f>31)&&(f=a),(m<0||m>23)&&(m=o),(d<0||d>59)&&(d=i);const h=new Date(g,u-1,f,m,d,0,0);return h.getFullYear()!==g||h.getMonth()!==u-1||h.getDate()!==f?t.toISOString():L(h)}function L(e){const t=y=>y.toString().padStart(2,"0"),r=e.getFullYear(),s=t(e.getMonth()+1),a=t(e.getDate()),o=t(e.getHours()),i=t(e.getMinutes()),c=t(e.getSeconds()),w=e.getMilliseconds().toString().padStart(3,"0");return`${r}-${s}-${a}T${o}:${i}:${c}.${w}`}const M=k.default||k,b=new M;async function P(){if(l.rules&&l.rules.length>0)return l.rules;const e=b.get("news.rules")||[];if(e.length>0)return e;const t=await H();return t.length>0?(b.set("news.rules",t),t):[]}const p={running:!1,interval_ms:1800*1e3};let l={rules:[],rulesApiUrl:void 0,pushApiUrl:void 0,ruleTransformer:e=>e,newsItemFieldMap:void 0},$=!1;function E(e){l={...l,...e,ruleTransformer:e.ruleTransformer||(t=>t&&typeof t=="object"&&"data"in t?t.data:t)},j()}async function H(){if(!l.rulesApiUrl)return[];try{const e=await fetch(l.rulesApiUrl);if(!e.ok)throw new Error(`Failed to fetch rules from API: ${e.status} ${e.statusText}`);const t=await e.json(),r=l.ruleTransformer(t);return Array.isArray(r)?r:(console.warn("[crawler] Rules API returned non-array data, using empty array instead"),[])}catch(e){return console.error("[crawler] Failed to fetch rules from API:",e),[]}}async function N(e){if(l.pushApiUrl)try{const t=q(e),r=await fetch(l.pushApiUrl,{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify(t)});if(!r.ok)throw new Error(`Failed to push results to API: ${r.status} ${r.statusText}`);console.log("[crawler] Results pushed to API successfully")}catch(t){console.error("[crawler] Failed to push results to API:",t)}}function q(e){const t=l.newsItemFieldMap;if(!t||Object.keys(t).length===0)return e;const r={},s=Object.entries(e);for(const[a,o]of s){const i=t[a];if(i==="-")continue;const c=typeof i=="string"?i:a;r[c]=o}return r}async function F(e,t){return await e.webContents.executeJavaScript(`
+ `).trim()}}function C(e){const t=new Date,r=t.getFullYear(),s=t.getMonth()+1,o=t.getDate(),a=t.getHours(),c=t.getMinutes();if(!e||!e.trim())return t.toISOString();const l=e.trim(),y=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/,S=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,T=/(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/,U=/(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/,_=/(\d{1,2})[月\-/](\d{1,2})[日]?/,R=/(\d{1,2})[:时](\d{1,2})[分]?/;let g=r,u=s,f=o,d=a,m=c,n=l.match(y);if(n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),m=parseInt(n[5],10);else if(n=l.match(S),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10),d=parseInt(n[4],10),m=parseInt(n[5],10);else if(n=l.match(T),n)g=parseInt(n[1],10),u=parseInt(n[2],10),f=parseInt(n[3],10);else if(n=l.match(U),n)u=parseInt(n[1],10),f=parseInt(n[2],10),d=parseInt(n[3],10),m=parseInt(n[4],10);else if(n=l.match(_),n)u=parseInt(n[1],10),f=parseInt(n[2],10);else if(n=l.match(R),n)d=parseInt(n[1],10),m=parseInt(n[2],10);else{const I=new Date(l);return Number.isNaN(I.getTime())?t.toISOString():I.toISOString()}(u<1||u>12)&&(u=s),(f<1||f>31)&&(f=o),(d<0||d>23)&&(d=a),(m<0||m>59)&&(m=c);const h=new Date(g,u-1,f,d,m,0,0);return h.getFullYear()!==g||h.getMonth()!==u-1||h.getDate()!==f?t.toISOString():D(h)}function D(e){const t=S=>S.toString().padStart(2,"0"),r=e.getFullYear(),s=t(e.getMonth()+1),o=t(e.getDate()),a=t(e.getHours()),c=t(e.getMinutes()),l=t(e.getSeconds()),y=e.getMilliseconds().toString().padStart(3,"0");return`${r}-${s}-${o}T${a}:${c}:${l}.${y}`}const M=x.default||x,w=new M;async function P(){if(i.rules&&i.rules.length>0)return i.rules;const e=w.get("news.rules")||[];if(e.length>0)return e;const t=await H();return t.length>0?(w.set("news.rules",t),t):[]}const p={running:!1,interval_ms:1800*1e3};let i={rules:[],rulesApiUrl:void 0,pushApiUrl:void 0,ruleTransformer:e=>e,newsItemFieldMap:void 0},$=!1;function E(e){i={...i,...e,ruleTransformer:e.ruleTransformer||(t=>t&&typeof t=="object"&&"data"in t?t.data:t)},j()}async function H(){if(!i.rulesApiUrl)return[];try{const e=await fetch(i.rulesApiUrl);if(!e.ok)throw new Error(`Failed to fetch rules from API: ${e.status} ${e.statusText}`);const t=await e.json(),r=i.ruleTransformer(t);return Array.isArray(r)?r:(console.warn("[crawler] Rules API returned non-array data, using empty array instead"),[])}catch(e){return console.error("[crawler] Failed to fetch rules from API:",e),[]}}async function N(e){if(!i.pushApiUrl)return;const t=w.get("news.pushedUrls")||[];if(t.includes(e.url)){console.log(`[crawler] URL already pushed, skipping: ${e.url}`);return}try{const r=q(e),s=await fetch(i.pushApiUrl,{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify(r)});if(!s.ok)throw new Error(`Failed to push results to API: ${s.status} ${s.statusText}`);const o=[...t,e.url];w.set("news.pushedUrls",o),console.log("[crawler] Results pushed to API successfully")}catch(r){console.error("[crawler] Failed to push results to API:",r)}}function q(e){const t=i.newsItemFieldMap;if(!t||Object.keys(t).length===0)return e;const r={},s=Object.entries(e);for(const[o,a]of s){const c=t[o];if(c==="-")continue;const l=typeof c=="string"?c:o;r[l]=a}return r}async function F(e,t){return await e.webContents.executeJavaScript(`
  (() => {
  const links = []
  // 在指定范围内查找所有链接
@@ -83,4 +83,4 @@
  url: location.href
  }
  })()
- `)}catch(s){return console.warn("[crawler] failed to extract page content",r,s),null}}async function J(e){const t=v();try{await t.loadURL(e.base_url,{httpReferrer:e.base_url});const r=await F(t,e);console.log(`[crawler] found ${r.length} links from ${e.remark||e.base_url}`);const s=[];for(const a of r){const o=await W(t,e,a);if(!o||!o.title||!o.contentHtml){console.log(`[crawler] skip empty result for ${a}`);continue}const i={url:o.url||a,title:o.title,content_html:o.contentHtml,content_markdown:C(o.contentHtml),published_at:D(o.timeText)};s.push(i);try{await N(i)}catch(c){console.warn("[crawler] push single news item failed",c)}}return console.log(`[crawler] processed ${s.length} items from ${e.remark||e.base_url}`),{success:!0,data:s}}catch(r){return console.warn("[crawler] rule failed",e.remark||e.base_url,r),{success:!1,error:r instanceof Error?r.message:String(r)}}}function j(){if($)return;$=!0;const e=p.interval_ms,t=async()=>{const r=await P();console.log(`[crawler] scheduled run, rules=${r.length}`),p.running=!0,p.running_source=void 0;try{for(const s of r)await J(s)}finally{p.running=!1,p.running_source=void 0,p.next_run_at=new Date(Date.now()+e).toISOString()}};p.next_run_at=new Date(Date.now()+5e3).toISOString(),setTimeout(t,5e3),setInterval(t,e)}exports.initCrawler=E;
+ `)}catch(s){return console.warn("[crawler] failed to extract page content",r,s),null}}async function J(e){const t=L();try{await t.loadURL(e.base_url,{httpReferrer:e.base_url});const r=await F(t,e);console.log(`[crawler] found ${r.length} links from ${e.remark||e.base_url}`);const s=[];for(const o of r){const a=await W(t,e,o);if(!a||!a.title||!a.contentHtml){console.log(`[crawler] skip empty result for ${o}`);continue}const c={url:a.url||o,title:a.title,content_html:a.contentHtml,content_markdown:v(a.contentHtml),published_at:C(a.timeText)};s.push(c);try{await N(c)}catch(l){console.warn("[crawler] push single news item failed",l)}}return console.log(`[crawler] processed ${s.length} items from ${e.remark||e.base_url}`),{success:!0,data:s}}catch(r){return console.warn("[crawler] rule failed",e.remark||e.base_url,r),{success:!1,error:r instanceof Error?r.message:String(r)}}}function j(){if($)return;$=!0;const e=p.interval_ms,t=async()=>{const r=await P();console.log(`[crawler] scheduled run, rules=${r.length}`),p.running=!0,p.running_source=void 0;try{for(const s of r)await J(s)}finally{p.running=!1,p.running_source=void 0,p.next_run_at=new Date(Date.now()+e).toISOString()}};p.next_run_at=new Date(Date.now()+5e3).toISOString(),setTimeout(t,5e3),setInterval(t,e)}exports.initCrawler=E;
package/dist/index.mjs CHANGED
@@ -1,9 +1,9 @@
- import k from "electron-store";
- import { BrowserWindow as x } from "electron";
- import U from "turndown";
- function D() {
- const e = x.getAllWindows().find((r) => r.title === "crawler-hidden-window");
- return e || new x({
+ import x from "electron-store";
+ import { BrowserWindow as A } from "electron";
+ import L from "turndown";
+ function O() {
+ const e = A.getAllWindows().find((r) => r.title === "crawler-hidden-window");
+ return e || new A({
  show: !1,
  webPreferences: {
  sandbox: !1
@@ -11,7 +11,7 @@ function D() {
  title: "crawler-hidden-window"
  });
  }
- const S = new U({
+ const k = new L({
  headingStyle: "atx",
  // 使用 # 格式的标题
  codeBlockStyle: "fenced",
@@ -31,79 +31,79 @@ const S = new U({
  blankReplacement: (e, t) => t.nodeName === "BR" ? `
  ` : ""
  });
- S.addRule("preserveLineBreaks", {
+ k.addRule("preserveLineBreaks", {
  filter: ["br"],
  replacement: () => `
  `
  });
- S.addRule("images", {
+ k.addRule("images", {
  filter: "img",
  replacement: (e, t) => {
- const r = t.alt || "", s = t.src || t.getAttribute("src") || "", a = t.title || "";
- return a ? `![${r}](${s} "${a}")` : `![${r}](${s})`;
+ const r = t.alt || "", s = t.src || t.getAttribute("src") || "", o = t.title || "";
+ return o ? `![${r}](${s} "${o}")` : `![${r}](${s})`;
  }
  });
- function A(e) {
+ function $(e) {
  return e.replace(/<script[\s\S]*?<\/script>/gi, "").replace(/<style[\s\S]*?<\/style>/gi, "").replace(/<noscript[\s\S]*?<\/noscript>/gi, "").replace(/<iframe[\s\S]*?<\/iframe>/gi, "").replace(/on\w+="[^"]*"/gi, "").replace(/on\w+='[^']*'/gi, "").trim();
  }
- function L(e) {
+ function D(e) {
  if (!e || !e.trim())
  return "";
  try {
- const t = A(e);
+ const t = $(e);
  if (!t)
  return "";
- let r = S.turndown(t);
+ let r = k.turndown(t);
  return r = r.replace(/\n{3,}/g, `
 
  `), r = r.split(`
  `).map((s) => s.trimEnd()).join(`
  `), r.trim();
  } catch (t) {
- return console.error("[normalizeMarkdown] 转换失败:", t), A(e).replace(/<[^>]+>/g, "").replace(/\n{3,}/g, `
+ return console.error("[normalizeMarkdown] 转换失败:", t), $(e).replace(/<[^>]+>/g, "").replace(/\n{3,}/g, `
 
  `).trim();
  }
  }
  function v(e) {
- const t = /* @__PURE__ */ new Date(), r = t.getFullYear(), s = t.getMonth() + 1, a = t.getDate(), o = t.getHours(), l = t.getMinutes();
+ const t = /* @__PURE__ */ new Date(), r = t.getFullYear(), s = t.getMonth() + 1, o = t.getDate(), a = t.getHours(), c = t.getMinutes();
  if (!e || !e.trim())
  return t.toISOString();
- const c = e.trim(), w = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/, y = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, _ = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/, T = /(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, R = /(\d{1,2})[月\-/](\d{1,2})[日]?/, O = /(\d{1,2})[:时](\d{1,2})[分]?/;
- let g = r, u = s, f = a, m = o, p = l, n = c.match(w);
+ const l = e.trim(), y = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]+(\d{1,2}):(\d{1,2})\s+(?:星期|周)[一二三四五六日天]/, S = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, U = /(\d{4})[年\-/](\d{1,2})[月\-/](\d{1,2})[日]?/, _ = /(\d{1,2})[月\-/](\d{1,2})[日\s]*(\d{1,2}):(\d{1,2})/, T = /(\d{1,2})[月\-/](\d{1,2})[日]?/, R = /(\d{1,2})[:时](\d{1,2})[分]?/;
+ let g = r, u = s, f = o, m = a, p = c, n = l.match(y);
  if (n)
  g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), m = parseInt(n[4], 10), p = parseInt(n[5], 10);
- else if (n = c.match(y), n)
+ else if (n = l.match(S), n)
  g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10), m = parseInt(n[4], 10), p = parseInt(n[5], 10);
- else if (n = c.match(_), n)
+ else if (n = l.match(U), n)
  g = parseInt(n[1], 10), u = parseInt(n[2], 10), f = parseInt(n[3], 10);
- else if (n = c.match(T), n)
+ else if (n = l.match(_), n)
  u = parseInt(n[1], 10), f = parseInt(n[2], 10), m = parseInt(n[3], 10), p = parseInt(n[4], 10);
- else if (n = c.match(R), n)
+ else if (n = l.match(T), n)
  u = parseInt(n[1], 10), f = parseInt(n[2], 10);
- else if (n = c.match(O), n)
+ else if (n = l.match(R), n)
  m = parseInt(n[1], 10), p = parseInt(n[2], 10);
  else {
- const I = new Date(c);
+ const I = new Date(l);
  return Number.isNaN(I.getTime()) ? t.toISOString() : I.toISOString();
  }
- (u < 1 || u > 12) && (u = s), (f < 1 || f > 31) && (f = a), (m < 0 || m > 23) && (m = o), (p < 0 || p > 59) && (p = l);
+ (u < 1 || u > 12) && (u = s), (f < 1 || f > 31) && (f = o), (m < 0 || m > 23) && (m = a), (p < 0 || p > 59) && (p = c);
  const h = new Date(g, u - 1, f, m, p, 0, 0);
  return h.getFullYear() !== g || h.getMonth() !== u - 1 || h.getDate() !== f ? t.toISOString() : C(h);
  }
  function C(e) {
- const t = (y) => y.toString().padStart(2, "0"), r = e.getFullYear(), s = t(e.getMonth() + 1), a = t(e.getDate()), o = t(e.getHours()), l = t(e.getMinutes()), c = t(e.getSeconds()), w = e.getMilliseconds().toString().padStart(3, "0");
- return `${r}-${s}-${a}T${o}:${l}:${c}.${w}`;
+ const t = (S) => S.toString().padStart(2, "0"), r = e.getFullYear(), s = t(e.getMonth() + 1), o = t(e.getDate()), a = t(e.getHours()), c = t(e.getMinutes()), l = t(e.getSeconds()), y = e.getMilliseconds().toString().padStart(3, "0");
+ return `${r}-${s}-${o}T${a}:${c}:${l}.${y}`;
  }
- const M = k.default || k, b = new M();
+ const M = x.default || x, w = new M();
  async function P() {
  if (i.rules && i.rules.length > 0)
  return i.rules;
- const e = b.get("news.rules") || [];
+ const e = w.get("news.rules") || [];
  if (e.length > 0)
  return e;
  const t = await E();
- return t.length > 0 ? (b.set("news.rules", t), t) : [];
+ return t.length > 0 ? (w.set("news.rules", t), t) : [];
  }
  const d = {
  running: !1,
@@ -115,7 +115,7 @@ let i = {
  pushApiUrl: void 0,
  ruleTransformer: (e) => e,
  newsItemFieldMap: void 0
- }, $ = !1;
+ }, b = !1;
  function z(e) {
  i = {
  ...i,
@@ -138,32 +138,39 @@ async function E() {
  }
  }
  async function H(e) {
- if (i.pushApiUrl)
- try {
- const t = N(e), r = await fetch(i.pushApiUrl, {
- method: "POST",
- headers: {
- "Content-Type": "application/json"
- },
- body: JSON.stringify(t)
- });
- if (!r.ok)
- throw new Error(`Failed to push results to API: ${r.status} ${r.statusText}`);
- console.log("[crawler] Results pushed to API successfully");
- } catch (t) {
- console.error("[crawler] Failed to push results to API:", t);
- }
+ if (!i.pushApiUrl)
+ return;
+ const t = w.get("news.pushedUrls") || [];
+ if (t.includes(e.url)) {
+ console.log(`[crawler] URL already pushed, skipping: ${e.url}`);
+ return;
+ }
+ try {
+ const r = N(e), s = await fetch(i.pushApiUrl, {
+ method: "POST",
+ headers: {
+ "Content-Type": "application/json"
+ },
+ body: JSON.stringify(r)
+ });
+ if (!s.ok)
+ throw new Error(`Failed to push results to API: ${s.status} ${s.statusText}`);
+ const o = [...t, e.url];
+ w.set("news.pushedUrls", o), console.log("[crawler] Results pushed to API successfully");
+ } catch (r) {
+ console.error("[crawler] Failed to push results to API:", r);
+ }
  }
  function N(e) {
  const t = i.newsItemFieldMap;
  if (!t || Object.keys(t).length === 0)
  return e;
  const r = {}, s = Object.entries(e);
- for (const [a, o] of s) {
- const l = t[a];
- if (l === "-") continue;
- const c = typeof l == "string" ? l : a;
- r[c] = o;
+ for (const [o, a] of s) {
+ const c = t[o];
+ if (c === "-") continue;
+ const l = typeof c == "string" ? c : o;
+ r[l] = a;
  }
  return r;
  }
@@ -261,30 +268,30 @@ async function q(e, t, r) {
  }
  }
  async function J(e) {
- const t = D();
+ const t = O();
  try {
  await t.loadURL(e.base_url, { httpReferrer: e.base_url });
  const r = await F(t, e);
  console.log(`[crawler] found ${r.length} links from ${e.remark || e.base_url}`);
  const s = [];
- for (const a of r) {
- const o = await q(t, e, a);
- if (!o || !o.title || !o.contentHtml) {
- console.log(`[crawler] skip empty result for ${a}`);
+ for (const o of r) {
+ const a = await q(t, e, o);
+ if (!a || !a.title || !a.contentHtml) {
+ console.log(`[crawler] skip empty result for ${o}`);
  continue;
  }
- const l = {
- url: o.url || a,
- title: o.title,
- content_html: o.contentHtml,
- content_markdown: L(o.contentHtml),
- published_at: v(o.timeText)
+ const c = {
+ url: a.url || o,
+ title: a.title,
+ content_html: a.contentHtml,
+ content_markdown: D(a.contentHtml),
+ published_at: v(a.timeText)
  };
- s.push(l);
+ s.push(c);
  try {
- await H(l);
- } catch (c) {
- console.warn("[crawler] push single news item failed", c);
+ await H(c);
+ } catch (l) {
+ console.warn("[crawler] push single news item failed", l);
  }
  }
  return console.log(`[crawler] processed ${s.length} items from ${e.remark || e.base_url}`), {
@@ -299,8 +306,8 @@ async function J(e) {
  }
  }
  function W() {
- if ($) return;
- $ = !0;
+ if (b) return;
+ b = !0;
  const e = d.interval_ms, t = async () => {
  const r = await P();
  console.log(`[crawler] scheduled run, rules=${r.length}`), d.running = !0, d.running_source = void 0;
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@howuse/electron-crawler",
- "version": "0.1.0",
+ "version": "0.3.0",
  "description": "基于Electron的爬虫工具包,用于爬取新闻和股票详情",
  "keywords": [
  "electron",
@@ -18,8 +18,8 @@
  "url": "https://github.com/your-username/your-repo/issues"
  },
  "homepage": "https://github.com/your-username/your-repo#readme",
- "main": "dist/index.cjs",
- "module": "dist/index.js",
+ "main": "dist/index.js",
+ "module": "dist/index.mjs",
  "types": "dist/index.d.ts",
  "files": [
  "dist/**/*",
@@ -29,8 +29,8 @@
  "exports": {
  ".": {
  "types": "./dist/index.d.ts",
- "import": "./dist/index.js",
- "require": "./dist/index.cjs"
+ "import": "./dist/index.mjs",
+ "require": "./dist/index.js"
  }
  },
  "scripts": {