rssany 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +1 -1
- package/README.md +10 -11
- package/{plugins/sources → app/plugins/builtin}/email.rssany.js +92 -96
- package/app/plugins/builtin/rss.rssany.js +164 -0
- package/{plugins/templates → app/plugins}/site.rssany.js +6 -7
- package/dist/index.js +1200 -807
- package/dist/index.js.map +1 -1
- package/{config.examples.json → init/config.json} +7 -1
- package/init/sources.json +353 -0
- package/package.json +6 -7
- package/statics/401.html +1 -1
- package/statics/README.md +1 -1
- package/webui/build/200.html +16 -18
- package/webui/build/_app/immutable/assets/0.C6Q_nuW9.css +1 -0
- package/webui/build/_app/immutable/assets/10.Dj8_pmut.css +1 -0
- package/webui/build/_app/immutable/assets/11.qYZMiTb0.css +1 -0
- package/webui/build/_app/immutable/assets/12.Ct59LCqW.css +1 -0
- package/webui/build/_app/immutable/assets/13.BhO9zvFi.css +1 -0
- package/webui/build/_app/immutable/assets/14.CujIhjQK.css +1 -0
- package/webui/build/_app/immutable/assets/15.nNGjXhCQ.css +1 -0
- package/webui/build/_app/immutable/assets/16.PP9XLDf7.css +1 -0
- package/webui/build/_app/immutable/assets/4.9wPHhVwv.css +1 -0
- package/webui/build/_app/immutable/assets/6.DSJfjJwx.css +1 -0
- package/webui/build/_app/immutable/assets/7.CrNxmd8B.css +1 -0
- package/webui/build/_app/immutable/assets/8.Ba5_jYIY.css +1 -0
- package/webui/build/_app/immutable/assets/{9.BZheTlzZ.css → 9.m-LCx_kl.css} +1 -1
- package/webui/build/_app/immutable/assets/BackToParentRoute.DGk-X5ow.css +1 -0
- package/webui/build/_app/immutable/assets/SourcesList.yTBBi3_m.css +1 -0
- package/webui/build/_app/immutable/assets/homeFeedPanelStore.BopJZtHu.css +1 -0
- package/webui/build/_app/immutable/chunks/{V2-VOe88.js → B-OsL1Ct.js} +1 -1
- package/webui/build/_app/immutable/chunks/B2Q1a1-H.js +2 -0
- package/webui/build/_app/immutable/chunks/BK3WtZwv.js +1 -0
- package/webui/build/_app/immutable/chunks/BQqoDzLx.js +1 -0
- package/webui/build/_app/immutable/chunks/BXCWEhUd.js +1 -0
- package/webui/build/_app/immutable/chunks/BbWUOQ_m.js +1 -0
- package/webui/build/_app/immutable/chunks/Bp63qm3L.js +1 -0
- package/webui/build/_app/immutable/chunks/CVzlFH44.js +1 -0
- package/webui/build/_app/immutable/chunks/CWNeClHp.js +6 -0
- package/webui/build/_app/immutable/chunks/Cihqbfi5.js +1 -0
- package/webui/build/_app/immutable/chunks/CkUAV0m0.js +41 -0
- package/webui/build/_app/immutable/chunks/CtijX1u3.js +31 -0
- package/webui/build/_app/immutable/chunks/D5GvRCv7.js +1 -0
- package/webui/build/_app/immutable/chunks/DEDI7Ecm.js +1 -0
- package/webui/build/_app/immutable/chunks/DFuhmi31.js +1 -0
- package/webui/build/_app/immutable/chunks/DMWEh-Ek.js +2 -0
- package/webui/build/_app/immutable/chunks/{Cg3zih_x.js → DcAshVxe.js} +1 -1
- package/webui/build/_app/immutable/chunks/DjNLq3TF.js +1 -0
- package/webui/build/_app/immutable/chunks/Dt2CddFe.js +1 -0
- package/webui/build/_app/immutable/chunks/Dw782Tjs.js +1 -0
- package/webui/build/_app/immutable/chunks/EIZIMsXK.js +1 -0
- package/webui/build/_app/immutable/chunks/Xy_fhzQq.js +1 -0
- package/webui/build/_app/immutable/chunks/lk5LaiqA.js +1 -0
- package/webui/build/_app/immutable/chunks/mW5RwvnK.js +13 -0
- package/webui/build/_app/immutable/chunks/{CtHRh_pJ.js → tB7QMF3U.js} +1 -1
- package/webui/build/_app/immutable/chunks/xtNWTdbD.js +1 -0
- package/webui/build/_app/immutable/entry/app.DdgnooOk.js +2 -0
- package/webui/build/_app/immutable/entry/start.DhJaJZhR.js +1 -0
- package/webui/build/_app/immutable/nodes/0.BE05Cuc4.js +11 -0
- package/webui/build/_app/immutable/nodes/1.5DFDaT4c.js +1 -0
- package/webui/build/_app/immutable/nodes/10.OVK4i9XE.js +1 -0
- package/webui/build/_app/immutable/nodes/11.Dhn_rO4A.js +1 -0
- package/webui/build/_app/immutable/nodes/12.Cg8AeCSH.js +1 -0
- package/webui/build/_app/immutable/nodes/13.nT3SOzEB.js +1 -0
- package/webui/build/_app/immutable/nodes/14.B_KpJLxn.js +1 -0
- package/webui/build/_app/immutable/nodes/15.RaWaA-0I.js +1 -0
- package/webui/build/_app/immutable/nodes/{12.D9g8GCjm.js → 16.DSUgqolV.js} +15 -15
- package/webui/build/_app/immutable/nodes/2.BYWOpaxy.js +1 -0
- package/webui/build/_app/immutable/nodes/3.wQvGs9w-.js +1 -0
- package/webui/build/_app/immutable/nodes/4.DTSxpKm7.js +2 -0
- package/webui/build/_app/immutable/nodes/5.CCtn90c0.js +1 -0
- package/webui/build/_app/immutable/nodes/6.C2_mjW1u.js +1 -0
- package/webui/build/_app/immutable/nodes/7.Dwz6W7A1.js +1 -0
- package/webui/build/_app/immutable/nodes/8.DzkEw6rx.js +1 -0
- package/webui/build/_app/immutable/nodes/9.DtlXEwe1.js +1 -0
- package/webui/build/_app/version.json +1 -1
- package/plugins/sources/rss.rssany.js +0 -83
- package/sources.example.json +0 -562
- package/webui/build/_app/immutable/assets/0.BUAXpTm6.css +0 -1
- package/webui/build/_app/immutable/assets/10.I1OuCLrU.css +0 -1
- package/webui/build/_app/immutable/assets/11.CrO9xaki.css +0 -1
- package/webui/build/_app/immutable/assets/12.BEi6fInA.css +0 -1
- package/webui/build/_app/immutable/assets/14.Ctlgn1LZ.css +0 -1
- package/webui/build/_app/immutable/assets/2.eJ80XOGm.css +0 -1
- package/webui/build/_app/immutable/assets/4.B8-jYAVj.css +0 -1
- package/webui/build/_app/immutable/assets/6.Drn-0DON.css +0 -1
- package/webui/build/_app/immutable/assets/7.ms2diq_q.css +0 -1
- package/webui/build/_app/immutable/assets/8.DKymkjjs.css +0 -1
- package/webui/build/_app/immutable/assets/SourcesList.BhtYlRsQ.css +0 -1
- package/webui/build/_app/immutable/chunks/BUngiKFg.js +0 -1
- package/webui/build/_app/immutable/chunks/Bt0fzibd.js +0 -1
- package/webui/build/_app/immutable/chunks/BxHqDcpw.js +0 -1
- package/webui/build/_app/immutable/chunks/ByQRbEUX.js +0 -1
- package/webui/build/_app/immutable/chunks/C12mHcUp.js +0 -6
- package/webui/build/_app/immutable/chunks/C1kQ4pHy.js +0 -1
- package/webui/build/_app/immutable/chunks/C74gbb4Q.js +0 -1
- package/webui/build/_app/immutable/chunks/CAtemnMo.js +0 -1
- package/webui/build/_app/immutable/chunks/CVjCNJia.js +0 -1
- package/webui/build/_app/immutable/chunks/CjQQ9_Q2.js +0 -2
- package/webui/build/_app/immutable/chunks/CkS2JMkE.js +0 -1
- package/webui/build/_app/immutable/chunks/D-6mYMI1.js +0 -1
- package/webui/build/_app/immutable/chunks/D1Gs8-g3.js +0 -1
- package/webui/build/_app/immutable/chunks/D9dRVKgL.js +0 -1
- package/webui/build/_app/immutable/chunks/DCEY1XiC.js +0 -1
- package/webui/build/_app/immutable/chunks/DI-t-G_K.js +0 -2
- package/webui/build/_app/immutable/chunks/DTUxjyWL.js +0 -1
- package/webui/build/_app/immutable/chunks/DWJZOHke.js +0 -1
- package/webui/build/_app/immutable/chunks/Dgs6d7X5.js +0 -1
- package/webui/build/_app/immutable/chunks/DjpPK99f.js +0 -71
- package/webui/build/_app/immutable/chunks/DjzVVxpy.js +0 -1
- package/webui/build/_app/immutable/chunks/LQVMBmDN.js +0 -1
- package/webui/build/_app/immutable/chunks/Qw0Qgx6J.js +0 -1
- package/webui/build/_app/immutable/chunks/bohabpgg.js +0 -1
- package/webui/build/_app/immutable/chunks/c-YfbAB_.js +0 -8
- package/webui/build/_app/immutable/chunks/tpTQfoNn.js +0 -1
- package/webui/build/_app/immutable/entry/app.4I2fqDIL.js +0 -2
- package/webui/build/_app/immutable/entry/start.CrgdT2Qb.js +0 -1
- package/webui/build/_app/immutable/nodes/0.gA9sQtoM.js +0 -11
- package/webui/build/_app/immutable/nodes/1.Bybh7btp.js +0 -1
- package/webui/build/_app/immutable/nodes/10.DEkJCZ6X.js +0 -1
- package/webui/build/_app/immutable/nodes/11.CDNNJqlQ.js +0 -1
- package/webui/build/_app/immutable/nodes/13.DRpZV72T.js +0 -1
- package/webui/build/_app/immutable/nodes/14.DVeJW6bd.js +0 -1
- package/webui/build/_app/immutable/nodes/2.DIZ4IPNm.js +0 -1
- package/webui/build/_app/immutable/nodes/3.BFSNf0FK.js +0 -1
- package/webui/build/_app/immutable/nodes/4.BSsIjejE.js +0 -2
- package/webui/build/_app/immutable/nodes/5.COxRT9Oe.js +0 -1
- package/webui/build/_app/immutable/nodes/6.CBgQ4YzB.js +0 -1
- package/webui/build/_app/immutable/nodes/7.BbzWOL0V.js +0 -6
- package/webui/build/_app/immutable/nodes/8.C8120200.js +0 -1
- package/webui/build/_app/immutable/nodes/9.BH_BGQQ4.js +0 -1
- /package/webui/build/_app/immutable/nodes/{15.BtYZF6FM.js → 17.BtYZF6FM.js} +0 -0
- /package/webui/build/_app/immutable/nodes/{16.Ba_qJjp6.js → 18.Ba_qJjp6.js} +0 -0
package/dist/index.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import "dotenv/config";
|
|
2
|
-
import { existsSync, unlinkSync, openSync, writeSync, closeSync, readFileSync, watch } from "node:fs";
|
|
2
|
+
import { existsSync, unlinkSync, openSync, writeSync, closeSync, readFileSync, statSync, watch } from "node:fs";
|
|
3
3
|
import { platform, homedir, networkInterfaces } from "node:os";
|
|
4
4
|
import { serve } from "@hono/node-server";
|
|
5
5
|
import { Hono } from "hono";
|
|
@@ -10,12 +10,16 @@ import { promisify } from "node:util";
|
|
|
10
10
|
import puppeteerCore from "puppeteer-core";
|
|
11
11
|
import { parse, NodeType } from "node-html-parser";
|
|
12
12
|
import Database from "better-sqlite3";
|
|
13
|
-
import { mkdir, copyFile, access, rename, readFile, writeFile, readdir } from "node:fs/promises";
|
|
13
|
+
import { mkdir, copyFile, access, rename, readFile, writeFile, readdir, stat, unlink } from "node:fs/promises";
|
|
14
14
|
import { fileURLToPath, pathToFileURL } from "node:url";
|
|
15
|
-
import { createHash
|
|
15
|
+
import { createHash } from "node:crypto";
|
|
16
16
|
import { JSDOM } from "jsdom";
|
|
17
17
|
import { Readability } from "@mozilla/readability";
|
|
18
18
|
import OpenAI from "openai";
|
|
19
|
+
import RssParser from "rss-parser";
|
|
20
|
+
import { HttpsProxyAgent } from "https-proxy-agent";
|
|
21
|
+
import { ImapFlow } from "imapflow";
|
|
22
|
+
import { simpleParser } from "mailparser";
|
|
19
23
|
import { EventEmitter } from "node:events";
|
|
20
24
|
import { CronExpressionParser } from "cron-parser";
|
|
21
25
|
import { validate, schedule as schedule$1 } from "node-cron";
|
|
@@ -144,6 +148,50 @@ function markPipelineDrop(item) {
|
|
|
144
148
|
function isPipelineDroppedItem(item) {
|
|
145
149
|
return item.extra?.[PIPELINE_DROP_EXTRA_KEY] === true;
|
|
146
150
|
}
|
|
151
|
+
function canonicalHttpSourceRef(ref) {
|
|
152
|
+
const t = ref.trim();
|
|
153
|
+
if (!t) return t;
|
|
154
|
+
if (!/^https?:\/\//i.test(t)) return t.toLowerCase();
|
|
155
|
+
try {
|
|
156
|
+
const u = new URL(t);
|
|
157
|
+
const protocol = u.protocol.toLowerCase();
|
|
158
|
+
const host = u.host.toLowerCase();
|
|
159
|
+
let path = u.pathname;
|
|
160
|
+
if (path.length > 1 && path.endsWith("/")) {
|
|
161
|
+
path = path.slice(0, -1);
|
|
162
|
+
}
|
|
163
|
+
path = path.toLowerCase();
|
|
164
|
+
return `${protocol}//${host}${path}${u.search}${u.hash}`;
|
|
165
|
+
} catch {
|
|
166
|
+
return t.toLowerCase();
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
function maxIso(a, b) {
|
|
170
|
+
if (!a) return b;
|
|
171
|
+
if (!b) return a;
|
|
172
|
+
return a >= b ? a : b;
|
|
173
|
+
}
|
|
174
|
+
function mergeSourceStatsRows(rows) {
|
|
175
|
+
const map = /* @__PURE__ */ new Map();
|
|
176
|
+
for (const row of rows) {
|
|
177
|
+
const k = canonicalHttpSourceRef(row.source_url);
|
|
178
|
+
const prev = map.get(k);
|
|
179
|
+
if (!prev) {
|
|
180
|
+
map.set(k, { count: row.count, latest_at: row.latest_at });
|
|
181
|
+
} else {
|
|
182
|
+
map.set(k, {
|
|
183
|
+
count: prev.count + row.count,
|
|
184
|
+
latest_at: maxIso(prev.latest_at, row.latest_at)
|
|
185
|
+
});
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
return [...map.entries()].map(([source_url, v]) => ({ source_url, count: v.count, latest_at: v.latest_at })).sort((a, b) => b.count - a.count);
|
|
189
|
+
}
|
|
190
|
+
const httpSourceRef = /* @__PURE__ */ Object.freeze(/* @__PURE__ */ Object.defineProperty({
|
|
191
|
+
__proto__: null,
|
|
192
|
+
canonicalHttpSourceRef,
|
|
193
|
+
mergeSourceStatsRows
|
|
194
|
+
}, Symbol.toStringTag, { value: "Module" }));
|
|
147
195
|
const __dir = dirname(fileURLToPath(import.meta.url));
|
|
148
196
|
const base = basename(__dir);
|
|
149
197
|
const PACKAGE_ROOT = base === "app" || base === "dist" ? join(__dir, "..") : __dir;
|
|
@@ -156,12 +204,9 @@ const SOURCES_CONFIG_PATH = join(USER_DIR, "sources.json");
|
|
|
156
204
|
const TAGS_CONFIG_PATH = join(USER_DIR, "tags.json");
|
|
157
205
|
const CONFIG_PATH = join(USER_DIR, "config.json");
|
|
158
206
|
const LEGACY_SUBSCRIPTIONS_PATH = join(USER_DIR, "subscriptions.json");
|
|
159
|
-
const BUILTIN_PLUGINS_DIR = join(PACKAGE_ROOT, "plugins");
|
|
207
|
+
const BUILTIN_PLUGINS_DIR = join(PACKAGE_ROOT, "app/plugins/builtin");
|
|
160
208
|
const USER_PLUGINS_DIR = join(USER_DIR, "plugins");
|
|
161
|
-
const
|
|
162
|
-
const USER_SOURCES_DIR = join(USER_PLUGINS_DIR, "sources");
|
|
163
|
-
const BUILTIN_ENRICH_DIR = join(BUILTIN_PLUGINS_DIR, "enrich");
|
|
164
|
-
const USER_ENRICH_DIR = join(USER_PLUGINS_DIR, "enrich");
|
|
209
|
+
const PLUGIN_SITE_TEMPLATE_PATH = join(PACKAGE_ROOT, "app/plugins/site.rssany.js");
|
|
165
210
|
async function pathExists(p) {
|
|
166
211
|
try {
|
|
167
212
|
await access(p);
|
|
@@ -180,8 +225,9 @@ async function migrateFile(from, to) {
|
|
|
180
225
|
logger.warn("config", "配置迁移失败", { from, to, err: err instanceof Error ? err.message : String(err) });
|
|
181
226
|
}
|
|
182
227
|
}
|
|
183
|
-
const
|
|
184
|
-
const
|
|
228
|
+
const INIT_DATA_DIR = join(PACKAGE_ROOT, "init");
|
|
229
|
+
const EXAMPLE_SOURCES = join(INIT_DATA_DIR, "sources.json");
|
|
230
|
+
const EXAMPLE_CONFIG = join(INIT_DATA_DIR, "config.json");
|
|
185
231
|
async function seedExampleConfigsIfMissing() {
|
|
186
232
|
if (!await pathExists(SOURCES_CONFIG_PATH) && await pathExists(EXAMPLE_SOURCES)) {
|
|
187
233
|
try {
|
|
@@ -209,8 +255,6 @@ async function initUserDir() {
|
|
|
209
255
|
await mkdir(DATA_DIR, { recursive: true });
|
|
210
256
|
await mkdir(CACHE_DIR, { recursive: true });
|
|
211
257
|
await mkdir(USER_PLUGINS_DIR, { recursive: true });
|
|
212
|
-
await mkdir(USER_SOURCES_DIR, { recursive: true });
|
|
213
|
-
await mkdir(USER_ENRICH_DIR, { recursive: true });
|
|
214
258
|
await seedExampleConfigsIfMissing();
|
|
215
259
|
if (!await pathExists(SOURCES_CONFIG_PATH) && await pathExists(LEGACY_SUBSCRIPTIONS_PATH)) {
|
|
216
260
|
await migrateFile(LEGACY_SUBSCRIPTIONS_PATH, SOURCES_CONFIG_PATH);
|
|
@@ -225,17 +269,17 @@ function logCorruptDiagnostic(operation, err) {
|
|
|
225
269
|
const code = err?.code;
|
|
226
270
|
const msg = err instanceof Error ? err.message : String(err);
|
|
227
271
|
const lines = [
|
|
228
|
-
"[rssany db]
|
|
229
|
-
`
|
|
230
|
-
`
|
|
231
|
-
"
|
|
232
|
-
" 1.
|
|
233
|
-
" 2.
|
|
234
|
-
" 3.
|
|
235
|
-
"
|
|
236
|
-
" -
|
|
237
|
-
" -
|
|
238
|
-
" -
|
|
272
|
+
"[rssany db] 数据库可能损坏或并发冲突",
|
|
273
|
+
` 操作: ${operation}`,
|
|
274
|
+
` 错误: ${code ?? "unknown"} - ${msg}`,
|
|
275
|
+
" 常见原因:",
|
|
276
|
+
" 1. 多进程同时打开同一库(例如 tsx --watch 与另一实例同时写)",
|
|
277
|
+
" 2. 异常退出后 WAL 未正常 checkpoint",
|
|
278
|
+
" 3. 磁盘/杀毒/同步盘导致文件不完整",
|
|
279
|
+
" 建议:",
|
|
280
|
+
" - 避免多实例同时写库;开发时慎用 --watch 与后台任务并行",
|
|
281
|
+
" - 可尝试 RSSANY_DB_JOURNAL=delete 使用 DELETE 模式降低多文件依赖",
|
|
282
|
+
" - 备份后删除 .rssany/data/rssany.db 及同目录 -wal、-shm、rssany.db.lock 再启动"
|
|
239
283
|
];
|
|
240
284
|
process.stderr.write(lines.join("\n") + "\n");
|
|
241
285
|
}
|
|
@@ -274,7 +318,7 @@ function acquireDbLock(dbDir) {
|
|
|
274
318
|
})();
|
|
275
319
|
if (stillRunning) {
|
|
276
320
|
throw new Error(
|
|
277
|
-
|
|
321
|
+
`数据库已被其他进程占用(PID ${oldPid})。请勿多开实例;若确认无其他进程,可删除 ${lockPath} 后重试(常见于 tsx --watch 未退出)`
|
|
278
322
|
);
|
|
279
323
|
}
|
|
280
324
|
}
|
|
@@ -307,7 +351,7 @@ function withWriteLock(fn) {
|
|
|
307
351
|
},
|
|
308
352
|
(e) => {
|
|
309
353
|
if (isCorruptError(e)) {
|
|
310
|
-
logCorruptDiagnostic("
|
|
354
|
+
logCorruptDiagnostic("withWriteLock 内 updateItemContent/upsertItems 等", e);
|
|
311
355
|
}
|
|
312
356
|
rejectOut(e);
|
|
313
357
|
throw e;
|
|
@@ -315,7 +359,7 @@ function withWriteLock(fn) {
|
|
|
315
359
|
);
|
|
316
360
|
return out;
|
|
317
361
|
}
|
|
318
|
-
const DATE_ONLY_TITLE_RE = /^(?:jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)\b[\s\d
|
|
362
|
+
const DATE_ONLY_TITLE_RE = /^(?:jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)\b[\s\d,./-]*(?:st|nd|rd|th)?[\s\d,./-]*$/i;
|
|
319
363
|
function normalizeText(text) {
|
|
320
364
|
return (text ?? "").replace(/\s+/g, " ").trim();
|
|
321
365
|
}
|
|
@@ -393,7 +437,7 @@ async function getDb() {
|
|
|
393
437
|
db = null;
|
|
394
438
|
}
|
|
395
439
|
if (isCorruptError(err)) {
|
|
396
|
-
logCorruptDiagnostic("
|
|
440
|
+
logCorruptDiagnostic("打开/初始化主库 (getDb)", err);
|
|
397
441
|
}
|
|
398
442
|
throw err;
|
|
399
443
|
}
|
|
@@ -407,7 +451,7 @@ async function runIntegrityCheck() {
|
|
|
407
451
|
return row?.integrity_check ?? "unknown";
|
|
408
452
|
} catch (err) {
|
|
409
453
|
const msg = err instanceof Error ? err.message : String(err);
|
|
410
|
-
return `integrity_check
|
|
454
|
+
return `integrity_check 执行失败: ${msg}`;
|
|
411
455
|
}
|
|
412
456
|
}
|
|
413
457
|
const LOGS_DB_PATH = join(DATA_DIR, "logs.db");
|
|
@@ -520,13 +564,31 @@ function initSchema(db) {
|
|
|
520
564
|
}
|
|
521
565
|
} catch {
|
|
522
566
|
}
|
|
567
|
+
migrateItemsSourceUrlIfNeeded(db);
|
|
568
|
+
}
|
|
569
|
+
function migrateItemsSourceUrlIfNeeded(db) {
|
|
570
|
+
const v = db.pragma("user_version", { simple: true });
|
|
571
|
+
if (v >= 2) return;
|
|
572
|
+
const rows = db.prepare("SELECT rowid, source_url FROM items").all();
|
|
573
|
+
const upd = db.prepare("UPDATE items SET source_url = @next WHERE rowid = @rowid");
|
|
574
|
+
const run = db.transaction(() => {
|
|
575
|
+
for (const r of rows) {
|
|
576
|
+
const next = canonicalHttpSourceRef(r.source_url);
|
|
577
|
+
if (next !== r.source_url) {
|
|
578
|
+
upd.run({ next, rowid: r.rowid });
|
|
579
|
+
}
|
|
580
|
+
}
|
|
581
|
+
db.pragma("user_version = 2");
|
|
582
|
+
});
|
|
583
|
+
run();
|
|
523
584
|
}
|
|
524
585
|
async function upsertItems(items, sourceUrlOverride) {
|
|
525
586
|
if (items.length === 0) return { newCount: 0, newIds: /* @__PURE__ */ new Set() };
|
|
526
|
-
const
|
|
527
|
-
if (!
|
|
528
|
-
throw new Error("upsertItems:
|
|
587
|
+
const raw = items[0].sourceRef?.trim();
|
|
588
|
+
if (!raw) {
|
|
589
|
+
throw new Error("upsertItems: 每条 item 须有 sourceRef,或传入 sourceUrlOverride");
|
|
529
590
|
}
|
|
591
|
+
const sourceUrl = canonicalHttpSourceRef(raw);
|
|
530
592
|
return withWriteLock(async () => {
|
|
531
593
|
const db = await getDb();
|
|
532
594
|
const stmt = db.prepare(`
|
|
@@ -633,11 +695,13 @@ async function updateItemContent(item) {
|
|
|
633
695
|
}
|
|
634
696
|
async function queryFeedItems(sourceUrls, limit, offset, opts) {
|
|
635
697
|
if (sourceUrls.length === 0) return { items: [], hasMore: false };
|
|
698
|
+
const expanded = [...new Set(sourceUrls.map((u) => canonicalHttpSourceRef(u)).filter(Boolean))];
|
|
699
|
+
if (expanded.length === 0) return { items: [], hasMore: false };
|
|
636
700
|
const db = await getDb();
|
|
637
|
-
const placeholders =
|
|
701
|
+
const placeholders = expanded.map((_, i) => `@u${i}`).join(", ");
|
|
638
702
|
const conditions = [`source_url IN (${placeholders})`];
|
|
639
703
|
const params = { lim: limit + 1, off: offset };
|
|
640
|
-
|
|
704
|
+
expanded.forEach((url, i) => {
|
|
641
705
|
params[`u${i}`] = url;
|
|
642
706
|
});
|
|
643
707
|
if (opts?.since) {
|
|
@@ -647,7 +711,7 @@ async function queryFeedItems(sourceUrls, limit, offset, opts) {
|
|
|
647
711
|
if (opts?.until) {
|
|
648
712
|
conditions.push("COALESCE(pub_date, fetched_at) < @until");
|
|
649
713
|
if (opts.until.length === 10) {
|
|
650
|
-
const d = /* @__PURE__ */ new Date(opts.until
|
|
714
|
+
const d = /* @__PURE__ */ new Date(`${opts.until}T12:00:00Z`);
|
|
651
715
|
d.setUTCDate(d.getUTCDate() + 1);
|
|
652
716
|
params.until = d.toISOString();
|
|
653
717
|
} else {
|
|
@@ -671,12 +735,20 @@ async function queryItems(opts) {
|
|
|
671
735
|
const conditions = [];
|
|
672
736
|
const params = { limit, offset };
|
|
673
737
|
if (sourceUrl) {
|
|
738
|
+
const key = canonicalHttpSourceRef(sourceUrl);
|
|
739
|
+
if (!key) {
|
|
740
|
+
return { items: [], total: 0 };
|
|
741
|
+
}
|
|
674
742
|
conditions.push("i.source_url = @sourceUrl");
|
|
675
|
-
params.sourceUrl =
|
|
743
|
+
params.sourceUrl = key;
|
|
676
744
|
} else if (sourceUrls && sourceUrls.length > 0) {
|
|
677
|
-
const
|
|
745
|
+
const expanded = [...new Set(sourceUrls.map((s) => canonicalHttpSourceRef(s)).filter(Boolean))];
|
|
746
|
+
if (expanded.length === 0) {
|
|
747
|
+
return { items: [], total: 0 };
|
|
748
|
+
}
|
|
749
|
+
const placeholders = expanded.map((_, i) => `@src${i}`).join(", ");
|
|
678
750
|
conditions.push(`i.source_url IN (${placeholders})`);
|
|
679
|
-
|
|
751
|
+
expanded.forEach((s, i) => params[`src${i}`] = s);
|
|
680
752
|
}
|
|
681
753
|
if (author && author.trim().length >= 2) {
|
|
682
754
|
conditions.push("instr(i.author, @author) > 0");
|
|
@@ -690,9 +762,7 @@ async function queryItems(opts) {
|
|
|
690
762
|
const trimmed = tagsFilter.filter((t) => typeof t === "string" && t.trim()).map((t) => t.trim());
|
|
691
763
|
if (trimmed.length > 0) {
|
|
692
764
|
const tagConds = trimmed.map((_, i) => `LOWER(TRIM(json_each.value)) = LOWER(@tag${i})`).join(" OR ");
|
|
693
|
-
conditions.push(
|
|
694
|
-
`i.tags IS NOT NULL AND EXISTS (SELECT 1 FROM json_each(i.tags) WHERE ${tagConds})`
|
|
695
|
-
);
|
|
765
|
+
conditions.push(`i.tags IS NOT NULL AND EXISTS (SELECT 1 FROM json_each(i.tags) WHERE ${tagConds})`);
|
|
696
766
|
trimmed.forEach((t, i) => {
|
|
697
767
|
params[`tag${i}`] = t;
|
|
698
768
|
});
|
|
@@ -772,9 +842,11 @@ async function deleteItem(id) {
|
|
|
772
842
|
}
|
|
773
843
|
async function deleteItemsBySourceUrl(sourceUrl) {
|
|
774
844
|
if (!sourceUrl?.trim()) return 0;
|
|
845
|
+
const key = canonicalHttpSourceRef(sourceUrl.trim());
|
|
846
|
+
if (!key) return 0;
|
|
775
847
|
return withWriteLock(async () => {
|
|
776
848
|
const db = await getDb();
|
|
777
|
-
const info = db.prepare("DELETE FROM items WHERE source_url = @sourceUrl").run({ sourceUrl:
|
|
849
|
+
const info = db.prepare("DELETE FROM items WHERE source_url = @sourceUrl").run({ sourceUrl: key });
|
|
778
850
|
return info.changes;
|
|
779
851
|
});
|
|
780
852
|
}
|
|
@@ -789,10 +861,12 @@ async function getPendingPushItems(limit = 100) {
|
|
|
789
861
|
return mapRowsToDbItems(rows);
|
|
790
862
|
}
|
|
791
863
|
async function getSourceStats() {
|
|
864
|
+
const { mergeSourceStatsRows: mergeSourceStatsRows2 } = await Promise.resolve().then(() => httpSourceRef);
|
|
792
865
|
const db = await getDb();
|
|
793
|
-
|
|
866
|
+
const rows = db.prepare(
|
|
794
867
|
"SELECT source_url, COUNT(*) as count, MAX(COALESCE(pub_date, fetched_at)) as latest_at FROM items GROUP BY source_url ORDER BY count DESC"
|
|
795
868
|
).all();
|
|
869
|
+
return mergeSourceStatsRows2(rows);
|
|
796
870
|
}
|
|
797
871
|
async function insertLog(entry) {
|
|
798
872
|
const db = await getLogsDb();
|
|
@@ -972,6 +1046,9 @@ const logger = {
|
|
|
972
1046
|
}
|
|
973
1047
|
};
|
|
974
1048
|
const execAsync = promisify(exec);
|
|
1049
|
+
const VIEWPORT_WIDTH = 1366;
|
|
1050
|
+
const VIEWPORT_HEIGHT_HEADLESS = 5e3;
|
|
1051
|
+
const VIEWPORT_HEIGHT_HEADFUL = 1200;
|
|
975
1052
|
function resolveProxy(config) {
|
|
976
1053
|
return config?.proxy ?? process.env.HTTP_PROXY ?? process.env.HTTPS_PROXY;
|
|
977
1054
|
}
|
|
@@ -982,6 +1059,14 @@ function parseProxy(proxy) {
|
|
|
982
1059
|
const password = u.password || void 0;
|
|
983
1060
|
return { serverUrl, username, password };
|
|
984
1061
|
}
|
|
1062
|
+
async function applyProxyAuthToPage(page, opts) {
|
|
1063
|
+
const proxy = resolveProxy(opts);
|
|
1064
|
+
if (!proxy) return;
|
|
1065
|
+
const { username, password } = parseProxy(proxy);
|
|
1066
|
+
if (username !== void 0 || password !== void 0) {
|
|
1067
|
+
await page.authenticate({ username: username ?? "", password: password ?? "" });
|
|
1068
|
+
}
|
|
1069
|
+
}
|
|
985
1070
|
function launchArgs(config) {
|
|
986
1071
|
const base2 = [
|
|
987
1072
|
"--disable-blink-features=AutomationControlled",
|
|
@@ -993,8 +1078,8 @@ function launchArgs(config) {
|
|
|
993
1078
|
"--disable-site-isolation-trials",
|
|
994
1079
|
"--disable-infobars"
|
|
995
1080
|
];
|
|
996
|
-
const height = config?.headless !== false ?
|
|
997
|
-
base2.push(`--window-size
|
|
1081
|
+
const height = config?.headless !== false ? VIEWPORT_HEIGHT_HEADLESS : VIEWPORT_HEIGHT_HEADFUL;
|
|
1082
|
+
base2.push(`--window-size=${VIEWPORT_WIDTH},${height}`);
|
|
998
1083
|
const proxy = resolveProxy(config);
|
|
999
1084
|
if (proxy) {
|
|
1000
1085
|
const { serverUrl } = parseProxy(proxy);
|
|
@@ -1084,150 +1169,130 @@ function headersToRecord(headers) {
|
|
|
1084
1169
|
async function setupPage(page, headless = true) {
|
|
1085
1170
|
const realUserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
|
|
1086
1171
|
await page.setUserAgent(realUserAgent);
|
|
1087
|
-
await page.setViewport({
|
|
1172
|
+
await page.setViewport({
|
|
1173
|
+
width: VIEWPORT_WIDTH,
|
|
1174
|
+
height: headless ? VIEWPORT_HEIGHT_HEADLESS : VIEWPORT_HEIGHT_HEADFUL
|
|
1175
|
+
});
|
|
1088
1176
|
await stealthPage(page);
|
|
1089
1177
|
}
|
|
1090
|
-
let _browser = null;
|
|
1091
|
-
let _browserHeadless = true;
|
|
1092
|
-
let _launchPromise = null;
|
|
1093
1178
|
function isFrameDetachedError(e) {
|
|
1094
1179
|
const msg = e instanceof Error ? e.message : String(e);
|
|
1095
1180
|
return /detached|Navigating frame was detached|Session closed/i.test(msg);
|
|
1096
1181
|
}
|
|
1097
|
-
async function
|
|
1098
|
-
if (!_browser) return false;
|
|
1099
|
-
try {
|
|
1100
|
-
await _browser.version();
|
|
1101
|
-
return true;
|
|
1102
|
-
} catch {
|
|
1103
|
-
_browser = null;
|
|
1104
|
-
return false;
|
|
1105
|
-
}
|
|
1106
|
-
}
|
|
1107
|
-
async function getOrCreateBrowser(config) {
|
|
1182
|
+
async function launchBrowser(config) {
|
|
1108
1183
|
const wantHeadless = config.headless !== false;
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
const executablePath = config.chromeExecutablePath ?? process.env.CHROME_PATH ?? findChromeExecutable();
|
|
1122
|
-
if (!executablePath) {
|
|
1123
|
-
throw new Error("未找到 Chrome 可执行文件,请安装 Google Chrome 或设置 CHROME_PATH 环境变量");
|
|
1184
|
+
const executablePath = config.chromeExecutablePath ?? process.env.CHROME_PATH ?? findChromeExecutable();
|
|
1185
|
+
if (!executablePath) {
|
|
1186
|
+
throw new Error("未找到 Chrome 可执行文件,请安装 Google Chrome 或设置 CHROME_PATH 环境变量");
|
|
1187
|
+
}
|
|
1188
|
+
const userDataDir = getUserDataDir(config.cacheDir);
|
|
1189
|
+
const maxRetries = 2;
|
|
1190
|
+
let lastErr;
|
|
1191
|
+
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
|
1192
|
+
try {
|
|
1193
|
+
if (attempt === 0 && userDataDir) {
|
|
1194
|
+
const absUserDataDir = resolve(userDataDir);
|
|
1195
|
+
await killStaleChromeProcesses(absUserDataDir);
|
|
1124
1196
|
}
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
|
|
1146
|
-
|
|
1147
|
-
browser.on("disconnected", () => {
|
|
1148
|
-
_browser = null;
|
|
1149
|
-
_launchPromise = null;
|
|
1150
|
-
});
|
|
1151
|
-
_browser = browser;
|
|
1152
|
-
_browserHeadless = wantHeadless;
|
|
1153
|
-
return browser;
|
|
1154
|
-
} catch (e) {
|
|
1155
|
-
lastErr = e;
|
|
1156
|
-
if (attempt < maxRetries && isAlreadyRunningError(e)) {
|
|
1157
|
-
continue;
|
|
1158
|
-
}
|
|
1159
|
-
if (isAlreadyRunningError(e)) {
|
|
1160
|
-
const dir = userDataDir ?? "browser_data/main";
|
|
1161
|
-
throw new Error(
|
|
1162
|
-
`Chrome 的 profile 目录已被占用(${dir})。通常是因为上次未正常退出或同时运行了多个本服务实例。请关闭占用该目录的 Chrome 进程后重试,或设置环境变量 CACHE_DIR 使用不同缓存目录。`
|
|
1163
|
-
);
|
|
1164
|
-
}
|
|
1165
|
-
throw e;
|
|
1166
|
-
}
|
|
1197
|
+
if (attempt > 0) {
|
|
1198
|
+
const waitMs = attempt * 2e3;
|
|
1199
|
+
logger.info("scraper", "userDataDir 曾被占用,等待后重试", { waitMs, attempt });
|
|
1200
|
+
await new Promise((r) => setTimeout(r, waitMs));
|
|
1201
|
+
}
|
|
1202
|
+
return await puppeteerCore.launch({
|
|
1203
|
+
headless: wantHeadless,
|
|
1204
|
+
args: launchArgs({ proxy: config.proxy, headless: wantHeadless }),
|
|
1205
|
+
userDataDir,
|
|
1206
|
+
executablePath,
|
|
1207
|
+
ignoreDefaultArgs: ["--enable-automation"]
|
|
1208
|
+
});
|
|
1209
|
+
} catch (e) {
|
|
1210
|
+
lastErr = e;
|
|
1211
|
+
if (attempt < maxRetries && isAlreadyRunningError(e)) {
|
|
1212
|
+
continue;
|
|
1213
|
+
}
|
|
1214
|
+
if (isAlreadyRunningError(e)) {
|
|
1215
|
+
const dir = userDataDir ?? "browser_data/main";
|
|
1216
|
+
throw new Error(
|
|
1217
|
+
`Chrome 的 profile 目录已被占用(${dir})。通常是因为上次未正常退出或同时运行了多个本服务实例。请关闭占用该目录的 Chrome 进程后重试,或设置环境变量 CACHE_DIR 使用不同缓存目录。`
|
|
1218
|
+
);
|
|
1167
1219
|
}
|
|
1168
|
-
throw lastErr;
|
|
1169
|
-
})().catch((e) => {
|
|
1170
|
-
_launchPromise = null;
|
|
1171
1220
|
throw e;
|
|
1172
|
-
}
|
|
1221
|
+
}
|
|
1173
1222
|
}
|
|
1174
|
-
|
|
1223
|
+
throw lastErr;
|
|
1175
1224
|
}
|
|
1176
|
-
|
|
1177
|
-
_browser?.close().catch(() => {
|
|
1178
|
-
});
|
|
1179
|
-
});
|
|
1180
|
-
process.once("SIGINT", async () => {
|
|
1181
|
-
await _browser?.close().catch(() => {
|
|
1182
|
-
});
|
|
1183
|
-
process.exit(0);
|
|
1184
|
-
});
|
|
1185
|
-
process.once("SIGTERM", async () => {
|
|
1186
|
-
await _browser?.close().catch(() => {
|
|
1187
|
-
});
|
|
1188
|
-
process.exit(0);
|
|
1189
|
-
});
|
|
1190
|
-
async function preCheckAuth(authFlow, cacheDir) {
|
|
1225
|
+
async function preCheckAuth(authFlow, cacheDir, opts) {
|
|
1191
1226
|
const { checkAuth, loginUrl, domain } = authFlow;
|
|
1192
1227
|
if (domain == null || !cacheDir) return true;
|
|
1193
|
-
const
|
|
1194
|
-
const
|
|
1228
|
+
const isHeadless = opts?.headless !== false;
|
|
1229
|
+
const browser = await launchBrowser({
|
|
1230
|
+
headless: isHeadless,
|
|
1231
|
+
cacheDir,
|
|
1232
|
+
proxy: resolveProxy(opts)
|
|
1233
|
+
});
|
|
1195
1234
|
try {
|
|
1196
|
-
await
|
|
1197
|
-
|
|
1198
|
-
|
|
1199
|
-
|
|
1235
|
+
const page = await browser.newPage();
|
|
1236
|
+
try {
|
|
1237
|
+
await setupPage(page, isHeadless);
|
|
1238
|
+
await applyProxyAuthToPage(page, opts);
|
|
1239
|
+
await page.goto(loginUrl, { waitUntil: "domcontentloaded", timeout: 6e4 });
|
|
1240
|
+
await new Promise((resolve2) => setTimeout(resolve2, 3e3));
|
|
1241
|
+
return await checkAuth(page, page.url());
|
|
1242
|
+
} finally {
|
|
1243
|
+
await page.close().catch(() => {
|
|
1244
|
+
});
|
|
1245
|
+
}
|
|
1200
1246
|
} finally {
|
|
1201
|
-
await
|
|
1247
|
+
await browser.close().catch(() => {
|
|
1202
1248
|
});
|
|
1203
1249
|
}
|
|
1204
1250
|
}
|
|
1205
|
-
async function ensureAuth(authFlow, cacheDir) {
|
|
1251
|
+
async function ensureAuth(authFlow, cacheDir, opts) {
|
|
1206
1252
|
const { checkAuth, loginUrl, loginTimeoutMs = 60 * 1e3, pollIntervalMs = 2e3 } = authFlow;
|
|
1207
|
-
const browser = await
|
|
1208
|
-
const page = await browser.newPage();
|
|
1253
|
+
const browser = await launchBrowser({ headless: false, cacheDir, proxy: resolveProxy(opts) });
|
|
1209
1254
|
try {
|
|
1210
|
-
await
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
const
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1255
|
+
const page = await browser.newPage();
|
|
1256
|
+
try {
|
|
1257
|
+
await setupPage(page, false);
|
|
1258
|
+
await applyProxyAuthToPage(page, opts);
|
|
1259
|
+
await page.goto(loginUrl, { waitUntil: "domcontentloaded", timeout: 6e4 });
|
|
1260
|
+
await new Promise((resolve2) => setTimeout(resolve2, 3e3));
|
|
1261
|
+
const authenticated = await checkAuth(page, page.url());
|
|
1262
|
+
if (authenticated) return;
|
|
1263
|
+
const startTime = Date.now();
|
|
1264
|
+
while (Date.now() - startTime < loginTimeoutMs) {
|
|
1265
|
+
await new Promise((resolve2) => setTimeout(resolve2, pollIntervalMs));
|
|
1266
|
+
const authenticated2 = await checkAuth(page, page.url());
|
|
1267
|
+
if (authenticated2) return;
|
|
1268
|
+
}
|
|
1269
|
+
throw new Error(`登录超时(${loginTimeoutMs}ms)`);
|
|
1270
|
+
} finally {
|
|
1271
|
+
await page.close().catch(() => {
|
|
1272
|
+
});
|
|
1273
|
+
}
|
|
1222
1274
|
} finally {
|
|
1223
|
-
await
|
|
1275
|
+
await browser.close().catch(() => {
|
|
1224
1276
|
});
|
|
1225
1277
|
}
|
|
1226
1278
|
}
|
|
1227
1279
|
async function fetchHtml(url, config = {}) {
|
|
1228
|
-
const {
|
|
1280
|
+
const {
|
|
1281
|
+
timeoutMs,
|
|
1282
|
+
headers,
|
|
1283
|
+
cookies,
|
|
1284
|
+
cacheDir,
|
|
1285
|
+
checkAuth,
|
|
1286
|
+
authFlow,
|
|
1287
|
+
purify,
|
|
1288
|
+
headless,
|
|
1289
|
+
waitAfterLoadMs,
|
|
1290
|
+
waitForSelector,
|
|
1291
|
+
waitForSelectorTimeoutMs,
|
|
1292
|
+
useHttpResponseBody
|
|
1293
|
+
} = config;
|
|
1229
1294
|
const isHeadless = headless !== false;
|
|
1230
|
-
const browser = await
|
|
1295
|
+
const browser = await launchBrowser({
|
|
1231
1296
|
headless: isHeadless,
|
|
1232
1297
|
cacheDir,
|
|
1233
1298
|
proxy: resolveProxy(config),
|
|
@@ -1236,70 +1301,84 @@ async function fetchHtml(url, config = {}) {
|
|
|
1236
1301
|
const navigationTimeout = timeoutMs ?? 6e4;
|
|
1237
1302
|
const maxAttempts = 2;
|
|
1238
1303
|
let lastError;
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
await setupPage(page, isHeadless);
|
|
1249
|
-
const extraHeaders = { "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", ...headers ?? {} };
|
|
1250
|
-
if (cookies != null && cookies !== "") {
|
|
1251
|
-
extraHeaders.cookie = cookies;
|
|
1252
|
-
}
|
|
1253
|
-
await page.setExtraHTTPHeaders(extraHeaders);
|
|
1254
|
-
const proxy = resolveProxy(config);
|
|
1255
|
-
if (proxy) {
|
|
1256
|
-
const { username, password } = parseProxy(proxy);
|
|
1257
|
-
if (username !== void 0 || password !== void 0) {
|
|
1258
|
-
await page.authenticate({ username: username ?? "", password: password ?? "" });
|
|
1304
|
+
try {
|
|
1305
|
+
for (let attempt = 0; attempt < maxAttempts; attempt++) {
|
|
1306
|
+
const page = await browser.newPage();
|
|
1307
|
+
const isRetry = attempt === 1;
|
|
1308
|
+
const waitUntil = isRetry ? "domcontentloaded" : "load";
|
|
1309
|
+
const extraWaitMs = isRetry ? Math.min(500, Math.max(0, waitAfterLoadMs ?? 2e3)) : Math.max(0, waitAfterLoadMs ?? 2e3);
|
|
1310
|
+
try {
|
|
1311
|
+
if (config.browserContext) {
|
|
1312
|
+
await config.browserContext(page.browserContext());
|
|
1259
1313
|
}
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
}
|
|
1272
|
-
if (checkAuth != null || authFlow != null) {
|
|
1273
|
-
const authCheck = checkAuth ?? authFlow?.checkAuth;
|
|
1274
|
-
if (authCheck != null) {
|
|
1275
|
-
const ok = await authCheck(page, url);
|
|
1276
|
-
if (!ok) {
|
|
1277
|
-
throw new Error("checkAuth failed: 未通过认证检查,请先调用 ensureAuth 进行预处理登录");
|
|
1314
|
+
await setupPage(page, isHeadless);
|
|
1315
|
+
const extraHeaders = { "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", ...headers ?? {} };
|
|
1316
|
+
if (cookies != null && cookies !== "") {
|
|
1317
|
+
extraHeaders.cookie = cookies;
|
|
1318
|
+
}
|
|
1319
|
+
await page.setExtraHTTPHeaders(extraHeaders);
|
|
1320
|
+
const proxy = resolveProxy(config);
|
|
1321
|
+
if (proxy) {
|
|
1322
|
+
const { username, password } = parseProxy(proxy);
|
|
1323
|
+
if (username !== void 0 || password !== void 0) {
|
|
1324
|
+
await page.authenticate({ username: username ?? "", password: password ?? "" });
|
|
1278
1325
|
}
|
|
1279
1326
|
}
|
|
1327
|
+
if (timeoutMs != null) {
|
|
1328
|
+
await page.setDefaultNavigationTimeout(timeoutMs);
|
|
1329
|
+
}
|
|
1330
|
+
const response = await page.goto(url, { waitUntil, timeout: navigationTimeout });
|
|
1331
|
+
if (extraWaitMs > 0) {
|
|
1332
|
+
await new Promise((resolve2) => setTimeout(resolve2, extraWaitMs));
|
|
1333
|
+
}
|
|
1334
|
+
if (waitForSelector != null && waitForSelector !== "" && !isRetry) {
|
|
1335
|
+
const selectorTimeout = waitForSelectorTimeoutMs ?? 2e4;
|
|
1336
|
+
await page.waitForSelector(waitForSelector, { timeout: selectorTimeout });
|
|
1337
|
+
}
|
|
1338
|
+
if (checkAuth != null || authFlow != null) {
|
|
1339
|
+
const authCheck = checkAuth ?? authFlow?.checkAuth;
|
|
1340
|
+
if (authCheck != null) {
|
|
1341
|
+
const ok = await authCheck(page, url);
|
|
1342
|
+
if (!ok) {
|
|
1343
|
+
throw new Error("checkAuth failed: 未通过认证检查,请先调用 ensureAuth 进行预处理登录");
|
|
1344
|
+
}
|
|
1345
|
+
}
|
|
1346
|
+
}
|
|
1347
|
+
let rawBody;
|
|
1348
|
+
if (useHttpResponseBody === true && response != null) {
|
|
1349
|
+
try {
|
|
1350
|
+
rawBody = await response.text();
|
|
1351
|
+
} catch {
|
|
1352
|
+
rawBody = await page.content();
|
|
1353
|
+
}
|
|
1354
|
+
} else {
|
|
1355
|
+
rawBody = await page.content();
|
|
1356
|
+
}
|
|
1357
|
+
const finalUrl = response?.url() ?? page.url() ?? String(url);
|
|
1358
|
+
const status = response?.status() ?? 0;
|
|
1359
|
+
const statusText = response?.statusText() ?? "";
|
|
1360
|
+
const rawHeaders = response?.headers() ?? {};
|
|
1361
|
+
const normalizedHeaders = headersToRecord(rawHeaders);
|
|
1362
|
+
const body = applyPurify(rawBody, purify);
|
|
1363
|
+
await page.close().catch(() => {
|
|
1364
|
+
});
|
|
1365
|
+
return { finalUrl, status, statusText, headers: normalizedHeaders, body };
|
|
1366
|
+
} catch (e) {
|
|
1367
|
+
lastError = e;
|
|
1368
|
+
await page.close().catch(() => {
|
|
1369
|
+
});
|
|
1370
|
+
if (isRetry || !isFrameDetachedError(e)) {
|
|
1371
|
+
throw e;
|
|
1372
|
+
}
|
|
1373
|
+
logger.warn("scraper", "fetchHtml 因 frame 分离重试", { url, attempt: attempt + 1, err: e instanceof Error ? e.message : String(e) });
|
|
1374
|
+
await new Promise((r) => setTimeout(r, 800));
|
|
1280
1375
|
}
|
|
1281
|
-
const rawBody = await page.content();
|
|
1282
|
-
const finalUrl = response?.url() ?? page.url() ?? String(url);
|
|
1283
|
-
const status = response?.status() ?? 0;
|
|
1284
|
-
const statusText = response?.statusText() ?? "";
|
|
1285
|
-
const rawHeaders = response?.headers() ?? {};
|
|
1286
|
-
const normalizedHeaders = headersToRecord(rawHeaders);
|
|
1287
|
-
const body = applyPurify(rawBody, purify);
|
|
1288
|
-
await page.close().catch(() => {
|
|
1289
|
-
});
|
|
1290
|
-
return { finalUrl, status, statusText, headers: normalizedHeaders, body };
|
|
1291
|
-
} catch (e) {
|
|
1292
|
-
lastError = e;
|
|
1293
|
-
await page.close().catch(() => {
|
|
1294
|
-
});
|
|
1295
|
-
if (isRetry || !isFrameDetachedError(e)) {
|
|
1296
|
-
throw e;
|
|
1297
|
-
}
|
|
1298
|
-
logger.warn("scraper", "fetchHtml 因 frame 分离重试", { url, attempt: attempt + 1, err: e instanceof Error ? e.message : String(e) });
|
|
1299
|
-
await new Promise((r) => setTimeout(r, 800));
|
|
1300
1376
|
}
|
|
1377
|
+
throw lastError;
|
|
1378
|
+
} finally {
|
|
1379
|
+
await browser.close().catch(() => {
|
|
1380
|
+
});
|
|
1301
1381
|
}
|
|
1302
|
-
throw lastError;
|
|
1303
1382
|
}
|
|
1304
1383
|
const VALID_INTERVALS = ["1min", "5min", "10min", "30min", "1h", "6h", "12h", "1day", "3day", "7day"];
|
|
1305
1384
|
function cronToRefreshInterval(cronExpr) {
|
|
@@ -1471,19 +1550,78 @@ async function extractFromLink(link, extractorConfig = {}, fetchConfig = {}) {
|
|
|
1471
1550
|
cacheKey: extractorConfig.cacheKey ?? (cacheDir ? cacheKey(link, "forever") : void 0)
|
|
1472
1551
|
});
|
|
1473
1552
|
}
|
|
1553
|
+
const DEFAULT_BASE_URL = "https://api.openai.com/v1";
|
|
1554
|
+
const DEFAULT_MODEL = "gpt-4o-mini";
|
|
1555
|
+
let fileCache = null;
|
|
1556
|
+
function invalidateLLMConfigCache() {
|
|
1557
|
+
fileCache = null;
|
|
1558
|
+
}
|
|
1559
|
+
function readLlmFromFileSync() {
|
|
1560
|
+
if (!existsSync(CONFIG_PATH)) return {};
|
|
1561
|
+
try {
|
|
1562
|
+
const st = statSync(CONFIG_PATH);
|
|
1563
|
+
if (fileCache && fileCache.mtimeMs === st.mtimeMs) return fileCache.llm;
|
|
1564
|
+
const raw = readFileSync(CONFIG_PATH, "utf-8");
|
|
1565
|
+
const j = JSON.parse(raw);
|
|
1566
|
+
const llmRaw = j?.llm;
|
|
1567
|
+
const llm = {};
|
|
1568
|
+
if (llmRaw && typeof llmRaw === "object") {
|
|
1569
|
+
const o = llmRaw;
|
|
1570
|
+
if (typeof o.apiKey === "string" && o.apiKey.length > 0) llm.apiKey = o.apiKey;
|
|
1571
|
+
if (typeof o.baseUrl === "string" && o.baseUrl.trim()) llm.baseUrl = o.baseUrl.trim();
|
|
1572
|
+
if (typeof o.model === "string" && o.model.trim()) llm.model = o.model.trim();
|
|
1573
|
+
}
|
|
1574
|
+
fileCache = { mtimeMs: st.mtimeMs, llm };
|
|
1575
|
+
return llm;
|
|
1576
|
+
} catch {
|
|
1577
|
+
return {};
|
|
1578
|
+
}
|
|
1579
|
+
}
|
|
1474
1580
|
function getLLMConfig() {
|
|
1475
|
-
|
|
1476
|
-
|
|
1477
|
-
|
|
1478
|
-
|
|
1479
|
-
};
|
|
1581
|
+
const file = readLlmFromFileSync();
|
|
1582
|
+
const apiKey = file.apiKey ?? process.env.OPENAI_API_KEY;
|
|
1583
|
+
const baseUrl = file.baseUrl ?? process.env.OPENAI_BASE_URL ?? DEFAULT_BASE_URL;
|
|
1584
|
+
const model = file.model ?? process.env.OPENAI_MODEL ?? DEFAULT_MODEL;
|
|
1585
|
+
return { apiKey, baseUrl, model };
|
|
1586
|
+
}
|
|
1587
|
+
function extractAssistantText(completion) {
|
|
1588
|
+
const choice = completion.choices[0];
|
|
1589
|
+
if (!choice) throw new Error("LLM 返回无 choices");
|
|
1590
|
+
const msg = choice.message;
|
|
1591
|
+
const raw = msg.content;
|
|
1592
|
+
if (typeof raw === "string") {
|
|
1593
|
+
const t = raw.trim();
|
|
1594
|
+
if (t.length > 0) return t;
|
|
1595
|
+
}
|
|
1596
|
+
const extra = msg;
|
|
1597
|
+
const rc = extra.reasoning_content;
|
|
1598
|
+
if (typeof rc === "string" && rc.trim().length > 0) {
|
|
1599
|
+
return rc.trim();
|
|
1600
|
+
}
|
|
1601
|
+
const refusal = msg.refusal;
|
|
1602
|
+
if (typeof refusal === "string" && refusal.trim()) {
|
|
1603
|
+
throw new Error(`模型拒绝: ${refusal.trim()}`);
|
|
1604
|
+
}
|
|
1605
|
+
const fr = choice.finish_reason;
|
|
1606
|
+
if (fr === "tool_calls") {
|
|
1607
|
+
throw new Error("LLM 返回了工具调用而非文本,请换一个模型或关闭工具调用");
|
|
1608
|
+
}
|
|
1609
|
+
if (fr === "content_filter") {
|
|
1610
|
+
throw new Error("内容被内容策略过滤");
|
|
1611
|
+
}
|
|
1612
|
+
if (fr === "length") {
|
|
1613
|
+
throw new Error(
|
|
1614
|
+
"LLM 输出在 content / reasoning_content 均为空前已用尽"
|
|
1615
|
+
);
|
|
1616
|
+
}
|
|
1617
|
+
throw new Error(`LLM 返回空内容 (finish_reason=${String(fr)})`);
|
|
1480
1618
|
}
|
|
1481
1619
|
function mergeConfig(override) {
|
|
1482
1620
|
const env = getLLMConfig();
|
|
1483
1621
|
const apiKey = override?.apiKey ?? env.apiKey;
|
|
1484
1622
|
const baseUrl = override?.apiUrl ?? override?.baseUrl ?? env.baseUrl;
|
|
1485
1623
|
const model = override?.model ?? env.model;
|
|
1486
|
-
if (!apiKey) throw new Error("LLM API Key
|
|
1624
|
+
if (!apiKey) throw new Error("LLM API Key 未配置:请在管理后台「设置 → LLM」或环境变量 OPENAI_API_KEY 中设置");
|
|
1487
1625
|
return { apiKey, baseUrl, model };
|
|
1488
1626
|
}
|
|
1489
1627
|
async function chatJson(prompt, config, options) {
|
|
@@ -1495,8 +1633,7 @@ async function chatJson(prompt, config, options) {
|
|
|
1495
1633
|
max_tokens: options?.maxTokens ?? 8192,
|
|
1496
1634
|
response_format: { type: "json_object" }
|
|
1497
1635
|
});
|
|
1498
|
-
const content = completion
|
|
1499
|
-
if (!content) throw new Error("LLM 返回空内容");
|
|
1636
|
+
const content = extractAssistantText(completion);
|
|
1500
1637
|
return JSON.parse(content);
|
|
1501
1638
|
}
|
|
1502
1639
|
async function chatText(prompt, config, options) {
|
|
@@ -1507,9 +1644,7 @@ async function chatText(prompt, config, options) {
|
|
|
1507
1644
|
messages: [{ role: "user", content: prompt }],
|
|
1508
1645
|
max_tokens: options?.maxTokens ?? 8192
|
|
1509
1646
|
});
|
|
1510
|
-
|
|
1511
|
-
if (!content) throw new Error("LLM 返回空内容");
|
|
1512
|
-
return content;
|
|
1647
|
+
return extractAssistantText(completion);
|
|
1513
1648
|
}
|
|
1514
1649
|
function generateGuid(link) {
|
|
1515
1650
|
return createHash("sha256").update(link).digest("hex");
|
|
@@ -1593,7 +1728,7 @@ async function parseHtml(html, config = {}) {
|
|
|
1593
1728
|
const actualMode = mode ?? (llmConfig != null ? "llm" : customParser != null ? "custom" : "llm");
|
|
1594
1729
|
if (actualMode === "llm") {
|
|
1595
1730
|
if (llmConfig == null && !getLLMConfig().apiKey) {
|
|
1596
|
-
throw new Error('mode 为 "llm" 时必须提供 llmConfig
|
|
1731
|
+
throw new Error('mode 为 "llm" 时必须提供 llmConfig,或在后台「设置 → LLM」/ OPENAI_API_KEY 中配置 Key');
|
|
1597
1732
|
}
|
|
1598
1733
|
const htmlForLLM = applyPurify(html, purify !== false);
|
|
1599
1734
|
entries = await parseWithLLM(htmlForLLM, url, llmConfig ?? {});
|
|
@@ -1675,11 +1810,6 @@ function isValidSource(obj) {
|
|
|
1675
1810
|
const s = obj;
|
|
1676
1811
|
return typeof s.id === "string" && (typeof s.pattern === "string" || s.pattern instanceof RegExp) && typeof s.fetchItems === "function" && s.listUrlPattern === void 0;
|
|
1677
1812
|
}
|
|
1678
|
-
function isValidEnrichPlugin(obj) {
|
|
1679
|
-
if (obj == null || typeof obj !== "object") return false;
|
|
1680
|
-
const p = obj;
|
|
1681
|
-
return typeof p.id === "string" && typeof p.match === "function" && typeof p.enrichItem === "function";
|
|
1682
|
-
}
|
|
1683
1813
|
async function loadSourcePluginsFromDir(dir, label) {
|
|
1684
1814
|
const siteEntries = [];
|
|
1685
1815
|
const sources = [];
|
|
@@ -1711,46 +1841,12 @@ async function loadSourcePluginsFromDir(dir, label) {
|
|
|
1711
1841
|
}
|
|
1712
1842
|
return { siteEntries, sources };
|
|
1713
1843
|
}
|
|
1714
|
-
async function
|
|
1715
|
-
const
|
|
1716
|
-
let entries;
|
|
1717
|
-
try {
|
|
1718
|
-
const raw = await readdir(dir, { withFileTypes: true, encoding: "utf-8" });
|
|
1719
|
-
entries = raw;
|
|
1720
|
-
} catch {
|
|
1721
|
-
return result;
|
|
1722
|
-
}
|
|
1723
|
-
for (const e of entries) {
|
|
1724
|
-
const name = String(e.name);
|
|
1725
|
-
if (!e.isFile()) continue;
|
|
1726
|
-
if (!PLUGIN_EXTENSIONS.some((ext) => name.endsWith(ext))) continue;
|
|
1727
|
-
const filePath = join(dir, name);
|
|
1728
|
-
try {
|
|
1729
|
-
const mod = await import(pathToFileURL(filePath).href);
|
|
1730
|
-
const plugin = mod.default ?? mod;
|
|
1731
|
-
if (validator(plugin)) {
|
|
1732
|
-
result.push(plugin);
|
|
1733
|
-
} else {
|
|
1734
|
-
logger.warn("plugin", "插件接口不匹配,已跳过", { label, name });
|
|
1735
|
-
}
|
|
1736
|
-
} catch (err) {
|
|
1737
|
-
logger.warn("plugin", "插件加载失败", { label, name, err: err instanceof Error ? err.message : String(err) });
|
|
1738
|
-
}
|
|
1739
|
-
}
|
|
1740
|
-
return result;
|
|
1741
|
-
}
|
|
1742
|
-
async function loadFromSourcesOrRoot() {
|
|
1743
|
-
const [builtinFromSources, userFromSources] = await Promise.all([
|
|
1744
|
-
loadSourcePluginsFromDir(BUILTIN_SOURCES_DIR, "builtin:sources"),
|
|
1745
|
-
loadSourcePluginsFromDir(USER_SOURCES_DIR, "user:sources")
|
|
1746
|
-
]);
|
|
1747
|
-
const hasAny = builtinFromSources.siteEntries.length + builtinFromSources.sources.length + userFromSources.siteEntries.length + userFromSources.sources.length > 0;
|
|
1748
|
-
if (hasAny) return { builtin: builtinFromSources, user: userFromSources };
|
|
1749
|
-
const [builtinRoot, userRoot] = await Promise.all([
|
|
1844
|
+
async function loadBuiltinAndUser() {
|
|
1845
|
+
const [builtin, user] = await Promise.all([
|
|
1750
1846
|
loadSourcePluginsFromDir(BUILTIN_PLUGINS_DIR, "builtin"),
|
|
1751
1847
|
loadSourcePluginsFromDir(USER_PLUGINS_DIR, "user")
|
|
1752
1848
|
]);
|
|
1753
|
-
return { builtin
|
|
1849
|
+
return { builtin, user };
|
|
1754
1850
|
}
|
|
1755
1851
|
const pluginSitePaths = /* @__PURE__ */ new Map();
|
|
1756
1852
|
function mergeSourcePluginPaths(siteIds, pathMap, builtinSources, userSources) {
|
|
@@ -1774,7 +1870,7 @@ function getPluginFilePath(id) {
|
|
|
1774
1870
|
return pluginSitePaths.get(id);
|
|
1775
1871
|
}
|
|
1776
1872
|
async function loadSiteAndSourcePlugins() {
|
|
1777
|
-
const { builtin, user } = await
|
|
1873
|
+
const { builtin, user } = await loadBuiltinAndUser();
|
|
1778
1874
|
const siteMap = /* @__PURE__ */ new Map();
|
|
1779
1875
|
const pathMap = /* @__PURE__ */ new Map();
|
|
1780
1876
|
for (const { site: s, filePath } of builtin.siteEntries) {
|
|
@@ -1797,73 +1893,6 @@ async function loadSiteAndSourcePlugins() {
|
|
|
1797
1893
|
pathMap.forEach((path, id) => pluginSitePaths.set(id, path));
|
|
1798
1894
|
return { sites: Array.from(siteMap.values()), sources: Array.from(sourceMap.values()) };
|
|
1799
1895
|
}
|
|
1800
|
-
let registeredEnrichPlugins = [];
|
|
1801
|
-
async function loadEnrichPlugins() {
|
|
1802
|
-
const [builtin, user] = await Promise.all([
|
|
1803
|
-
loadPluginsFromDir(BUILTIN_ENRICH_DIR, "builtin:enrich", isValidEnrichPlugin),
|
|
1804
|
-
loadPluginsFromDir(USER_ENRICH_DIR, "user:enrich", isValidEnrichPlugin)
|
|
1805
|
-
]);
|
|
1806
|
-
const merged = /* @__PURE__ */ new Map();
|
|
1807
|
-
for (const p of builtin) merged.set(p.id, p);
|
|
1808
|
-
for (const p of user) {
|
|
1809
|
-
if (merged.has(p.id)) logger.info("plugin", "用户 Enrich 插件覆盖同名内置", { pluginId: p.id });
|
|
1810
|
-
merged.set(p.id, p);
|
|
1811
|
-
}
|
|
1812
|
-
const list = Array.from(merged.values());
|
|
1813
|
-
list.sort((a, b) => (a.priority ?? 100) - (b.priority ?? 100));
|
|
1814
|
-
registeredEnrichPlugins = list;
|
|
1815
|
-
return list;
|
|
1816
|
-
}
|
|
1817
|
-
function getMatchedEnrichPlugin(item, ctx) {
|
|
1818
|
-
return registeredEnrichPlugins.find((p) => p.match(item, ctx));
|
|
1819
|
-
}
|
|
1820
|
-
function buildEnrichContext(ctx) {
|
|
1821
|
-
return {
|
|
1822
|
-
cacheDir: ctx.cacheDir,
|
|
1823
|
-
headless: ctx.headless,
|
|
1824
|
-
proxy: ctx.proxy,
|
|
1825
|
-
async fetchHtml(url, opts) {
|
|
1826
|
-
const res = await fetchHtml(url, {
|
|
1827
|
-
cacheDir: ctx.cacheDir,
|
|
1828
|
-
useCache: false,
|
|
1829
|
-
authFlow: void 0,
|
|
1830
|
-
headless: ctx.headless,
|
|
1831
|
-
proxy: ctx.proxy,
|
|
1832
|
-
waitAfterLoadMs: opts?.waitMs,
|
|
1833
|
-
purify: opts?.purify
|
|
1834
|
-
});
|
|
1835
|
-
return { html: res.body, finalUrl: res.finalUrl ?? url, status: res.status };
|
|
1836
|
-
},
|
|
1837
|
-
async extractItem(item, opts) {
|
|
1838
|
-
const res = await fetchHtml(item.link, {
|
|
1839
|
-
cacheDir: ctx.cacheDir,
|
|
1840
|
-
useCache: false,
|
|
1841
|
-
authFlow: void 0,
|
|
1842
|
-
headless: ctx.headless,
|
|
1843
|
-
proxy: ctx.proxy
|
|
1844
|
-
});
|
|
1845
|
-
if (res.status !== 200 && res.status !== 304) {
|
|
1846
|
-
throw new Error(`默认正文提取失败: HTTP ${res.status} ${res.statusText} for ${item.link}`);
|
|
1847
|
-
}
|
|
1848
|
-
const extracted = await extractHtml(res.body, {
|
|
1849
|
-
url: res.finalUrl ?? item.link,
|
|
1850
|
-
cacheDir: ctx.cacheDir ?? void 0,
|
|
1851
|
-
mode: "readability",
|
|
1852
|
-
useCache: true,
|
|
1853
|
-
cacheKey: opts?.cacheKey
|
|
1854
|
-
});
|
|
1855
|
-
const pubDate = extracted.pubDate != null ? typeof extracted.pubDate === "string" ? new Date(extracted.pubDate) : extracted.pubDate : item.pubDate;
|
|
1856
|
-
return {
|
|
1857
|
-
...item,
|
|
1858
|
-
author: normalizeAuthor(extracted.author ?? item.author),
|
|
1859
|
-
title: extracted.title ?? item.title,
|
|
1860
|
-
summary: extracted.summary ?? item.summary,
|
|
1861
|
-
content: extracted.content ?? item.content,
|
|
1862
|
-
pubDate
|
|
1863
|
-
};
|
|
1864
|
-
}
|
|
1865
|
-
};
|
|
1866
|
-
}
|
|
1867
1896
|
function buildSiteContext(site, ctx) {
|
|
1868
1897
|
const proxy = ctx.proxy ?? site.proxy;
|
|
1869
1898
|
const authFlow = toAuthFlow(site);
|
|
@@ -1871,6 +1900,7 @@ function buildSiteContext(site, ctx) {
|
|
|
1871
1900
|
cacheDir: ctx.cacheDir,
|
|
1872
1901
|
headless: ctx.headless,
|
|
1873
1902
|
proxy,
|
|
1903
|
+
deps: ctx.deps,
|
|
1874
1904
|
async fetchHtml(url, opts) {
|
|
1875
1905
|
const res = await fetchHtml(url, {
|
|
1876
1906
|
cacheDir: ctx.cacheDir,
|
|
@@ -1881,7 +1911,8 @@ function buildSiteContext(site, ctx) {
|
|
|
1881
1911
|
waitAfterLoadMs: opts?.waitMs,
|
|
1882
1912
|
purify: opts?.purify,
|
|
1883
1913
|
waitForSelector: opts?.waitForSelector,
|
|
1884
|
-
waitForSelectorTimeoutMs: opts?.waitForSelectorTimeoutMs
|
|
1914
|
+
waitForSelectorTimeoutMs: opts?.waitForSelectorTimeoutMs,
|
|
1915
|
+
useHttpResponseBody: opts?.useHttpResponseBody
|
|
1885
1916
|
});
|
|
1886
1917
|
return { html: res.body, finalUrl: res.finalUrl ?? url, status: res.status };
|
|
1887
1918
|
},
|
|
@@ -1925,15 +1956,15 @@ function createWebSource(site) {
|
|
|
1925
1956
|
proxy: site.proxy ?? void 0,
|
|
1926
1957
|
preCheck: authFlow ? async (ctx) => {
|
|
1927
1958
|
if (!ctx.cacheDir) return;
|
|
1928
|
-
const passed = await preCheckAuth(authFlow, ctx.cacheDir
|
|
1959
|
+
const passed = await preCheckAuth(authFlow, ctx.cacheDir, {
|
|
1960
|
+
proxy: ctx.proxy,
|
|
1961
|
+
headless: ctx.headless
|
|
1962
|
+
});
|
|
1929
1963
|
if (!passed) throw new AuthRequiredError(`站点 ${site.id} 需要登录,请先执行 ensureAuth`);
|
|
1930
1964
|
} : void 0,
|
|
1931
1965
|
async fetchItems(sourceId, ctx) {
|
|
1932
1966
|
return site.fetchItems(sourceId, buildSiteContext(site, ctx));
|
|
1933
|
-
}
|
|
1934
|
-
enrichItem: site.enrichItem ? async (item, ctx) => {
|
|
1935
|
-
return site.enrichItem(item, buildSiteContext(site, ctx));
|
|
1936
|
-
} : void 0
|
|
1967
|
+
}
|
|
1937
1968
|
};
|
|
1938
1969
|
}
|
|
1939
1970
|
const genericWebSource = {
|
|
@@ -1970,6 +2001,37 @@ function getPluginSites() {
|
|
|
1970
2001
|
function getBestSite(url) {
|
|
1971
2002
|
return getSiteByUrl(url, loadedSites);
|
|
1972
2003
|
}
|
|
2004
|
+
const PLUGIN_HOST_DEPS = {
|
|
2005
|
+
parseHtml: parse,
|
|
2006
|
+
NodeType,
|
|
2007
|
+
createHash,
|
|
2008
|
+
RssParser,
|
|
2009
|
+
HttpsProxyAgent,
|
|
2010
|
+
ImapFlow,
|
|
2011
|
+
simpleParser,
|
|
2012
|
+
logger
|
|
2013
|
+
};
|
|
2014
|
+
function buildSourceContext(partial) {
|
|
2015
|
+
const { cacheDir, headless, proxy } = partial;
|
|
2016
|
+
return {
|
|
2017
|
+
...partial,
|
|
2018
|
+
deps: PLUGIN_HOST_DEPS,
|
|
2019
|
+
async fetchHtml(url, opts) {
|
|
2020
|
+
const res = await fetchHtml(url, {
|
|
2021
|
+
cacheDir,
|
|
2022
|
+
useCache: false,
|
|
2023
|
+
headless,
|
|
2024
|
+
proxy,
|
|
2025
|
+
waitAfterLoadMs: opts?.waitMs,
|
|
2026
|
+
purify: opts?.purify,
|
|
2027
|
+
waitForSelector: opts?.waitForSelector,
|
|
2028
|
+
waitForSelectorTimeoutMs: opts?.waitForSelectorTimeoutMs,
|
|
2029
|
+
useHttpResponseBody: opts?.useHttpResponseBody
|
|
2030
|
+
});
|
|
2031
|
+
return { html: res.body, finalUrl: res.finalUrl ?? url, status: res.status };
|
|
2032
|
+
}
|
|
2033
|
+
};
|
|
2034
|
+
}
|
|
1973
2035
|
const registeredSources = [];
|
|
1974
2036
|
function sourcePatternToRegex(pattern) {
|
|
1975
2037
|
if (pattern instanceof RegExp) return pattern;
|
|
@@ -1992,10 +2054,7 @@ function getSource(sourceId) {
|
|
|
1992
2054
|
return genericWebSource;
|
|
1993
2055
|
}
|
|
1994
2056
|
async function initSources() {
|
|
1995
|
-
const
|
|
1996
|
-
loadSiteAndSourcePlugins(),
|
|
1997
|
-
loadEnrichPlugins()
|
|
1998
|
-
]);
|
|
2057
|
+
const siteResult = await loadSiteAndSourcePlugins();
|
|
1999
2058
|
const { sites, sources: sourcePlugins } = siteResult;
|
|
2000
2059
|
setLoadedSites(sites);
|
|
2001
2060
|
registeredSources.length = 0;
|
|
@@ -2016,6 +2075,38 @@ async function initSources() {
|
|
|
2016
2075
|
function resolveRef(src) {
|
|
2017
2076
|
return src.ref ?? src.url ?? "";
|
|
2018
2077
|
}
|
|
2078
|
+
async function readGlobalProxyFromConfig() {
|
|
2079
|
+
try {
|
|
2080
|
+
const raw = await readFile(CONFIG_PATH, "utf-8");
|
|
2081
|
+
const j = JSON.parse(raw);
|
|
2082
|
+
if (typeof j.globalProxy === "string") {
|
|
2083
|
+
const t = j.globalProxy.trim();
|
|
2084
|
+
return t.length > 0 ? t : void 0;
|
|
2085
|
+
}
|
|
2086
|
+
} catch {
|
|
2087
|
+
}
|
|
2088
|
+
return void 0;
|
|
2089
|
+
}
|
|
2090
|
+
async function saveGlobalProxyToConfig(proxy) {
|
|
2091
|
+
let root = {};
|
|
2092
|
+
try {
|
|
2093
|
+
const raw = await readFile(CONFIG_PATH, "utf-8");
|
|
2094
|
+
root = JSON.parse(raw);
|
|
2095
|
+
} catch {
|
|
2096
|
+
}
|
|
2097
|
+
const t = proxy.trim();
|
|
2098
|
+
if (t.length === 0) {
|
|
2099
|
+
delete root.globalProxy;
|
|
2100
|
+
} else {
|
|
2101
|
+
root.globalProxy = t;
|
|
2102
|
+
}
|
|
2103
|
+
await writeFile(CONFIG_PATH, JSON.stringify(root, null, 2) + "\n", "utf-8");
|
|
2104
|
+
}
|
|
2105
|
+
async function resolveProxyForSite(site) {
|
|
2106
|
+
const s = site.proxy?.trim();
|
|
2107
|
+
if (s) return s;
|
|
2108
|
+
return readGlobalProxyFromConfig();
|
|
2109
|
+
}
|
|
2019
2110
|
async function loadSourcesFile() {
|
|
2020
2111
|
try {
|
|
2021
2112
|
const raw = await readFile(SOURCES_CONFIG_PATH, "utf-8");
|
|
@@ -2061,6 +2152,15 @@ async function saveSourcesFile(sources) {
|
|
|
2061
2152
|
"utf-8"
|
|
2062
2153
|
);
|
|
2063
2154
|
}
|
|
2155
|
+
async function getEffectiveProxyForListUrl(listUrl, source) {
|
|
2156
|
+
const list = await getAllSources();
|
|
2157
|
+
const sub = list.find((s) => resolveRef(s) === listUrl);
|
|
2158
|
+
const fromSub = sub?.proxy?.trim();
|
|
2159
|
+
if (fromSub) return fromSub;
|
|
2160
|
+
const fromPlugin = source.proxy?.trim();
|
|
2161
|
+
if (fromPlugin) return fromPlugin;
|
|
2162
|
+
return readGlobalProxyFromConfig();
|
|
2163
|
+
}
|
|
2064
2164
|
async function getSourcesRaw() {
|
|
2065
2165
|
try {
|
|
2066
2166
|
const raw = await readFile(SOURCES_CONFIG_PATH, "utf-8");
|
|
@@ -2378,40 +2478,221 @@ function onFeedUpdated(fn) {
|
|
|
2378
2478
|
eventBus.on("feed:updated", fn);
|
|
2379
2479
|
return () => eventBus.off("feed:updated", fn);
|
|
2380
2480
|
}
|
|
2381
|
-
|
|
2382
|
-
concurrency: 2,
|
|
2383
|
-
maxRetries: 2
|
|
2384
|
-
};
|
|
2385
|
-
async function loadEnrichConfig() {
|
|
2386
|
-
let fileEnrich = {};
|
|
2481
|
+
async function getDeliverConfig() {
|
|
2387
2482
|
try {
|
|
2388
|
-
const raw = await readFile(
|
|
2389
|
-
const
|
|
2390
|
-
|
|
2391
|
-
|
|
2392
|
-
|
|
2483
|
+
const raw = await readFile(CONFIG_PATH, "utf-8");
|
|
2484
|
+
const j = JSON.parse(raw);
|
|
2485
|
+
const u = j?.deliver?.url;
|
|
2486
|
+
const t = j?.deliver?.token;
|
|
2487
|
+
return {
|
|
2488
|
+
url: typeof u === "string" ? u.trim() : "",
|
|
2489
|
+
token: typeof t === "string" ? t.trim() : ""
|
|
2490
|
+
};
|
|
2393
2491
|
} catch {
|
|
2492
|
+
return { url: "", token: "" };
|
|
2394
2493
|
}
|
|
2395
|
-
return {
|
|
2396
|
-
concurrency: Number(fileEnrich["concurrency"] ?? process.env.ENRICH_CONCURRENCY ?? DEFAULTS.concurrency),
|
|
2397
|
-
maxRetries: Number(fileEnrich["maxRetries"] ?? process.env.ENRICH_MAX_RETRIES ?? DEFAULTS.maxRetries)
|
|
2398
|
-
};
|
|
2399
2494
|
}
|
|
2400
|
-
|
|
2401
|
-
|
|
2402
|
-
|
|
2403
|
-
const
|
|
2404
|
-
|
|
2405
|
-
|
|
2406
|
-
|
|
2407
|
-
const
|
|
2408
|
-
|
|
2409
|
-
|
|
2410
|
-
|
|
2411
|
-
|
|
2412
|
-
|
|
2413
|
-
|
|
2414
|
-
|
|
2495
|
+
async function saveDeliverConfig(config) {
|
|
2496
|
+
let root = {};
|
|
2497
|
+
try {
|
|
2498
|
+
const raw = await readFile(CONFIG_PATH, "utf-8");
|
|
2499
|
+
root = JSON.parse(raw);
|
|
2500
|
+
} catch {
|
|
2501
|
+
}
|
|
2502
|
+
const prev = root.deliver;
|
|
2503
|
+
const base2 = typeof prev === "object" && prev !== null && !Array.isArray(prev) ? { ...prev } : {};
|
|
2504
|
+
const url = config.url.trim();
|
|
2505
|
+
const token = config.token.trim();
|
|
2506
|
+
const next = { ...base2, url };
|
|
2507
|
+
if (token) next.token = token;
|
|
2508
|
+
else delete next.token;
|
|
2509
|
+
root.deliver = next;
|
|
2510
|
+
await writeFile(CONFIG_PATH, JSON.stringify(root, null, 2) + "\n", "utf-8");
|
|
2511
|
+
}
|
|
2512
|
+
function feedItemsToPayload(items) {
|
|
2513
|
+
return items.map((i) => ({
|
|
2514
|
+
guid: i.guid,
|
|
2515
|
+
title: i.title,
|
|
2516
|
+
link: i.link,
|
|
2517
|
+
pubDate: i.pubDate instanceof Date ? i.pubDate.toISOString() : (/* @__PURE__ */ new Date()).toISOString(),
|
|
2518
|
+
author: i.author,
|
|
2519
|
+
summary: i.summary,
|
|
2520
|
+
content: i.content,
|
|
2521
|
+
tags: i.tags,
|
|
2522
|
+
sourceRef: i.sourceRef,
|
|
2523
|
+
translations: i.translations
|
|
2524
|
+
}));
|
|
2525
|
+
}
|
|
2526
|
+
async function postDeliverItems(url, sourceRef, items, options) {
|
|
2527
|
+
if (!url.trim() || items.length === 0) return;
|
|
2528
|
+
const body = JSON.stringify({ sourceRef, items: feedItemsToPayload(items) });
|
|
2529
|
+
const headers = { "Content-Type": "application/json" };
|
|
2530
|
+
const t = options?.bearerToken?.trim();
|
|
2531
|
+
if (t) headers.Authorization = `Bearer ${t}`;
|
|
2532
|
+
const res = await fetch(url.trim(), {
|
|
2533
|
+
method: "POST",
|
|
2534
|
+
headers,
|
|
2535
|
+
body,
|
|
2536
|
+
signal: AbortSignal.timeout(12e4)
|
|
2537
|
+
});
|
|
2538
|
+
if (!res.ok) {
|
|
2539
|
+
const text = await res.text().catch(() => "");
|
|
2540
|
+
throw new Error(`HTTP ${res.status}${text ? `: ${text.slice(0, 200)}` : ""}`);
|
|
2541
|
+
}
|
|
2542
|
+
}
|
|
2543
|
+
async function postDeliverItemsSafe(url, sourceRef, items, options) {
|
|
2544
|
+
try {
|
|
2545
|
+
await postDeliverItems(url, sourceRef, items, options);
|
|
2546
|
+
} catch (err) {
|
|
2547
|
+
logger.warn("deliver", "投递失败", {
|
|
2548
|
+
sourceRef,
|
|
2549
|
+
count: items.length,
|
|
2550
|
+
err: err instanceof Error ? err.message : String(err)
|
|
2551
|
+
});
|
|
2552
|
+
}
|
|
2553
|
+
}
|
|
2554
|
+
function resolveHeadlessForFeeder(config) {
|
|
2555
|
+
if (config.force === true) {
|
|
2556
|
+
return config.headless === true ? true : false;
|
|
2557
|
+
}
|
|
2558
|
+
return config.headless;
|
|
2559
|
+
}
|
|
2560
|
+
function buildChannelFromItems(listUrl, items, lng) {
|
|
2561
|
+
const channel = {
|
|
2562
|
+
title: items[0]?.author?.length ? `${items[0].author[0]} 的订阅` : "RSS 订阅",
|
|
2563
|
+
link: listUrl,
|
|
2564
|
+
description: `来自 ${listUrl} 的订阅`
|
|
2565
|
+
};
|
|
2566
|
+
if (lng) channel.language = lng;
|
|
2567
|
+
return channel;
|
|
2568
|
+
}
|
|
2569
|
+
function toRssEntry(item, lng) {
|
|
2570
|
+
const eff = getEffectiveItemFields(item, lng);
|
|
2571
|
+
const hasContent = eff.content != null && eff.content !== "";
|
|
2572
|
+
const desc = hasContent ? eff.content : eff.summary;
|
|
2573
|
+
return {
|
|
2574
|
+
title: eff.title,
|
|
2575
|
+
link: item.link,
|
|
2576
|
+
description: desc,
|
|
2577
|
+
guid: item.guid,
|
|
2578
|
+
published: item.pubDate?.toISOString?.() ?? void 0,
|
|
2579
|
+
imageUrl: item.imageUrl
|
|
2580
|
+
};
|
|
2581
|
+
}
|
|
2582
|
+
const generatingKeys = /* @__PURE__ */ new Map();
|
|
2583
|
+
const pipelineCtx = {
|
|
2584
|
+
llm: { chatJson, chatText },
|
|
2585
|
+
db: { getSystemTags }
|
|
2586
|
+
};
|
|
2587
|
+
async function runPipelineOnItem(item, ctx) {
|
|
2588
|
+
return runPipeline(item, { ...pipelineCtx, ...ctx });
|
|
2589
|
+
}
|
|
2590
|
+
async function generateAndCache(listUrl, key, config, proxy) {
|
|
2591
|
+
const { cacheDir = "cache" } = config;
|
|
2592
|
+
const headless = resolveHeadlessForFeeder(config);
|
|
2593
|
+
const source = getSource(listUrl);
|
|
2594
|
+
const ctx = buildSourceContext({ cacheDir, headless, proxy });
|
|
2595
|
+
let items;
|
|
2596
|
+
try {
|
|
2597
|
+
items = await source.fetchItems(listUrl, ctx);
|
|
2598
|
+
} catch (err) {
|
|
2599
|
+
generatingKeys.delete(key);
|
|
2600
|
+
const message = err instanceof Error ? err.message : String(err);
|
|
2601
|
+
logger.error("scraper", "抓取失败", { source_url: listUrl, err: message });
|
|
2602
|
+
throw err;
|
|
2603
|
+
}
|
|
2604
|
+
const sourceRefStored = canonicalHttpSourceRef(listUrl);
|
|
2605
|
+
items.forEach((i) => {
|
|
2606
|
+
i.sourceRef = sourceRefStored;
|
|
2607
|
+
i.author = normalizeAuthor(i.author);
|
|
2608
|
+
});
|
|
2609
|
+
generatingKeys.delete(key);
|
|
2610
|
+
logger.info("scraper", "抓取成功", { source_url: listUrl, count: items.length });
|
|
2611
|
+
const { url: deliverUrl, token: deliverToken } = await getDeliverConfig();
|
|
2612
|
+
let newCount = 0;
|
|
2613
|
+
let newIds = /* @__PURE__ */ new Set();
|
|
2614
|
+
const upsertResult = await upsertItems(items).catch((err) => {
|
|
2615
|
+
logger.warn("db", "upsertItems 失败", { source_url: listUrl, err: err instanceof Error ? err.message : String(err) });
|
|
2616
|
+
return { newCount: 0, newIds: /* @__PURE__ */ new Set() };
|
|
2617
|
+
});
|
|
2618
|
+
newCount = upsertResult.newCount;
|
|
2619
|
+
newIds = upsertResult.newIds;
|
|
2620
|
+
let pipelineDroppedNew = 0;
|
|
2621
|
+
const shouldRunPipelineRow = (guid) => newIds.has(guid);
|
|
2622
|
+
for (let i = 0; i < items.length; i++) {
|
|
2623
|
+
if (!shouldRunPipelineRow(items[i].guid)) continue;
|
|
2624
|
+
const processed = await runPipelineOnItem(items[i], { sourceUrl: sourceRefStored });
|
|
2625
|
+
items[i] = processed;
|
|
2626
|
+
if (isPipelineDroppedItem(processed)) {
|
|
2627
|
+
await deleteItem(processed.guid).catch(
|
|
2628
|
+
(err) => logger.warn("db", "质量过滤后删除条目失败", { source_url: listUrl, err: err instanceof Error ? err.message : String(err) })
|
|
2629
|
+
);
|
|
2630
|
+
pipelineDroppedNew++;
|
|
2631
|
+
} else {
|
|
2632
|
+
updateItemContent(processed).catch(
|
|
2633
|
+
(err) => logger.warn("db", "updateItemContent 失败", { source_url: listUrl, err: err instanceof Error ? err.message : String(err) })
|
|
2634
|
+
);
|
|
2635
|
+
}
|
|
2636
|
+
}
|
|
2637
|
+
if (newCount > 0) {
|
|
2638
|
+
emitFeedUpdated({ sourceUrl: sourceRefStored, newCount: newCount - pipelineDroppedNew });
|
|
2639
|
+
}
|
|
2640
|
+
const out = items.filter((i) => !isPipelineDroppedItem(i));
|
|
2641
|
+
if (deliverUrl && out.length > 0) {
|
|
2642
|
+
await postDeliverItemsSafe(deliverUrl, sourceRefStored, out, {
|
|
2643
|
+
bearerToken: deliverToken || void 0
|
|
2644
|
+
});
|
|
2645
|
+
}
|
|
2646
|
+
return { items: out };
|
|
2647
|
+
}
|
|
2648
|
+
async function getItems(listUrl, config = {}) {
|
|
2649
|
+
const source = getSource(listUrl);
|
|
2650
|
+
const proxy = await getEffectiveProxyForListUrl(listUrl, source);
|
|
2651
|
+
const headless = resolveHeadlessForFeeder(config);
|
|
2652
|
+
const key = config.cron ? cacheKeyFromCron(listUrl, config.cron) : cacheKey(listUrl, config.refreshInterval ?? source.refreshInterval ?? "1day");
|
|
2653
|
+
if (source.preCheck != null) {
|
|
2654
|
+
try {
|
|
2655
|
+
await source.preCheck(
|
|
2656
|
+
buildSourceContext({
|
|
2657
|
+
cacheDir: config.cacheDir ?? "cache",
|
|
2658
|
+
headless,
|
|
2659
|
+
proxy
|
|
2660
|
+
})
|
|
2661
|
+
);
|
|
2662
|
+
} catch (err) {
|
|
2663
|
+
if (err instanceof AuthRequiredError) throw err;
|
|
2664
|
+
throw err;
|
|
2665
|
+
}
|
|
2666
|
+
}
|
|
2667
|
+
let task = config.force ? void 0 : generatingKeys.get(key);
|
|
2668
|
+
if (!task) {
|
|
2669
|
+
task = generateAndCache(listUrl, key, config, proxy);
|
|
2670
|
+
if (!config.force) generatingKeys.set(key, task);
|
|
2671
|
+
}
|
|
2672
|
+
const { items } = await task;
|
|
2673
|
+
return { items, fromCache: false };
|
|
2674
|
+
}
|
|
2675
|
+
function feedItemsToRssXml(items, listUrl, lng, opts) {
|
|
2676
|
+
const channel = buildChannelFromItems(listUrl, items, lng);
|
|
2677
|
+
if (opts?.channelTitle) channel.title = opts.channelTitle;
|
|
2678
|
+
if (opts?.channelDesc) channel.description = opts.channelDesc;
|
|
2679
|
+
return buildRssXml(channel, items.map((it) => toRssEntry(it, lng)));
|
|
2680
|
+
}
|
|
2681
|
+
const validateCron = validate;
|
|
2682
|
+
const tasks$1 = /* @__PURE__ */ new Map();
|
|
2683
|
+
const groups = /* @__PURE__ */ new Map();
|
|
2684
|
+
const DEFAULT_RETRY_DELAY_MS = 5e3;
|
|
2685
|
+
const DEFAULT_GROUP_CONCURRENCY = 10;
|
|
2686
|
+
async function runWithRetry(task, options) {
|
|
2687
|
+
const retries = options.retries ?? 0;
|
|
2688
|
+
const retryDelayMs = options.retryDelayMs ?? DEFAULT_RETRY_DELAY_MS;
|
|
2689
|
+
let lastErr;
|
|
2690
|
+
for (let attempt = 0; attempt <= retries; attempt++) {
|
|
2691
|
+
try {
|
|
2692
|
+
await task();
|
|
2693
|
+
return;
|
|
2694
|
+
} catch (err) {
|
|
2695
|
+
lastErr = err;
|
|
2415
2696
|
if (attempt < retries) {
|
|
2416
2697
|
await new Promise((r) => setTimeout(r, retryDelayMs));
|
|
2417
2698
|
}
|
|
@@ -2580,365 +2861,8 @@ function getGroupStats() {
|
|
|
2580
2861
|
}
|
|
2581
2862
|
return result;
|
|
2582
2863
|
}
|
|
2583
|
-
const ENRICH_GROUP = "enrich";
|
|
2584
|
-
const MAX_STORED_TASKS = 200;
|
|
2585
|
-
const RETRY_DELAY_MS = 3e3;
|
|
2586
|
-
class EnrichQueue {
|
|
2587
|
-
tasks = /* @__PURE__ */ new Map();
|
|
2588
|
-
taskItems = /* @__PURE__ */ new Map();
|
|
2589
|
-
taskCallbacks = /* @__PURE__ */ new Map();
|
|
2590
|
-
configLoaded = false;
|
|
2591
|
-
async ensureConfig() {
|
|
2592
|
-
if (this.configLoaded) return { concurrency: 2, maxRetries: 2 };
|
|
2593
|
-
const config = await loadEnrichConfig();
|
|
2594
|
-
this.configLoaded = true;
|
|
2595
|
-
logger.info("scraper", "配置加载完成", { concurrency: config.concurrency, maxRetries: config.maxRetries });
|
|
2596
|
-
return config;
|
|
2597
|
-
}
|
|
2598
|
-
evictIfNeeded() {
|
|
2599
|
-
if (this.tasks.size <= MAX_STORED_TASKS) return;
|
|
2600
|
-
const ids = [...this.tasks.keys()];
|
|
2601
|
-
for (const id of ids) {
|
|
2602
|
-
if (this.tasks.get(id)?.status === "done") {
|
|
2603
|
-
this.removeTask(id);
|
|
2604
|
-
if (this.tasks.size <= MAX_STORED_TASKS) return;
|
|
2605
|
-
}
|
|
2606
|
-
}
|
|
2607
|
-
if (this.tasks.size > MAX_STORED_TASKS) this.removeTask(ids[0]);
|
|
2608
|
-
}
|
|
2609
|
-
removeTask(id) {
|
|
2610
|
-
this.tasks.delete(id);
|
|
2611
|
-
this.taskItems.delete(id);
|
|
2612
|
-
this.taskCallbacks.delete(id);
|
|
2613
|
-
}
|
|
2614
|
-
checkTaskComplete(taskId) {
|
|
2615
|
-
const task = this.tasks.get(taskId);
|
|
2616
|
-
const items = this.taskItems.get(taskId);
|
|
2617
|
-
const callbacks = this.taskCallbacks.get(taskId);
|
|
2618
|
-
if (!task || !items) return;
|
|
2619
|
-
const allSettled = task.itemResults.every((r) => r.status === "done" || r.status === "failed");
|
|
2620
|
-
if (!allSettled) return;
|
|
2621
|
-
task.status = "done";
|
|
2622
|
-
task.completedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
2623
|
-
logger.info("scraper", "任务完成", {
|
|
2624
|
-
source_url: task.sourceUrl,
|
|
2625
|
-
taskId,
|
|
2626
|
-
done: task.progress.done,
|
|
2627
|
-
failed: task.progress.failed
|
|
2628
|
-
});
|
|
2629
|
-
Promise.resolve(callbacks?.onAllDone?.(items)).catch((err) => {
|
|
2630
|
-
logger.warn("scraper", "onAllDone 回调异常", { taskId, err: err instanceof Error ? err.message : String(err) });
|
|
2631
|
-
});
|
|
2632
|
-
}
|
|
2633
|
-
async submit(items, enrichFn, ctx, opts) {
|
|
2634
|
-
const config = await this.ensureConfig();
|
|
2635
|
-
const id = randomUUID();
|
|
2636
|
-
const itemResults = items.map((_, i) => ({
|
|
2637
|
-
index: i,
|
|
2638
|
-
status: "pending",
|
|
2639
|
-
retries: 0
|
|
2640
|
-
}));
|
|
2641
|
-
const task = {
|
|
2642
|
-
id,
|
|
2643
|
-
sourceUrl: opts.sourceUrl,
|
|
2644
|
-
status: items.length === 0 ? "done" : "pending",
|
|
2645
|
-
progress: { total: items.length, done: 0, failed: 0 },
|
|
2646
|
-
itemResults,
|
|
2647
|
-
createdAt: (/* @__PURE__ */ new Date()).toISOString(),
|
|
2648
|
-
completedAt: items.length === 0 ? (/* @__PURE__ */ new Date()).toISOString() : void 0
|
|
2649
|
-
};
|
|
2650
|
-
const itemsCopy = [...items];
|
|
2651
|
-
this.tasks.set(id, task);
|
|
2652
|
-
this.taskItems.set(id, itemsCopy);
|
|
2653
|
-
this.taskCallbacks.set(id, opts);
|
|
2654
|
-
this.evictIfNeeded();
|
|
2655
|
-
for (let i = 0; i < items.length; i++) {
|
|
2656
|
-
const itemIndex = i;
|
|
2657
|
-
const workId = `${id}-${i}`;
|
|
2658
|
-
const taskFn = async () => {
|
|
2659
|
-
const t = this.tasks.get(id);
|
|
2660
|
-
const its = this.taskItems.get(id);
|
|
2661
|
-
const cbs = this.taskCallbacks.get(id);
|
|
2662
|
-
if (!t || !its || !cbs) return;
|
|
2663
|
-
const itemResult = t.itemResults[itemIndex];
|
|
2664
|
-
if (!itemResult) return;
|
|
2665
|
-
itemResult.status = "running";
|
|
2666
|
-
if (t.status === "pending") t.status = "running";
|
|
2667
|
-
for (let r = 0; r <= config.maxRetries; r++) {
|
|
2668
|
-
try {
|
|
2669
|
-
const enriched = await enrichFn(its[itemIndex], ctx);
|
|
2670
|
-
its[itemIndex] = enriched;
|
|
2671
|
-
itemResult.item = enriched;
|
|
2672
|
-
itemResult.status = "done";
|
|
2673
|
-
t.progress.done++;
|
|
2674
|
-
await Promise.resolve(cbs.onItemDone?.(enriched, itemIndex));
|
|
2675
|
-
this.checkTaskComplete(id);
|
|
2676
|
-
return;
|
|
2677
|
-
} catch (err) {
|
|
2678
|
-
const msg = err instanceof Error ? err.message : String(err);
|
|
2679
|
-
if (r < config.maxRetries) {
|
|
2680
|
-
logger.warn("scraper", "提取失败,重试中", {
|
|
2681
|
-
source_url: t.sourceUrl,
|
|
2682
|
-
item_url: its[itemIndex]?.link,
|
|
2683
|
-
retries: r + 1,
|
|
2684
|
-
maxRetries: config.maxRetries,
|
|
2685
|
-
err: msg
|
|
2686
|
-
});
|
|
2687
|
-
await new Promise((resolve2) => setTimeout(resolve2, RETRY_DELAY_MS));
|
|
2688
|
-
} else {
|
|
2689
|
-
itemResult.status = "failed";
|
|
2690
|
-
itemResult.error = msg;
|
|
2691
|
-
t.progress.failed++;
|
|
2692
|
-
logger.warn("scraper", "提取最终失败", {
|
|
2693
|
-
source_url: t.sourceUrl,
|
|
2694
|
-
item_url: its[itemIndex]?.link,
|
|
2695
|
-
err: msg
|
|
2696
|
-
});
|
|
2697
|
-
const failedItem = { ...its[itemIndex], enrichFailed: true };
|
|
2698
|
-
its[itemIndex] = failedItem;
|
|
2699
|
-
await Promise.resolve(cbs.onItemDone?.(failedItem, itemIndex));
|
|
2700
|
-
this.checkTaskComplete(id);
|
|
2701
|
-
}
|
|
2702
|
-
}
|
|
2703
|
-
}
|
|
2704
|
-
};
|
|
2705
|
-
schedule(ENRICH_GROUP, workId, taskFn, { concurrency: config.concurrency }).catch(() => {
|
|
2706
|
-
});
|
|
2707
|
-
}
|
|
2708
|
-
return id;
|
|
2709
|
-
}
|
|
2710
|
-
getTask(id) {
|
|
2711
|
-
return this.tasks.get(id);
|
|
2712
|
-
}
|
|
2713
|
-
getTaskItems(id) {
|
|
2714
|
-
return this.taskItems.get(id);
|
|
2715
|
-
}
|
|
2716
|
-
}
|
|
2717
|
-
const enrichQueue = new EnrichQueue();
|
|
2718
|
-
async function getDeliverUrl() {
|
|
2719
|
-
try {
|
|
2720
|
-
const raw = await readFile(CONFIG_PATH, "utf-8");
|
|
2721
|
-
const j = JSON.parse(raw);
|
|
2722
|
-
const u = j?.deliver?.url;
|
|
2723
|
-
return typeof u === "string" ? u.trim() : "";
|
|
2724
|
-
} catch {
|
|
2725
|
-
return "";
|
|
2726
|
-
}
|
|
2727
|
-
}
|
|
2728
|
-
async function saveDeliverUrl(url) {
|
|
2729
|
-
let root = {};
|
|
2730
|
-
try {
|
|
2731
|
-
const raw = await readFile(CONFIG_PATH, "utf-8");
|
|
2732
|
-
root = JSON.parse(raw);
|
|
2733
|
-
} catch {
|
|
2734
|
-
}
|
|
2735
|
-
root.deliver = { url: url.trim() };
|
|
2736
|
-
await writeFile(CONFIG_PATH, JSON.stringify(root, null, 2) + "\n", "utf-8");
|
|
2737
|
-
}
|
|
2738
|
-
function feedItemsToPayload(items) {
|
|
2739
|
-
return items.map((i) => ({
|
|
2740
|
-
guid: i.guid,
|
|
2741
|
-
title: i.title,
|
|
2742
|
-
link: i.link,
|
|
2743
|
-
pubDate: i.pubDate instanceof Date ? i.pubDate.toISOString() : (/* @__PURE__ */ new Date()).toISOString(),
|
|
2744
|
-
author: i.author,
|
|
2745
|
-
summary: i.summary,
|
|
2746
|
-
content: i.content,
|
|
2747
|
-
tags: i.tags,
|
|
2748
|
-
sourceRef: i.sourceRef,
|
|
2749
|
-
translations: i.translations
|
|
2750
|
-
}));
|
|
2751
|
-
}
|
|
2752
|
-
async function postDeliverItems(url, sourceRef, items) {
|
|
2753
|
-
if (!url.trim() || items.length === 0) return;
|
|
2754
|
-
const body = JSON.stringify({ sourceRef, items: feedItemsToPayload(items) });
|
|
2755
|
-
const res = await fetch(url.trim(), {
|
|
2756
|
-
method: "POST",
|
|
2757
|
-
headers: { "Content-Type": "application/json" },
|
|
2758
|
-
body,
|
|
2759
|
-
signal: AbortSignal.timeout(12e4)
|
|
2760
|
-
});
|
|
2761
|
-
if (!res.ok) {
|
|
2762
|
-
const text = await res.text().catch(() => "");
|
|
2763
|
-
throw new Error(`HTTP ${res.status}${text ? `: ${text.slice(0, 200)}` : ""}`);
|
|
2764
|
-
}
|
|
2765
|
-
}
|
|
2766
|
-
async function postDeliverItemsSafe(url, sourceRef, items) {
|
|
2767
|
-
try {
|
|
2768
|
-
await postDeliverItems(url, sourceRef, items);
|
|
2769
|
-
} catch (err) {
|
|
2770
|
-
logger.warn("deliver", "投递失败", {
|
|
2771
|
-
sourceRef,
|
|
2772
|
-
count: items.length,
|
|
2773
|
-
err: err instanceof Error ? err.message : String(err)
|
|
2774
|
-
});
|
|
2775
|
-
}
|
|
2776
|
-
}
|
|
2777
|
-
function buildChannelFromItems(listUrl, items, lng) {
|
|
2778
|
-
const channel = {
|
|
2779
|
-
title: items[0]?.author?.length ? `${items[0].author[0]} 的订阅` : "RSS 订阅",
|
|
2780
|
-
link: listUrl,
|
|
2781
|
-
description: `来自 ${listUrl} 的订阅`
|
|
2782
|
-
};
|
|
2783
|
-
if (lng) channel.language = lng;
|
|
2784
|
-
return channel;
|
|
2785
|
-
}
|
|
2786
|
-
function toRssEntry(item, lng) {
|
|
2787
|
-
const eff = getEffectiveItemFields(item, lng);
|
|
2788
|
-
const hasContent = eff.content != null && eff.content !== "";
|
|
2789
|
-
const desc = hasContent ? eff.content : eff.summary;
|
|
2790
|
-
return {
|
|
2791
|
-
title: eff.title,
|
|
2792
|
-
link: item.link,
|
|
2793
|
-
description: desc,
|
|
2794
|
-
guid: item.guid,
|
|
2795
|
-
published: item.pubDate?.toISOString?.() ?? void 0,
|
|
2796
|
-
imageUrl: item.imageUrl
|
|
2797
|
-
};
|
|
2798
|
-
}
|
|
2799
|
-
const generatingKeys = /* @__PURE__ */ new Map();
|
|
2800
|
-
const pipelineCtx = {
|
|
2801
|
-
llm: { chatJson, chatText },
|
|
2802
|
-
db: { getSystemTags }
|
|
2803
|
-
};
|
|
2804
|
-
async function runPipelineOnItem(item, ctx) {
|
|
2805
|
-
return runPipeline(item, { ...pipelineCtx, ...ctx });
|
|
2806
|
-
}
|
|
2807
|
-
function buildEnrichFn(source, listUrl, ctx) {
|
|
2808
|
-
const enrichCtx = buildEnrichContext(ctx);
|
|
2809
|
-
enrichCtx.sourceUrl = listUrl;
|
|
2810
|
-
return async (item) => {
|
|
2811
|
-
let result = item;
|
|
2812
|
-
if (source.enrichItem) {
|
|
2813
|
-
result = await source.enrichItem(item, ctx);
|
|
2814
|
-
}
|
|
2815
|
-
const plugin = getMatchedEnrichPlugin(result, { sourceUrl: listUrl });
|
|
2816
|
-
if (plugin) {
|
|
2817
|
-
result = await plugin.enrichItem(result, enrichCtx);
|
|
2818
|
-
}
|
|
2819
|
-
return result;
|
|
2820
|
-
};
|
|
2821
|
-
}
|
|
2822
|
-
async function generateAndCache(listUrl, key, config) {
|
|
2823
|
-
const { cacheDir = "cache", includeContent = true, headless } = config;
|
|
2824
|
-
const source = getSource(listUrl);
|
|
2825
|
-
const ctx = { cacheDir, headless, proxy: config.proxy ?? source.proxy };
|
|
2826
|
-
let items;
|
|
2827
|
-
try {
|
|
2828
|
-
items = await source.fetchItems(listUrl, ctx);
|
|
2829
|
-
} catch (err) {
|
|
2830
|
-
generatingKeys.delete(key);
|
|
2831
|
-
const message = err instanceof Error ? err.message : String(err);
|
|
2832
|
-
logger.error("scraper", "抓取失败", { source_url: listUrl, err: message });
|
|
2833
|
-
throw err;
|
|
2834
|
-
}
|
|
2835
|
-
items.forEach((i) => {
|
|
2836
|
-
i.sourceRef = listUrl;
|
|
2837
|
-
i.author = normalizeAuthor(i.author);
|
|
2838
|
-
});
|
|
2839
|
-
generatingKeys.delete(key);
|
|
2840
|
-
logger.info("scraper", "抓取成功", { source_url: listUrl, count: items.length });
|
|
2841
|
-
const deliverUrl = await getDeliverUrl();
|
|
2842
|
-
let newCount = 0;
|
|
2843
|
-
let newIds = /* @__PURE__ */ new Set();
|
|
2844
|
-
const upsertResult = await upsertItems(items).catch((err) => {
|
|
2845
|
-
logger.warn("db", "upsertItems 失败", { source_url: listUrl, err: err instanceof Error ? err.message : String(err) });
|
|
2846
|
-
return { newCount: 0, newIds: /* @__PURE__ */ new Set() };
|
|
2847
|
-
});
|
|
2848
|
-
newCount = upsertResult.newCount;
|
|
2849
|
-
newIds = upsertResult.newIds;
|
|
2850
|
-
let pipelineDroppedNew = 0;
|
|
2851
|
-
const shouldRunPipelineRow = (guid) => newIds.has(guid);
|
|
2852
|
-
const hasEnrich = source.enrichItem != null || items.some((i) => getMatchedEnrichPlugin(i, { sourceUrl: listUrl }));
|
|
2853
|
-
if (!includeContent || items.length === 0 || !hasEnrich) {
|
|
2854
|
-
for (let i = 0; i < items.length; i++) {
|
|
2855
|
-
if (!shouldRunPipelineRow(items[i].guid)) continue;
|
|
2856
|
-
const processed = await runPipelineOnItem(items[i], { sourceUrl: listUrl, isEnriched: false });
|
|
2857
|
-
items[i] = processed;
|
|
2858
|
-
if (isPipelineDroppedItem(processed)) {
|
|
2859
|
-
await deleteItem(processed.guid).catch(
|
|
2860
|
-
(err) => logger.warn("db", "质量过滤后删除条目失败", { source_url: listUrl, err: err instanceof Error ? err.message : String(err) })
|
|
2861
|
-
);
|
|
2862
|
-
pipelineDroppedNew++;
|
|
2863
|
-
} else {
|
|
2864
|
-
updateItemContent(processed).catch(
|
|
2865
|
-
(err) => logger.warn("db", "updateItemContent 失败", { source_url: listUrl, err: err instanceof Error ? err.message : String(err) })
|
|
2866
|
-
);
|
|
2867
|
-
}
|
|
2868
|
-
}
|
|
2869
|
-
if (newCount > 0) {
|
|
2870
|
-
emitFeedUpdated({ sourceUrl: listUrl, newCount: newCount - pipelineDroppedNew });
|
|
2871
|
-
}
|
|
2872
|
-
const out = items.filter((i) => !isPipelineDroppedItem(i));
|
|
2873
|
-
if (deliverUrl && out.length > 0) {
|
|
2874
|
-
await postDeliverItemsSafe(deliverUrl, listUrl, out);
|
|
2875
|
-
}
|
|
2876
|
-
return { items: out };
|
|
2877
|
-
}
|
|
2878
|
-
const enrichFn = (item, _ctx) => buildEnrichFn(source, listUrl, ctx)(item);
|
|
2879
|
-
await enrichQueue.submit(
|
|
2880
|
-
items,
|
|
2881
|
-
enrichFn,
|
|
2882
|
-
ctx,
|
|
2883
|
-
{
|
|
2884
|
-
sourceUrl: listUrl,
|
|
2885
|
-
onItemDone: async (enrichedItem, index) => {
|
|
2886
|
-
enrichedItem.sourceRef = listUrl;
|
|
2887
|
-
const processed = shouldRunPipelineRow(enrichedItem.guid) ? await runPipelineOnItem(enrichedItem, { sourceUrl: listUrl, isEnriched: true }) : enrichedItem;
|
|
2888
|
-
items[index] = processed;
|
|
2889
|
-
if (isPipelineDroppedItem(processed)) {
|
|
2890
|
-
await deleteItem(processed.guid).catch(
|
|
2891
|
-
(err) => logger.warn("db", "质量过滤后删除条目失败", { source_url: listUrl, err: err instanceof Error ? err.message : String(err) })
|
|
2892
|
-
);
|
|
2893
|
-
pipelineDroppedNew++;
|
|
2894
|
-
} else {
|
|
2895
|
-
updateItemContent(processed).catch(
|
|
2896
|
-
(err) => logger.warn("db", "updateItemContent 失败", { source_url: listUrl, err: err instanceof Error ? err.message : String(err) })
|
|
2897
|
-
);
|
|
2898
|
-
}
|
|
2899
|
-
},
|
|
2900
|
-
onAllDone: async () => {
|
|
2901
|
-
for (let i = items.length - 1; i >= 0; i--) {
|
|
2902
|
-
if (isPipelineDroppedItem(items[i])) items.splice(i, 1);
|
|
2903
|
-
}
|
|
2904
|
-
if (newCount > 0) {
|
|
2905
|
-
emitFeedUpdated({ sourceUrl: listUrl, newCount: newCount - pipelineDroppedNew });
|
|
2906
|
-
}
|
|
2907
|
-
if (deliverUrl && items.length > 0) {
|
|
2908
|
-
await postDeliverItemsSafe(deliverUrl, listUrl, items);
|
|
2909
|
-
}
|
|
2910
|
-
}
|
|
2911
|
-
}
|
|
2912
|
-
);
|
|
2913
|
-
return { items };
|
|
2914
|
-
}
|
|
2915
|
-
async function getItems(listUrl, config = {}) {
|
|
2916
|
-
const source = getSource(listUrl);
|
|
2917
|
-
const key = config.cron ? cacheKeyFromCron(listUrl, config.cron) : cacheKey(listUrl, config.refreshInterval ?? source.refreshInterval ?? "1day");
|
|
2918
|
-
if (source.preCheck != null) {
|
|
2919
|
-
try {
|
|
2920
|
-
await source.preCheck({ cacheDir: config.cacheDir ?? "cache", headless: config.headless, proxy: config.proxy ?? source.proxy });
|
|
2921
|
-
} catch (err) {
|
|
2922
|
-
if (err instanceof AuthRequiredError) throw err;
|
|
2923
|
-
throw err;
|
|
2924
|
-
}
|
|
2925
|
-
}
|
|
2926
|
-
let task = config.force ? void 0 : generatingKeys.get(key);
|
|
2927
|
-
if (!task) {
|
|
2928
|
-
task = generateAndCache(listUrl, key, config);
|
|
2929
|
-
if (!config.force) generatingKeys.set(key, task);
|
|
2930
|
-
}
|
|
2931
|
-
const { items } = await task;
|
|
2932
|
-
return { items, fromCache: false };
|
|
2933
|
-
}
|
|
2934
|
-
function feedItemsToRssXml(items, listUrl, lng, opts) {
|
|
2935
|
-
const channel = buildChannelFromItems(listUrl, items, lng);
|
|
2936
|
-
if (opts?.channelTitle) channel.title = opts.channelTitle;
|
|
2937
|
-
if (opts?.channelDesc) channel.description = opts.channelDesc;
|
|
2938
|
-
return buildRssXml(channel, items.map((it) => toRssEntry(it, lng)));
|
|
2939
|
-
}
|
|
2940
2864
|
const DEFAULT_REFRESH = "1day";
|
|
2941
|
-
const SOURCES_CONCURRENCY =
|
|
2865
|
+
const SOURCES_CONCURRENCY = 1;
|
|
2942
2866
|
function createPullTask(ref, cacheDir, cronExpr) {
|
|
2943
2867
|
return async () => {
|
|
2944
2868
|
try {
|
|
@@ -2975,7 +2899,7 @@ async function rescheduleSources(cacheDir, runNow2) {
|
|
|
2975
2899
|
}
|
|
2976
2900
|
}
|
|
2977
2901
|
async function initScheduler(cacheDir) {
|
|
2978
|
-
await rescheduleSources(cacheDir,
|
|
2902
|
+
await rescheduleSources(cacheDir, false);
|
|
2979
2903
|
let debounceTimer = null;
|
|
2980
2904
|
try {
|
|
2981
2905
|
const watcher = watch(SOURCES_CONFIG_PATH, () => {
|
|
@@ -3044,27 +2968,19 @@ function registerRssApiRoutes(app) {
|
|
|
3044
2968
|
}
|
|
3045
2969
|
});
|
|
3046
2970
|
}
|
|
3047
|
-
function registerEnrichRoutes(app) {
|
|
3048
|
-
app.get("/api/enrich/:taskId", (c) => {
|
|
3049
|
-
const taskId = c.req.param("taskId");
|
|
3050
|
-
const task = enrichQueue.getTask(taskId);
|
|
3051
|
-
if (!task) return c.json({ error: "任务不存在或已过期" }, 404);
|
|
3052
|
-
return c.json(task);
|
|
3053
|
-
});
|
|
3054
|
-
}
|
|
3055
2971
|
function registerSchedulerRoutes(app) {
|
|
3056
2972
|
app.get("/api/scheduler/stats", requireAdmin(), (c) => {
|
|
3057
2973
|
const stats = getGroupStats();
|
|
3058
2974
|
return c.json(stats);
|
|
3059
2975
|
});
|
|
3060
2976
|
}
|
|
3061
|
-
const USER_SITE_TEMPLATE = join(BUILTIN_PLUGINS_DIR, "templates", "site.rssany.js");
|
|
3062
2977
|
const SITE_TEMPLATE_FALLBACK = `/**
|
|
3063
|
-
* Site
|
|
2978
|
+
* Site 插件模板(由 /plugins 页添加,位于 .rssany/plugins/)
|
|
2979
|
+
* HTML DOM 解析请用 ctx.deps.parseHtml,勿在插件内 import node_modules。
|
|
3064
2980
|
*/
|
|
3065
2981
|
export default {
|
|
3066
2982
|
id: "__PLUGIN_ID__",
|
|
3067
|
-
listUrlPattern:
|
|
2983
|
+
listUrlPattern: __LIST_URL_PATTERN__,
|
|
3068
2984
|
refreshInterval: "1day",
|
|
3069
2985
|
|
|
3070
2986
|
async fetchItems(sourceId, ctx) {
|
|
@@ -3072,7 +2988,7 @@ export default {
|
|
|
3072
2988
|
waitMs: 2000,
|
|
3073
2989
|
purify: true,
|
|
3074
2990
|
});
|
|
3075
|
-
void html;
|
|
2991
|
+
void ctx.deps.parseHtml(html);
|
|
3076
2992
|
void finalUrl;
|
|
3077
2993
|
return [];
|
|
3078
2994
|
},
|
|
@@ -3081,6 +2997,11 @@ export default {
|
|
|
3081
2997
|
function isValidNewPluginId(id) {
|
|
3082
2998
|
return /^[a-zA-Z][a-zA-Z0-9_-]{0,63}$/.test(id) && id !== "generic" && id !== "new";
|
|
3083
2999
|
}
|
|
3000
|
+
function isValidNewListUrlPattern(pattern) {
|
|
3001
|
+
if (pattern.length === 0 || pattern.length > 2048) return false;
|
|
3002
|
+
if (/[\r\n]/.test(pattern)) return false;
|
|
3003
|
+
return true;
|
|
3004
|
+
}
|
|
3084
3005
|
async function fileExists(p) {
|
|
3085
3006
|
try {
|
|
3086
3007
|
await access(p);
|
|
@@ -3110,16 +3031,23 @@ function registerPluginsRoutes(app) {
|
|
|
3110
3031
|
if (!isValidNewPluginId(id)) {
|
|
3111
3032
|
return c.json({ error: "id 须为字母开头,仅含字母数字、下划线、连字符;不能为 generic 或 new" }, 400);
|
|
3112
3033
|
}
|
|
3034
|
+
const listUrlPatternRaw = typeof body.listUrlPattern === "string" ? body.listUrlPattern.trim() : "";
|
|
3035
|
+
if (!listUrlPatternRaw) {
|
|
3036
|
+
return c.json({ error: "缺少支持的站点(listUrlPattern),例如 https://example.com/*" }, 400);
|
|
3037
|
+
}
|
|
3038
|
+
if (!isValidNewListUrlPattern(listUrlPatternRaw)) {
|
|
3039
|
+
return c.json({ error: "支持的站点须为非空字符串,不超过 2048 字符,且不能含换行" }, 400);
|
|
3040
|
+
}
|
|
3113
3041
|
await mkdir(USER_PLUGINS_DIR, { recursive: true });
|
|
3114
|
-
|
|
3115
|
-
const outPath = join(USER_SOURCES_DIR, `${id}.rssany.ts`);
|
|
3042
|
+
const outPath = join(USER_PLUGINS_DIR, `${id}.rssany.js`);
|
|
3116
3043
|
if (await fileExists(outPath)) return c.json({ error: "该 id 已存在同名文件" }, 409);
|
|
3117
3044
|
let tpl = SITE_TEMPLATE_FALLBACK;
|
|
3118
3045
|
try {
|
|
3119
|
-
tpl = await readFile(
|
|
3046
|
+
tpl = await readFile(PLUGIN_SITE_TEMPLATE_PATH, "utf-8");
|
|
3120
3047
|
} catch {
|
|
3121
3048
|
}
|
|
3122
|
-
const
|
|
3049
|
+
const patternLiteral = JSON.stringify(listUrlPatternRaw);
|
|
3050
|
+
const content = tpl.replace(/__PLUGIN_ID__/g, id).replace(/__LIST_URL_PATTERN__/g, patternLiteral);
|
|
3123
3051
|
if (!isAllowedPluginPath(outPath)) return c.json({ error: "路径不允许" }, 403);
|
|
3124
3052
|
try {
|
|
3125
3053
|
await writeFile(outPath, content, "utf-8");
|
|
@@ -3134,7 +3062,6 @@ function registerPluginsRoutes(app) {
|
|
|
3134
3062
|
kind: "site",
|
|
3135
3063
|
id: s.id,
|
|
3136
3064
|
listUrlPattern: typeof s.listUrlPattern === "string" ? s.listUrlPattern : String(s.listUrlPattern),
|
|
3137
|
-
hasEnrich: !!s.enrichItem,
|
|
3138
3065
|
hasAuth: !!(s.checkAuth && s.loginUrl)
|
|
3139
3066
|
}));
|
|
3140
3067
|
const siteIds = new Set(sites.map((p) => p.id));
|
|
@@ -3142,7 +3069,6 @@ function registerPluginsRoutes(app) {
|
|
|
3142
3069
|
kind: "source",
|
|
3143
3070
|
id: src.id,
|
|
3144
3071
|
listUrlPattern: typeof src.pattern === "string" ? src.pattern : String(src.pattern),
|
|
3145
|
-
hasEnrich: !!src.enrichItem,
|
|
3146
3072
|
hasAuth: false
|
|
3147
3073
|
}));
|
|
3148
3074
|
return c.json([...sites, ...sources]);
|
|
@@ -3303,6 +3229,12 @@ function registerItemsRoutes(app) {
|
|
|
3303
3229
|
return c.json({ ok: false, message: err instanceof Error ? err.message : String(err) }, 400);
|
|
3304
3230
|
}
|
|
3305
3231
|
});
|
|
3232
|
+
app.delete("/api/items/by-source", requireAdmin(), async (c) => {
|
|
3233
|
+
const sourceUrl = (c.req.query("source_url") ?? "").trim();
|
|
3234
|
+
if (!sourceUrl) return c.json({ ok: false, message: "source_url 不能为空" }, 400);
|
|
3235
|
+
const deleted = await deleteItemsBySourceUrl(sourceUrl);
|
|
3236
|
+
return c.json({ ok: true, deleted });
|
|
3237
|
+
});
|
|
3306
3238
|
app.delete("/api/items/:id", async (c) => {
|
|
3307
3239
|
const id = decodeURIComponent(c.req.param("id") ?? "").trim();
|
|
3308
3240
|
if (!id) return c.json({ ok: false, message: "id 不能为空" }, 400);
|
|
@@ -3310,12 +3242,6 @@ function registerItemsRoutes(app) {
|
|
|
3310
3242
|
if (!deleted) return c.json({ ok: false, message: "条目不存在或已删除" }, 404);
|
|
3311
3243
|
return c.json({ ok: true });
|
|
3312
3244
|
});
|
|
3313
|
-
app.delete("/api/items/by-source", requireAdmin(), async (c) => {
|
|
3314
|
-
const sourceUrl = (c.req.query("source_url") ?? "").trim();
|
|
3315
|
-
if (!sourceUrl) return c.json({ ok: false, message: "source_url 不能为空" }, 400);
|
|
3316
|
-
const deleted = await deleteItemsBySourceUrl(sourceUrl);
|
|
3317
|
-
return c.json({ ok: true, deleted });
|
|
3318
|
-
});
|
|
3319
3245
|
app.get("/api/items", async (c) => {
|
|
3320
3246
|
const ref = c.req.query("ref") ?? c.req.query("source") ?? void 0;
|
|
3321
3247
|
const subscribed = parseSubscribedFlag$1(c.req.query("subscribed"));
|
|
@@ -3463,7 +3389,7 @@ function registerSourcesRoutes(app) {
|
|
|
3463
3389
|
const w = s.weight;
|
|
3464
3390
|
const weight = typeof w === "number" ? w : void 0;
|
|
3465
3391
|
return {
|
|
3466
|
-
ref: String(s.ref),
|
|
3392
|
+
ref: canonicalHttpSourceRef(String(s.ref)),
|
|
3467
3393
|
type,
|
|
3468
3394
|
label: s.label,
|
|
3469
3395
|
description: s.description,
|
|
@@ -3537,15 +3463,16 @@ function registerTopicsRoutes(app) {
|
|
|
3537
3463
|
}
|
|
3538
3464
|
function registerDeliverRoutes(app) {
|
|
3539
3465
|
app.get("/api/deliver", requireAdmin(), async (c) => {
|
|
3540
|
-
const url = await
|
|
3541
|
-
return c.json({ url });
|
|
3466
|
+
const { url, token } = await getDeliverConfig();
|
|
3467
|
+
return c.json({ url, token });
|
|
3542
3468
|
});
|
|
3543
3469
|
app.put("/api/deliver", requireAdmin(), async (c) => {
|
|
3544
3470
|
try {
|
|
3545
3471
|
const body = await c.req.json();
|
|
3546
3472
|
const url = typeof body?.url === "string" ? body.url.trim() : "";
|
|
3547
|
-
|
|
3548
|
-
|
|
3473
|
+
const token = typeof body?.token === "string" ? body.token.trim() : "";
|
|
3474
|
+
await saveDeliverConfig({ url, token });
|
|
3475
|
+
return c.json({ ok: true, url, token });
|
|
3549
3476
|
} catch (err) {
|
|
3550
3477
|
return c.json({ ok: false, message: err instanceof Error ? err.message : String(err) }, 400);
|
|
3551
3478
|
}
|
|
@@ -3554,6 +3481,7 @@ function registerDeliverRoutes(app) {
|
|
|
3554
3481
|
try {
|
|
3555
3482
|
const body = await c.req.json();
|
|
3556
3483
|
const url = typeof body?.url === "string" ? body.url.trim() : "";
|
|
3484
|
+
const token = typeof body?.token === "string" ? body.token.trim() : "";
|
|
3557
3485
|
if (!url) return c.json({ ok: false, message: "url 不能为空" }, 400);
|
|
3558
3486
|
const sample = {
|
|
3559
3487
|
guid: "deliver-test-" + Date.now(),
|
|
@@ -3562,22 +3490,150 @@ function registerDeliverRoutes(app) {
|
|
|
3562
3490
|
pubDate: (/* @__PURE__ */ new Date()).toISOString(),
|
|
3563
3491
|
summary: "若下游收到此条,说明投递 URL 可用。"
|
|
3564
3492
|
};
|
|
3565
|
-
await postDeliverItems(
|
|
3566
|
-
|
|
3567
|
-
|
|
3568
|
-
|
|
3569
|
-
|
|
3570
|
-
|
|
3571
|
-
|
|
3572
|
-
|
|
3573
|
-
|
|
3574
|
-
|
|
3493
|
+
await postDeliverItems(
|
|
3494
|
+
url,
|
|
3495
|
+
"rssany-deliver-test",
|
|
3496
|
+
[
|
|
3497
|
+
{
|
|
3498
|
+
guid: sample.guid,
|
|
3499
|
+
title: sample.title,
|
|
3500
|
+
link: sample.link,
|
|
3501
|
+
pubDate: new Date(sample.pubDate),
|
|
3502
|
+
summary: sample.summary,
|
|
3503
|
+
sourceRef: "rssany-deliver-test"
|
|
3504
|
+
}
|
|
3505
|
+
],
|
|
3506
|
+
{ bearerToken: token || void 0 }
|
|
3507
|
+
);
|
|
3575
3508
|
return c.json({ ok: true });
|
|
3576
3509
|
} catch (err) {
|
|
3577
3510
|
return c.json({ ok: false, message: err instanceof Error ? err.message : String(err) }, 400);
|
|
3578
3511
|
}
|
|
3579
3512
|
});
|
|
3580
3513
|
}
|
|
3514
|
+
function trimOrUndef(s) {
|
|
3515
|
+
if (typeof s !== "string") return void 0;
|
|
3516
|
+
const t = s.trim();
|
|
3517
|
+
return t.length > 0 ? t : void 0;
|
|
3518
|
+
}
|
|
3519
|
+
async function readLlmFileConfig() {
|
|
3520
|
+
try {
|
|
3521
|
+
const raw = await readFile(CONFIG_PATH, "utf-8");
|
|
3522
|
+
const j = JSON.parse(raw);
|
|
3523
|
+
const llm = j?.llm;
|
|
3524
|
+
if (!llm || typeof llm !== "object") return {};
|
|
3525
|
+
const o = llm;
|
|
3526
|
+
return {
|
|
3527
|
+
apiKey: typeof o.apiKey === "string" ? o.apiKey : void 0,
|
|
3528
|
+
baseUrl: trimOrUndef(o.baseUrl),
|
|
3529
|
+
model: trimOrUndef(o.model)
|
|
3530
|
+
};
|
|
3531
|
+
} catch {
|
|
3532
|
+
return {};
|
|
3533
|
+
}
|
|
3534
|
+
}
|
|
3535
|
+
/**
 * Persist LLM settings into the `llm` section of the JSON config file at
 * CONFIG_PATH, keeping every other top-level key of the file intact, then
 * invalidate the cached LLM configuration.
 *
 * The API key is only replaced when a non-empty `input.apiKey` is supplied;
 * otherwise the key already stored in the file (if any) is preserved.
 *
 * @param {{baseUrl: string, model: string, apiKey?: string}} input - New settings.
 * @returns {Promise<void>}
 * @throws When the config file cannot be written.
 */
async function saveLlmSettings(input) {
  let root = {};
  try {
    const parsed = JSON.parse(await readFile(CONFIG_PATH, "utf-8"));
    // Only a plain object can carry an `llm` property. A file containing
    // `null`, an array or a primitive is treated like an empty config instead
    // of crashing on `root.llm = ...` or serializing a non-object root.
    if (parsed !== null && typeof parsed === "object" && !Array.isArray(parsed)) {
      root = parsed;
    }
  } catch {
    // Best effort: a missing or unparsable file starts from an empty config.
  }

  // Derive the stored key from the document we just read, so the decision and
  // the write are based on the same snapshot of the file.
  const previous = root.llm;
  const previousKey =
    previous !== null && typeof previous === "object" && typeof previous.apiKey === "string"
      ? previous.apiKey
      : undefined;

  const next = {
    baseUrl: String(input.baseUrl ?? "").trim(),
    model: String(input.model ?? "").trim(),
  };
  const newKey = typeof input.apiKey === "string" && input.apiKey.length > 0 ? input.apiKey : undefined;
  if (newKey) {
    next.apiKey = newKey;
  } else if (previousKey) {
    next.apiKey = previousKey;
  }

  root.llm = next;
  await writeFile(CONFIG_PATH, `${JSON.stringify(root, null, 2)}\n`, "utf-8");
  invalidateLLMConfigCache();
}
|
|
3557
|
+
/**
 * Register the admin-only LLM settings endpoints on `app`:
 *
 * - `GET  /api/llm`      – report the resolved base URL / model and whether an
 *                          API key is available (and whether it is in the file).
 * - `PUT  /api/llm`      – save base URL / model / optional API key, then
 *                          report the same status with `ok: true`.
 * - `POST /api/llm/test` – send a one-line prompt through chatText to check
 *                          that the configured LLM answers.
 *
 * The API key itself is never returned to the client, only boolean flags.
 *
 * @param app - Router exposing `get` / `put` / `post`.
 */
function registerLlmRoutes(app) {
  const describeError = (err) => (err instanceof Error ? err.message : String(err));

  // Status payload shared by the GET and PUT handlers.
  const readStatus = async () => {
    const resolved = getLLMConfig();
    const file = await readLlmFileConfig();
    return {
      baseUrl: resolved.baseUrl,
      model: resolved.model,
      hasApiKey: !!resolved.apiKey,
      apiKeyInFile: !!(file.apiKey && file.apiKey.length > 0),
    };
  };

  app.get("/api/llm", requireAdmin(), async (c) => c.json(await readStatus()));

  app.put("/api/llm", requireAdmin(), async (c) => {
    try {
      const body = await c.req.json();
      const settings = {
        baseUrl: typeof body.baseUrl === "string" ? body.baseUrl : "",
        model: typeof body.model === "string" ? body.model : "",
      };
      // Only forward the key when the client actually sent a string.
      if (typeof body.apiKey === "string") {
        settings.apiKey = body.apiKey;
      }
      await saveLlmSettings(settings);
      return c.json({ ok: true, ...(await readStatus()) });
    } catch (err) {
      return c.json({ ok: false, message: describeError(err) }, 400);
    }
  });

  app.post("/api/llm/test", requireAdmin(), async (c) => {
    const startedAt = Date.now();
    try {
      if (!getLLMConfig().apiKey) {
        return c.json({ ok: false, message: "未配置 API Key(请在界面或 OPENAI_API_KEY 中设置)" }, 400);
      }
      const reply = await chatText("Reply with exactly the single word: ok", undefined, {
        maxTokens: 32768,
        debugLabel: "llmSettingsTest",
      });
      return c.json({ ok: true, reply });
    } catch (err) {
      const ms = Date.now() - startedAt;
      const message = describeError(err);
      console.error("[llm/test] fail", { ms, message });
      return c.json({ ok: false, message }, 400);
    }
  });
}
|
|
3617
|
+
/**
 * Register the admin-only global proxy endpoints on `app`:
 *
 * - `GET /api/proxy` – return the global proxy stored in the config
 *                      (empty string when none is set).
 * - `PUT /api/proxy` – store `body.globalProxy` (non-strings and unparsable
 *                      bodies are saved as an empty string) and echo back the
 *                      value re-read from the config.
 *
 * @param app - Router exposing `get` / `put`.
 */
function registerProxySettingsRoutes(app) {
  const currentProxy = async () => (await readGlobalProxyFromConfig()) ?? "";

  app.get("/api/proxy", requireAdmin(), async (c) => {
    return c.json({ globalProxy: await currentProxy() });
  });

  app.put("/api/proxy", requireAdmin(), async (c) => {
    try {
      const body = await c.req.json().catch(() => ({}));
      const requested = typeof body.globalProxy === "string" ? body.globalProxy : "";
      await saveGlobalProxyToConfig(requested);
      return c.json({ ok: true, globalProxy: await currentProxy() });
    } catch (err) {
      const message = err instanceof Error ? err.message : String(err);
      return c.json({ ok: false, message }, 400);
    }
  });
}
|
|
3581
3637
|
const tasks = /* @__PURE__ */ new Map();
|
|
3582
3638
|
let idCounter = 0;
|
|
3583
3639
|
function nextId() {
|
|
@@ -3651,10 +3707,311 @@ function registerTasksRoutes(app) {
|
|
|
3651
3707
|
}
|
|
3652
3708
|
});
|
|
3653
3709
|
}
|
|
3710
|
+
// --- Feed favicon cache settings -------------------------------------------
// Sub-directory (joined under CACHE_DIR by the code below) holding cached icons.
const CACHE_SUBDIR = "feed-favicons";
// Prefix mixed into the hashed cache key; the "v1" segment versions the key.
const CACHE_KEY_PREFIX = "feed-favicon:v1:";
// Cached icons are kept for three days, on disk and in the HTTP Cache-Control header.
const CACHE_MAX_AGE_SEC = 3 * 86400;
const CACHE_MAX_AGE_MS = CACHE_MAX_AGE_SEC * 1000;
const CACHE_CONTROL = `public, max-age=${CACHE_MAX_AGE_SEC}`;
// Timeout applied to each upstream HTTP request.
const FETCH_TIMEOUT_MS = 6000;
// Size limits: 2 MiB per icon, 512 KiB of homepage HTML.
const MAX_ICON_BYTES = 2 * 1024 ** 2;
const MAX_HTML_BYTES = 1024 * 512;
// Map keyed by domain, used to share one in-flight lookup per domain.
const inflightByDomain = /* @__PURE__ */ new Map();
|
|
3719
|
+
// Maximum length of a full DNS name in its textual form.
const MAX_DOMAIN_LEN = 253;

/**
 * Cheap syntactic check that `s` looks like a DNS hostname before it is used
 * to build upstream URLs and cache keys.
 *
 * Rules: total length 1..253; dot-separated labels of 1..63 characters, each
 * made of ASCII letters, digits and `-`, and neither starting nor ending with
 * `-`. Empty labels ("a..b", leading or trailing dot) are rejected.
 *
 * NOTE(review): dotted-decimal IPv4 literals and names such as "localhost"
 * still pass this check, exactly as before; callers that must not reach
 * internal hosts need an additional filter.
 *
 * @param {string} s - Candidate hostname.
 * @returns {boolean} `true` when `s` is syntactically acceptable.
 */
function isPlausibleHostname(s) {
  if (typeof s !== "string" || s.length === 0 || s.length > MAX_DOMAIN_LEN) {
    return false;
  }
  const label = /^[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?$/i;
  return s.split(".").every((part) => label.test(part));
}
|
|
3724
|
+
/**
 * Compute the on-disk cache location for a domain's favicon.
 *
 * The file name is the hex SHA-256 of CACHE_KEY_PREFIX followed by the
 * lower-cased domain, so the domain text itself never appears in the path.
 *
 * @param {string} domainKey - Domain name (case-insensitive).
 * @returns {string} Path under `CACHE_DIR/CACHE_SUBDIR`.
 */
function cacheFilePath(domainKey) {
  const key = `${CACHE_KEY_PREFIX}${domainKey.toLowerCase()}`;
  const fileName = createHash("sha256").update(key).digest("hex");
  return join(CACHE_DIR, CACHE_SUBDIR, fileName);
}
|
|
3728
|
+
/**
 * List the conventional favicon locations on the site itself.
 *
 * Both the given host and its `www.` counterpart are covered (the bare host
 * when `domain` already starts with `www.`, the `www.` host otherwise), each
 * combined with `/favicon.ico`, `/favicon.png` and `/apple-touch-icon.png`.
 *
 * @param {string} domain - Host name (case-insensitive).
 * @returns {string[]} HTTPS URLs, given host first.
 */
function originFaviconUrls(domain) {
  const host = domain.toLowerCase();
  const alternate = host.startsWith("www.") ? host.slice(4) : `www.${host}`;

  const origins = new Set([`https://${host}`]);
  if (alternate) {
    origins.add(`https://${alternate}`);
  }

  const iconPaths = ["/favicon.ico", "/favicon.png", "/apple-touch-icon.png"];
  return [...origins].flatMap((origin) => iconPaths.map((path) => `${origin}${path}`));
}
|
|
3746
|
+
/**
 * List the homepage URLs to probe for `<link rel="icon">` tags.
 *
 * @param {string} domain - Host name (case-insensitive).
 * @returns {string[]} `https://<host>/` followed by its `www.` / bare
 *   counterpart, without duplicates.
 */
function homepageUrlsForDomain(domain) {
  const host = domain.toLowerCase();
  const alternate = host.startsWith("www.") ? host.slice(4) : `www.${host}`;

  const pages = new Set([`https://${host}/`]);
  if (alternate) {
    pages.add(`https://${alternate}/`);
  }
  return Array.from(pages);
}
|
|
3757
|
+
/**
 * Decide whether a `<link rel="...">` value designates a site icon.
 *
 * The value is split into whitespace-separated, case-insensitive tokens; it
 * qualifies when any token is `icon`, `mask-icon`, `apple-touch-icon` or
 * `apple-touch-icon-precomposed` (so `shortcut icon` qualifies through `icon`).
 *
 * @param {string} rel - Raw `rel` attribute value.
 * @returns {boolean}
 */
function isIconLinkRel(rel) {
  const iconTokens = new Set(["icon", "mask-icon", "apple-touch-icon", "apple-touch-icon-precomposed"]);
  for (const token of rel.toLowerCase().trim().split(/\s+/)) {
    if (token && iconTokens.has(token)) {
      return true;
    }
  }
  return false;
}
|
|
3764
|
+
/**
 * Extract absolute icon URLs from the `<link>` elements of an HTML page.
 *
 * Relative hrefs are resolved against the page's `<base href>` when present
 * and valid, otherwise against `pageUrl`. `data:` / `blob:` hrefs, hrefs that
 * cannot be parsed and anything that does not resolve to http(s) are skipped.
 *
 * @param {string} html - Page markup.
 * @param {string} pageUrl - URL the markup was fetched from.
 * @returns {string[]} Unique absolute URLs in document order.
 */
function parseLinkIconHrefs(html, pageUrl) {
  const root = parse(html, { lowerCaseTagName: true });

  // Honour <base href> when it resolves; otherwise keep the page URL.
  let resolveAgainst = pageUrl;
  const declaredBase = root.querySelector("base[href]")?.getAttribute("href")?.trim();
  if (declaredBase) {
    try {
      resolveAgainst = new URL(declaredBase, pageUrl).href;
    } catch {
    }
  }

  // A Set keeps insertion order, giving de-duplication and ordering in one go.
  const found = new Set();
  for (const link of root.querySelectorAll("link[href]")) {
    if (!isIconLinkRel(link.getAttribute("rel") ?? "")) continue;

    const href = link.getAttribute("href")?.trim();
    if (!href) continue;
    if (href.startsWith("data:") || href.startsWith("blob:")) continue;

    let resolved;
    try {
      resolved = new URL(href, resolveAgainst);
    } catch {
      continue;
    }
    if (resolved.protocol === "http:" || resolved.protocol === "https:") {
      found.add(resolved.href);
    }
  }
  return [...found];
}
|
|
3795
|
+
/**
 * Download the beginning of an HTML page for icon discovery.
 *
 * The response body is read as a stream and the read stops as soon as
 * MAX_HTML_BYTES have been received, so a huge or endless response cannot be
 * buffered in full. The result is then cut to exactly MAX_HTML_BYTES bytes
 * and decoded as UTF-8.
 *
 * @param {string} url - Page URL.
 * @returns {Promise<string|null>} Decoded (possibly truncated) markup, or
 *   `null` on a network error, timeout or non-2xx status.
 */
async function fetchHtmlPage(url) {
  try {
    const upstream = await fetch(url, {
      redirect: "follow",
      headers: {
        Accept: "text/html,application/xhtml+xml;q=0.9,*/*;q=0.1",
        "User-Agent": "Mozilla/5.0 (compatible; RssAny/1.0; +https://github.com/rssany/rssany) favicon"
      },
      signal: AbortSignal.timeout(FETCH_TIMEOUT_MS)
    });
    if (!upstream.ok) return null;

    const reader = upstream.body?.getReader();
    if (!reader) return "";

    const chunks = [];
    let received = 0;
    try {
      while (received < MAX_HTML_BYTES) {
        const { done, value } = await reader.read();
        if (done) break;
        chunks.push(value);
        received += value.byteLength;
      }
    } finally {
      // Stop the transfer once enough has been read (no-op when already done).
      await reader.cancel().catch(() => {});
    }
    return Buffer.concat(chunks).subarray(0, MAX_HTML_BYTES).toString("utf-8");
  } catch {
    return null;
  }
}
|
|
3814
|
+
/**
 * Find icon URLs declared in the site's homepage markup.
 *
 * Candidate homepages are tried one after another; the icons of the first
 * page that declares any are returned. Discovery is skipped entirely when the
 * FAVICON_SKIP_HTML environment variable is `1` or `true`.
 *
 * @param {string} domain - Host name.
 * @returns {Promise<string[]>} Absolute icon URLs, possibly empty.
 */
async function discoverIconUrlsFromHomepage(domain) {
  if (["1", "true"].includes(process.env.FAVICON_SKIP_HTML)) {
    return [];
  }
  for (const pageUrl of homepageUrlsForDomain(domain)) {
    const html = await fetchHtmlPage(pageUrl);
    if (html) {
      const iconUrls = parseLinkIconHrefs(html, pageUrl);
      if (iconUrls.length > 0) {
        return iconUrls;
      }
    }
  }
  return [];
}
|
|
3826
|
+
/**
 * Build the DuckDuckGo icon-service URL for a domain.
 *
 * The domain is percent-encoded like in the other third-party URL builders,
 * so an unexpected character can never alter the path of the request. Plain
 * hostnames are unaffected by the encoding.
 *
 * @param {string} domain - Host name.
 * @returns {string} URL of the `.ico` served by icons.duckduckgo.com.
 */
function duckduckgoFaviconUrl(domain) {
  return `https://icons.duckduckgo.com/ip3/${encodeURIComponent(domain)}.ico`;
}
|
|
3829
|
+
/**
 * Build the icon.horse lookup URL for a domain.
 *
 * @param {string} domain - Host name; percent-encoded into the path.
 * @returns {string}
 */
function iconHorseUrl(domain) {
  const encodedDomain = encodeURIComponent(domain);
  return `https://icon.horse/icon/${encodedDomain}`;
}
|
|
3832
|
+
/**
 * Build the unavatar.io lookup URL for a domain.
 *
 * @param {string} domain - Host name; percent-encoded into the path.
 * @returns {string}
 */
function unavatarUrl(domain) {
  const encodedDomain = encodeURIComponent(domain);
  return `https://unavatar.io/${encodedDomain}`;
}
|
|
3835
|
+
/**
 * Build the Google s2 favicon-service URL for a domain (64 px icon).
 *
 * @param {string} domain - Host name; percent-encoded into the query string.
 * @returns {string}
 */
function googleFaviconUrl(domain) {
  const encodedDomain = encodeURIComponent(domain);
  return `https://www.google.com/s2/favicons?domain=${encodedDomain}&sz=64`;
}
|
|
3838
|
+
/**
 * Pick the character shown on the generated letter avatar.
 *
 * @param {string} domain - Host name.
 * @returns {string} Upper-cased first ASCII letter or digit of the domain
 *   (ignoring a leading `www.`), or `"?"` when there is none.
 */
function letterCharFromDomain(domain) {
  let host = domain.toLowerCase();
  if (host.startsWith("www.")) {
    host = host.slice(4);
  }
  for (const ch of host) {
    if (/[a-z0-9]/.test(ch)) {
      return ch.toUpperCase();
    }
  }
  return "?";
}
|
|
3843
|
+
/**
 * Derive a stable colour hue for a domain.
 *
 * @param {string} domain - Host name (case-insensitive).
 * @returns {number} Integer in 0..359 taken from the first two bytes
 *   (big-endian) of the SHA-256 of the lower-cased domain.
 */
function hueFromDomain(domain) {
  const digest = createHash("sha256").update(domain.toLowerCase()).digest();
  return digest.readUInt16BE(0) % 360;
}
|
|
3847
|
+
/**
 * Escape a string for inclusion in XML/SVG text content or a double-quoted
 * attribute value.
 *
 * `&` is replaced first so that the entities produced for `<`, `>` and `"`
 * are not escaped a second time.
 *
 * @param {string} s - Raw text.
 * @returns {string} Text with `&`, `<`, `>` and `"` replaced by entities.
 */
function escapeXmlText(s) {
  return s
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");
}
|
|
3850
|
+
/**
 * Render the fallback avatar: a 64x64 rounded square in a domain-derived
 * colour with the domain's initial centred on it.
 *
 * @param {string} domain - Host name.
 * @returns {Buffer} UTF-8 encoded SVG document.
 */
function letterAvatarSvg(domain) {
  const initial = escapeXmlText(letterCharFromDomain(domain));
  const background = `hsl(${hueFromDomain(domain)} 42% 44%)`;
  const lines = [
    `<?xml version="1.0" encoding="UTF-8"?>`,
    `<svg xmlns="http://www.w3.org/2000/svg" width="64" height="64" viewBox="0 0 64 64">`,
    `<rect width="64" height="64" rx="12" fill="${background}"/>`,
    `<text x="32" y="32" dominant-baseline="central" text-anchor="middle" fill="#ffffff" font-family="system-ui,Segoe UI,Helvetica,sans-serif" font-size="28" font-weight="600">${initial}</text>`,
    `</svg>`,
  ];
  return Buffer.from(lines.join("\n").trim(), "utf-8");
}
|
|
3861
|
+
/**
 * Wrap the generated letter avatar in the `{ buf, mime }` shape used for
 * fetched icons.
 *
 * @param {string} domain - Host name.
 * @returns {{buf: Buffer, mime: string}} SVG bytes tagged `image/svg+xml`.
 */
function letterAvatarForDomain(domain) {
  const buf = letterAvatarSvg(domain);
  return { buf, mime: "image/svg+xml" };
}
|
|
3864
|
+
/**
 * Tell whether a caught value is a "file does not exist" error.
 *
 * @param {unknown} e - Value caught from a file-system call.
 * @returns {boolean} `true` for a non-null object whose `code` is `"ENOENT"`.
 */
function isEnoent(e) {
  if (e === null || typeof e !== "object") {
    return false;
  }
  return e.code === "ENOENT";
}
|
|
3867
|
+
/**
 * Guess an image MIME type from the leading bytes of a buffer.
 *
 * Recognised: PNG, GIF, JPEG, WebP (RIFF....WEBP), ICO/CUR and SVG. SVG is
 * accepted when the text (after leading whitespace) starts with `<svg`, or
 * starts with an XML declaration AND an `<svg` element appears within the
 * first 1 KiB. An XML prologue alone is not enough: RSS feeds and XHTML pages
 * start with `<?xml` too and must not be mistaken for images.
 *
 * @param {Buffer} buf - File contents.
 * @returns {string|null} MIME type, or `null` when nothing matches.
 */
function sniffImageMime(buf) {
  if (buf.length < 4) return null;

  // PNG: 0x89 'P' 'N' 'G'
  if (buf[0] === 0x89 && buf[1] === 0x50 && buf[2] === 0x4e && buf[3] === 0x47) return "image/png";
  // GIF: 'G' 'I' 'F'
  if (buf.length >= 6 && buf[0] === 0x47 && buf[1] === 0x49 && buf[2] === 0x46) return "image/gif";
  // JPEG: FF D8 FF
  if (buf[0] === 0xff && buf[1] === 0xd8 && buf[2] === 0xff) return "image/jpeg";
  // WebP: "RIFF" <size> "WEBP"
  if (
    buf.length >= 12 &&
    buf.subarray(0, 4).toString("ascii") === "RIFF" &&
    buf.subarray(8, 12).toString("ascii") === "WEBP"
  ) {
    return "image/webp";
  }
  // ICO (type 1) / CUR (type 2): reserved word 0, then a little-endian type.
  if (buf.length >= 6 && buf.readUInt16LE(0) === 0 && (buf[2] === 1 || buf[2] === 2) && buf[3] === 0) {
    return "image/x-icon";
  }

  const head = buf.subarray(0, Math.min(1024, buf.length)).toString("utf-8").trimStart();
  if (head.startsWith("<svg")) return "image/svg+xml";
  if (head.startsWith("<?xml") && /<svg[\s>]/i.test(head)) return "image/svg+xml";
  return null;
}
|
|
3882
|
+
// Every accepted icon MIME type starts with this prefix.
const IMAGE_CT_PREFIX = "image/";

/**
 * Extract an image MIME type from a Content-Type header value.
 *
 * @param {string|null|undefined} ct - Header value, e.g. `image/png; charset=binary`.
 * @returns {string|null} Lower-cased media type without parameters when it is
 *   an `image/*` type, otherwise `null`.
 */
function mimeFromFetch(ct) {
  if (!ct) return null;
  const [mediaType] = ct.split(";");
  const normalized = mediaType.trim().toLowerCase();
  if (!normalized.startsWith(IMAGE_CT_PREFIX)) {
    return null;
  }
  return normalized;
}
|
|
3888
|
+
/**
 * Determine the MIME type of downloaded icon bytes.
 *
 * The content itself is trusted first (sniffImageMime); the server-declared
 * Content-Type is only used when sniffing finds nothing.
 *
 * @param {Buffer} buf - Icon bytes.
 * @param {string|null} ct - Content-Type header value, if any.
 * @returns {string|null} Image MIME type, or `null` when neither source yields one.
 */
function resolveImageMime(buf, ct) {
  const sniffed = sniffImageMime(buf);
  if (sniffed !== null && sniffed !== undefined) {
    return sniffed;
  }
  return mimeFromFetch(ct);
}
|
|
3891
|
+
/**
 * Download one candidate icon.
 *
 * The body is streamed and abandoned as soon as it exceeds MAX_ICON_BYTES, so
 * an oversized or endless response is never buffered in full. Any failure
 * while connecting or while reading the body (including the request timeout)
 * yields `null` rather than a rejected promise.
 *
 * @param {string} url - Candidate icon URL.
 * @returns {Promise<{buf: Buffer, ct: string|null}|null>} Bytes plus the
 *   Content-Type header, or `null` on error, non-2xx status, empty body or a
 *   body larger than MAX_ICON_BYTES.
 */
async function fetchIconCandidate(url) {
  try {
    const upstream = await fetch(url, {
      redirect: "follow",
      headers: {
        Accept: "image/avif,image/webp,image/apng,image/*,*/*;q=0.8",
        "User-Agent": "Mozilla/5.0 (compatible; RssAny/1.0; +https://github.com/rssany/rssany) favicon"
      },
      signal: AbortSignal.timeout(FETCH_TIMEOUT_MS)
    });
    if (!upstream.ok) return null;

    const reader = upstream.body?.getReader();
    if (!reader) return null;

    const chunks = [];
    let received = 0;
    for (;;) {
      const { done, value } = await reader.read();
      if (done) break;
      received += value.byteLength;
      if (received > MAX_ICON_BYTES) {
        // Too large to be a favicon: stop downloading right away.
        await reader.cancel().catch(() => {});
        return null;
      }
      chunks.push(value);
    }
    if (received === 0) return null;
    return { buf: Buffer.concat(chunks), ct: upstream.headers.get("content-type") };
  } catch {
    return null;
  }
}
|
|
3911
|
+
/**
 * Check that a download result is a usable image.
 *
 * @param {{buf: Buffer, ct: string|null}|null} got - Result of fetchIconCandidate.
 * @returns {boolean} `true` when `got` is present and its resolved MIME type
 *   starts with `image/`.
 */
function isValidIcon(got) {
  const mime = got ? resolveImageMime(got.buf, got.ct) : null;
  if (!mime) {
    return false;
  }
  return mime.startsWith(IMAGE_CT_PREFIX);
}
|
|
3916
|
+
/**
 * Assemble every URL worth trying for a domain's icon.
 *
 * Order: conventional paths on the site, icons declared in its HTML, then the
 * third-party services (DuckDuckGo, icon.horse, unavatar) unless
 * FAVICON_THIRD_PARTY is `0` or `false`, and finally Google's service when
 * FAVICON_INCLUDE_GOOGLE is `1` or `true`.
 *
 * @param {string} domain - Host name.
 * @param {string[]} htmlIconUrls - Icon URLs discovered in the homepage markup.
 * @returns {string[]} Candidate URLs.
 */
function upstreamFaviconUrls(domain, htmlIconUrls) {
  const envIs = (name, ...values) => values.includes(process.env[name]);

  const candidates = originFaviconUrls(domain).concat(htmlIconUrls);
  if (!envIs("FAVICON_THIRD_PARTY", "0", "false")) {
    candidates.push(duckduckgoFaviconUrl(domain), iconHorseUrl(domain), unavatarUrl(domain));
  }
  if (envIs("FAVICON_INCLUDE_GOOGLE", "1", "true")) {
    candidates.push(googleFaviconUrl(domain));
  }
  return candidates;
}
|
|
3926
|
+
/**
 * Resolve a domain's icon over the network.
 *
 * All candidate URLs are requested concurrently and the first one that yields
 * a valid image wins (Promise.any); when every candidate fails, the generated
 * letter avatar is returned instead, so this function always resolves.
 *
 * @param {string} domain - Host name.
 * @returns {Promise<{buf: Buffer, mime: string}>} Icon bytes and MIME type.
 */
async function fetchFaviconFromNetwork(domain) {
  const discovered = await discoverIconUrlsFromHomepage(domain);
  const candidates = upstreamFaviconUrls(domain, discovered);

  // Rejects for anything that is not a usable image so Promise.any skips it.
  const attempt = async (url) => {
    const got = await fetchIconCandidate(url);
    if (!isValidIcon(got)) {
      throw new Error("not-an-icon");
    }
    return { buf: got.buf, mime: resolveImageMime(got.buf, got.ct) };
  };

  try {
    return await Promise.any(candidates.map((url) => attempt(url)));
  } catch {
    return letterAvatarForDomain(domain);
  }
}
|
|
3943
|
+
/**
 * Fetch a domain's icon, sharing one network lookup between concurrent
 * callers asking for the same domain.
 *
 * The pending promise is stored in `inflightByDomain` and removed once it
 * settles (only if the map still holds that very promise).
 *
 * @param {string} domain - Host name, used verbatim as the map key.
 * @returns {Promise<{buf: Buffer, mime: string}>}
 */
function fetchFaviconDeduped(domain) {
  const pending = inflightByDomain.get(domain);
  if (pending) {
    return pending;
  }
  const lookup = fetchFaviconFromNetwork(domain).finally(() => {
    if (inflightByDomain.get(domain) === lookup) {
      inflightByDomain.delete(domain);
    }
  });
  inflightByDomain.set(domain, lookup);
  return lookup;
}
|
|
3952
|
+
/**
 * Register `GET /api/feed-favicon?domain=<host>`, which serves a site icon
 * for the given host.
 *
 * Flow:
 *  1. Reject missing or implausible host names with 400.
 *  2. Serve the cached file when it exists, is younger than CACHE_MAX_AGE_MS
 *     and still sniffs as an image; stale or unrecognisable files are deleted.
 *  3. Otherwise resolve the icon over the network (shared between concurrent
 *     requests), store it in the cache and serve it.
 *
 * Writing the cache is best effort: when it fails, the freshly fetched icon
 * is still served instead of answering 500.
 *
 * NOTE(review): no auth middleware is attached to this route here, and it
 * makes outbound requests to the caller-supplied host; consider rejecting
 * loopback/private addresses before fetching.
 *
 * @param app - Router exposing `get`.
 */
function registerFeedFaviconRoutes(app) {
  const emptyResponse = (status) => new Response(null, { status });
  const iconResponse = (bytes, contentType) =>
    new Response(new Uint8Array(bytes), {
      status: 200,
      headers: {
        "Content-Type": contentType,
        "Cache-Control": CACHE_CONTROL
      }
    });

  app.get("/api/feed-favicon", async (c) => {
    const raw = (c.req.query("domain") ?? "").trim();
    if (!raw || !isPlausibleHostname(raw)) {
      return emptyResponse(400);
    }
    const domain = raw.toLowerCase();
    const path = cacheFilePath(domain);

    // Expire the cached file by age; a missing file is simply a cache miss.
    let diskStale = false;
    try {
      const st = await stat(path);
      if (Date.now() - st.mtimeMs >= CACHE_MAX_AGE_MS) {
        diskStale = true;
        await unlink(path).catch(() => {
        });
      }
    } catch (e) {
      if (!isEnoent(e)) {
        return emptyResponse(500);
      }
    }

    if (!diskStale) {
      try {
        const cached = await readFile(path);
        // The cache stores raw bytes only, so the type is re-derived by sniffing.
        const cachedMime = resolveImageMime(cached, null);
        if (cachedMime) {
          return iconResponse(cached, cachedMime);
        }
        await unlink(path).catch(() => {
        });
      } catch (e) {
        if (!isEnoent(e)) {
          return emptyResponse(500);
        }
      }
    }

    const { buf, mime } = await fetchFaviconDeduped(domain);
    try {
      await mkdir(join(CACHE_DIR, CACHE_SUBDIR), { recursive: true });
      await writeFile(path, buf);
    } catch (err) {
      // A read-only or full cache directory must not hide a fetched icon.
      console.error("[feed-favicon] cache write failed", {
        domain,
        message: err instanceof Error ? err.message : String(err)
      });
    }
    return iconResponse(buf, mime);
  });
}
|
|
3654
4011
|
function registerApiRoutes(app) {
|
|
3655
4012
|
registerServerRoutes(app);
|
|
4013
|
+
registerFeedFaviconRoutes(app);
|
|
3656
4014
|
registerRssApiRoutes(app);
|
|
3657
|
-
registerEnrichRoutes(app);
|
|
3658
4015
|
registerSchedulerRoutes(app);
|
|
3659
4016
|
registerPluginsRoutes(app);
|
|
3660
4017
|
registerPipelineRoutes(app);
|
|
@@ -3665,6 +4022,8 @@ function registerApiRoutes(app) {
|
|
|
3665
4022
|
registerSourcesRoutes(app);
|
|
3666
4023
|
registerTopicsRoutes(app);
|
|
3667
4024
|
registerDeliverRoutes(app);
|
|
4025
|
+
registerLlmRoutes(app);
|
|
4026
|
+
registerProxySettingsRoutes(app);
|
|
3668
4027
|
registerTasksRoutes(app);
|
|
3669
4028
|
}
|
|
3670
4029
|
function registerAuthRoutes(app) {
|
|
@@ -3678,7 +4037,7 @@ function registerAuthRoutes(app) {
|
|
|
3678
4037
|
const authFlow = toAuthFlow(site);
|
|
3679
4038
|
if (!authFlow) return c.json({ ok: false, message: "该站点无需登录" }, 400);
|
|
3680
4039
|
try {
|
|
3681
|
-
const authenticated = await preCheckAuth(authFlow, CACHE_DIR);
|
|
4040
|
+
const authenticated = await preCheckAuth(authFlow, CACHE_DIR, { proxy: await resolveProxyForSite(site) });
|
|
3682
4041
|
return c.json({ ok: true, authenticated });
|
|
3683
4042
|
} catch (err) {
|
|
3684
4043
|
const msg = err instanceof Error ? err.message : String(err);
|
|
@@ -3695,12 +4054,23 @@ function registerAuthRoutes(app) {
|
|
|
3695
4054
|
const authFlow = toAuthFlow(site);
|
|
3696
4055
|
if (!authFlow) return c.json({ ok: false, message: "该站点无需登录" }, 400);
|
|
3697
4056
|
const { loginUrl } = authFlow;
|
|
3698
|
-
|
|
3699
|
-
|
|
3700
|
-
|
|
3701
|
-
|
|
3702
|
-
|
|
3703
|
-
|
|
4057
|
+
const proxy = await resolveProxyForSite(site);
|
|
4058
|
+
void launchBrowser({ headless: false, cacheDir: CACHE_DIR, proxy: resolveProxy({ proxy }) }).then(async (browser) => {
|
|
4059
|
+
try {
|
|
4060
|
+
const page = await browser.newPage();
|
|
4061
|
+
await applyProxyAuthToPage(page, { proxy });
|
|
4062
|
+
const realUserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36";
|
|
4063
|
+
await page.setUserAgent(realUserAgent);
|
|
4064
|
+
await page.setViewport({ width: 1366, height: 960 });
|
|
4065
|
+
await page.goto(loginUrl, { waitUntil: "domcontentloaded", timeout: 6e4 });
|
|
4066
|
+
page.once("close", () => {
|
|
4067
|
+
void browser.close().catch(() => {
|
|
4068
|
+
});
|
|
4069
|
+
});
|
|
4070
|
+
} catch {
|
|
4071
|
+
await browser.close().catch(() => {
|
|
4072
|
+
});
|
|
4073
|
+
}
|
|
3704
4074
|
}).catch(() => {
|
|
3705
4075
|
});
|
|
3706
4076
|
return c.json({ ok: true, message: "已打开登录页面" });
|
|
@@ -3721,7 +4091,7 @@ function registerAuthRoutes(app) {
|
|
|
3721
4091
|
}
|
|
3722
4092
|
const authFlow = toAuthFlow(site);
|
|
3723
4093
|
if (!authFlow) return c.json({ ok: false, message: "该站点无需登录" }, 400);
|
|
3724
|
-
ensureAuth(authFlow, CACHE_DIR).then(() => {
|
|
4094
|
+
ensureAuth(authFlow, CACHE_DIR, { proxy: await resolveProxyForSite(site) }).then(() => {
|
|
3725
4095
|
}).catch(() => {
|
|
3726
4096
|
});
|
|
3727
4097
|
return c.json({ ok: true, message: "已打开登录窗口,请在弹出的浏览器中完成登录,完成后刷新订阅页面即可。" });
|
|
@@ -3744,6 +4114,24 @@ async function readStaticHtml(name, fallback) {
|
|
|
3744
4114
|
/**
 * Escape a string for safe inclusion in HTML text or a quoted attribute.
 *
 * `&` is replaced first so that the entities produced for the other
 * characters are not escaped a second time.
 *
 * @param {string} s - Raw text.
 * @returns {string} Text with `&`, `<`, `>`, `"` and `'` replaced by entities.
 */
function escapeHtml(s) {
  return s
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}
|
|
4117
|
+
/**
 * Work out which proxy setting applies, for reporting purposes.
 *
 * Precedence: explicit override, then the source-derived proxy, then the
 * HTTP_PROXY and HTTPS_PROXY environment variables. Blank values are skipped.
 *
 * @param {string|undefined|null} override - Proxy requested for this call.
 * @param {string|undefined|null} mergedFromSource - Proxy resolved from the source.
 * @returns {string|undefined} Trimmed proxy value, or a falsy value when none applies.
 */
function effectiveProxyUsed(override, mergedFromSource) {
  for (const candidate of [override, mergedFromSource]) {
    const trimmed = candidate?.trim();
    if (trimmed) {
      return trimmed;
    }
  }
  return process.env.HTTP_PROXY?.trim() || process.env.HTTPS_PROXY?.trim();
}
|
|
4124
|
+
/**
 * Produce a version of a proxy URL that is safe to show or log.
 *
 * @param {string|undefined|null} p - Proxy URL, possibly with credentials.
 * @returns {string|null} The URL with any user name and password replaced by
 *   `***`, or `null` when `p` is empty or not a parsable URL.
 */
function redactProxyForLog(p) {
  if (!p) return null;

  let parsed;
  try {
    parsed = new URL(p);
  } catch {
    return null;
  }
  if (parsed.username) {
    parsed.username = "***";
  }
  if (parsed.password) {
    parsed.password = "***";
  }
  return parsed.toString();
}
|
|
3747
4135
|
function registerAdminRoutes(app) {
|
|
3748
4136
|
async function render401(listUrl) {
|
|
3749
4137
|
const raw = await readStaticHtml("401", '<!DOCTYPE html><html><head><meta charset="utf-8"><title>401</title></head><body><h1>401 需要登录</h1></body></html>');
|
|
@@ -3754,12 +4142,25 @@ function registerAdminRoutes(app) {
|
|
|
3754
4142
|
if (!url) return c.text("无效 URL,格式: /admin/parse/https://... 或 /admin/parse/example.com/...", 400);
|
|
3755
4143
|
try {
|
|
3756
4144
|
const headlessParam = c.req.query("headless");
|
|
3757
|
-
const headless = headlessParam === "
|
|
4145
|
+
const headless = headlessParam === "true" || headlessParam === "1";
|
|
4146
|
+
const proxyOverride = c.req.query("proxy")?.trim();
|
|
3758
4147
|
const source = getSource(url);
|
|
3759
|
-
const
|
|
4148
|
+
const fromSource = await getEffectiveProxyForListUrl(url, source);
|
|
4149
|
+
const ctx = buildSourceContext({
|
|
4150
|
+
cacheDir: CACHE_DIR,
|
|
4151
|
+
headless,
|
|
4152
|
+
proxy: proxyOverride || fromSource
|
|
4153
|
+
});
|
|
3760
4154
|
const items = await source.fetchItems(url, ctx);
|
|
3761
4155
|
const mode = source.id === "generic" ? "generic" : "plugin";
|
|
3762
|
-
|
|
4156
|
+
const effective = effectiveProxyUsed(proxyOverride, fromSource);
|
|
4157
|
+
return c.json({
|
|
4158
|
+
items,
|
|
4159
|
+
url,
|
|
4160
|
+
mode,
|
|
4161
|
+
pluginId: source.id,
|
|
4162
|
+
effectiveProxy: redactProxyForLog(effective)
|
|
4163
|
+
});
|
|
3763
4164
|
} catch (err) {
|
|
3764
4165
|
if (err instanceof AuthRequiredError) {
|
|
3765
4166
|
const html = await render401(url);
|
|
@@ -3774,28 +4175,20 @@ function registerAdminRoutes(app) {
|
|
|
3774
4175
|
if (!url) return c.text("无效 URL,格式: /admin/extractor/https://... 或 /admin/extractor/example.com/...", 400);
|
|
3775
4176
|
try {
|
|
3776
4177
|
const headlessParam = c.req.query("headless");
|
|
3777
|
-
const headless = headlessParam === "
|
|
3778
|
-
const
|
|
3779
|
-
|
|
3780
|
-
|
|
3781
|
-
|
|
3782
|
-
const enriched = await site.enrichItem(stub, siteCtx);
|
|
3783
|
-
return c.json({
|
|
3784
|
-
title: enriched.title ?? null,
|
|
3785
|
-
author: enriched.author ?? null,
|
|
3786
|
-
pubDate: enriched.pubDate instanceof Date ? enriched.pubDate.toISOString() : enriched.pubDate ?? null,
|
|
3787
|
-
content: enriched.content ?? null,
|
|
3788
|
-
_extractor: site.id
|
|
3789
|
-
});
|
|
3790
|
-
}
|
|
3791
|
-
const proxy = site?.proxy;
|
|
4178
|
+
const headless = headlessParam === "true" || headlessParam === "1";
|
|
4179
|
+
const proxyOverride = c.req.query("proxy")?.trim();
|
|
4180
|
+
const source = getSource(url);
|
|
4181
|
+
const fromSource = await getEffectiveProxyForListUrl(url, source);
|
|
4182
|
+
const proxy = proxyOverride || fromSource;
|
|
3792
4183
|
const result = await extractFromLink(url, {}, { timeoutMs: 6e4, headless, proxy });
|
|
4184
|
+
const effective = effectiveProxyUsed(proxyOverride, fromSource);
|
|
3793
4185
|
return c.json({
|
|
3794
4186
|
title: result.title ?? null,
|
|
3795
4187
|
author: result.author ?? null,
|
|
3796
4188
|
pubDate: result.pubDate ?? null,
|
|
3797
4189
|
content: result.content ?? null,
|
|
3798
|
-
_extractor: "readability"
|
|
4190
|
+
_extractor: "readability",
|
|
4191
|
+
effectiveProxy: redactProxyForLog(effective)
|
|
3799
4192
|
});
|
|
3800
4193
|
} catch (err) {
|
|
3801
4194
|
const msg = err instanceof Error ? err.message : String(err);
|