@cablate/banini-tracker 2.0.8 → 2.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +4 -1
- package/dist/db.js +15 -0
- package/dist/facebook.d.ts +6 -1
- package/dist/facebook.js +17 -5
- package/dist/index.js +84 -16
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -62,6 +62,8 @@ program
|
|
|
62
62
|
.description('抓取最新貼文(輸出 JSON 到 stdout)')
|
|
63
63
|
.option('-s, --source <source>', '來源:fb', 'fb')
|
|
64
64
|
.option('-n, --limit <n>', '每個來源抓幾篇', '3')
|
|
65
|
+
.option('--since <date>', '只抓此時間之後的貼文(YYYY-MM-DD 或 ISO 時間戳或相對時間如 "2 months")')
|
|
66
|
+
.option('--until <date>', '只抓此時間之前的貼文')
|
|
65
67
|
.option('--no-dedup', '不做去重,抓到什麼就輸出什麼')
|
|
66
68
|
.option('--mark-seen', '輸出後自動標記為已讀')
|
|
67
69
|
.action(async (opts) => {
|
|
@@ -69,7 +71,8 @@ program
|
|
|
69
71
|
const config = loadConfig();
|
|
70
72
|
const limit = parseInt(opts.limit, 10);
|
|
71
73
|
let posts = [];
|
|
72
|
-
const
|
|
74
|
+
const fetchOpts = (opts.since || opts.until) ? { since: opts.since, until: opts.until } : undefined;
|
|
75
|
+
const fp = await fetchFacebookPosts(config.targets.facebookPageUrl, config.apifyToken, limit, fetchOpts);
|
|
73
76
|
posts.push(...fp);
|
|
74
77
|
// 按時間從新到舊
|
|
75
78
|
posts.sort((a, b) => new Date(b.timestamp).getTime() - new Date(a.timestamp).getTime());
|
package/dist/db.js
CHANGED
|
@@ -19,6 +19,21 @@ export function getDb() {
|
|
|
19
19
|
}
|
|
20
20
|
function migrate(db) {
|
|
21
21
|
db.exec(`
|
|
22
|
+
CREATE TABLE IF NOT EXISTS posts (
|
|
23
|
+
id TEXT PRIMARY KEY,
|
|
24
|
+
source TEXT NOT NULL DEFAULT 'facebook',
|
|
25
|
+
text TEXT NOT NULL DEFAULT '',
|
|
26
|
+
ocr_text TEXT NOT NULL DEFAULT '',
|
|
27
|
+
transcript_text TEXT NOT NULL DEFAULT '',
|
|
28
|
+
media_type TEXT NOT NULL DEFAULT 'text',
|
|
29
|
+
media_url TEXT NOT NULL DEFAULT '',
|
|
30
|
+
url TEXT NOT NULL DEFAULT '',
|
|
31
|
+
like_count INTEGER NOT NULL DEFAULT 0,
|
|
32
|
+
comment_count INTEGER NOT NULL DEFAULT 0,
|
|
33
|
+
post_timestamp TEXT NOT NULL,
|
|
34
|
+
fetched_at TEXT NOT NULL
|
|
35
|
+
);
|
|
36
|
+
|
|
22
37
|
CREATE TABLE IF NOT EXISTS predictions (
|
|
23
38
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
24
39
|
post_id TEXT NOT NULL,
|
package/dist/facebook.d.ts
CHANGED
|
@@ -3,6 +3,7 @@ export interface FacebookPost {
|
|
|
3
3
|
source: 'facebook';
|
|
4
4
|
text: string;
|
|
5
5
|
ocrText: string;
|
|
6
|
+
captionText: string;
|
|
6
7
|
timestamp: string;
|
|
7
8
|
likeCount: number;
|
|
8
9
|
commentCount: number;
|
|
@@ -11,4 +12,8 @@ export interface FacebookPost {
|
|
|
11
12
|
mediaType: string;
|
|
12
13
|
mediaUrl: string;
|
|
13
14
|
}
|
|
14
|
-
export
|
|
15
|
+
export interface FetchOptions {
|
|
16
|
+
since?: string;
|
|
17
|
+
until?: string;
|
|
18
|
+
}
|
|
19
|
+
export declare function fetchFacebookPosts(pageUrl: string, token: string, maxPosts?: number, options?: FetchOptions): Promise<FacebookPost[]>;
|
package/dist/facebook.js
CHANGED
|
@@ -1,16 +1,22 @@
|
|
|
1
|
-
export async function fetchFacebookPosts(pageUrl, token, maxPosts = 3) {
|
|
1
|
+
export async function fetchFacebookPosts(pageUrl, token, maxPosts = 3, options) {
|
|
2
2
|
const actorId = 'apify~facebook-posts-scraper';
|
|
3
3
|
const url = `https://api.apify.com/v2/acts/${actorId}/run-sync-get-dataset-items`;
|
|
4
|
+
const body = {
|
|
5
|
+
startUrls: [{ url: pageUrl }],
|
|
6
|
+
resultsLimit: maxPosts,
|
|
7
|
+
captionText: true,
|
|
8
|
+
};
|
|
9
|
+
if (options?.since)
|
|
10
|
+
body.onlyPostsNewerThan = options.since;
|
|
11
|
+
if (options?.until)
|
|
12
|
+
body.onlyPostsOlderThan = options.until;
|
|
4
13
|
const res = await fetch(url, {
|
|
5
14
|
method: 'POST',
|
|
6
15
|
headers: {
|
|
7
16
|
'Content-Type': 'application/json',
|
|
8
17
|
Authorization: `Bearer ${token}`,
|
|
9
18
|
},
|
|
10
|
-
body: JSON.stringify(
|
|
11
|
-
startUrls: [{ url: pageUrl }],
|
|
12
|
-
resultsLimit: maxPosts,
|
|
13
|
-
}),
|
|
19
|
+
body: JSON.stringify(body),
|
|
14
20
|
signal: AbortSignal.timeout(180_000),
|
|
15
21
|
});
|
|
16
22
|
if (!res.ok) {
|
|
@@ -23,11 +29,17 @@ export async function fetchFacebookPosts(pageUrl, token, maxPosts = 3) {
|
|
|
23
29
|
const ocrTexts = (item.media ?? [])
|
|
24
30
|
.map((m) => m.ocrText ?? '')
|
|
25
31
|
.filter((t) => t.length > 0);
|
|
32
|
+
// captionText 可能在 media item 或頂層
|
|
33
|
+
const captionTexts = (item.media ?? [])
|
|
34
|
+
.map((m) => m.captionText ?? '')
|
|
35
|
+
.filter((t) => t.length > 0);
|
|
36
|
+
const captionText = captionTexts.join('\n') || item.captionText || '';
|
|
26
37
|
return {
|
|
27
38
|
id: `fb_${item.postId ?? item.id ?? ''}`,
|
|
28
39
|
source: 'facebook',
|
|
29
40
|
text: item.text ?? item.message ?? '',
|
|
30
41
|
ocrText: ocrTexts.join('\n'),
|
|
42
|
+
captionText,
|
|
31
43
|
timestamp: item.time ?? new Date().toISOString(),
|
|
32
44
|
likeCount: item.likes ?? 0,
|
|
33
45
|
commentCount: item.comments ?? 0,
|
package/dist/index.js
CHANGED
|
@@ -18,6 +18,7 @@ import { filterNewPosts as filterNew, markPostsSeen } from './seen.js';
|
|
|
18
18
|
import { withRetry } from './retry.js';
|
|
19
19
|
import { createTranscriber, transcribeVideoPosts } from './transcribe.js';
|
|
20
20
|
import { recordPredictions, updateTracking } from './tracker.js';
|
|
21
|
+
import { getDb } from './db.js';
|
|
21
22
|
// ── Config ──────────────────────────────────────────────────
|
|
22
23
|
const FB_PAGE_URL = 'https://www.facebook.com/DieWithoutBang/';
|
|
23
24
|
const DATA_DIR = process.env.DATA_DIR || join(process.cwd(), 'data');
|
|
@@ -34,7 +35,7 @@ function fromFacebook(p) {
|
|
|
34
35
|
source: 'facebook',
|
|
35
36
|
text: p.text,
|
|
36
37
|
ocrText: p.ocrText,
|
|
37
|
-
transcriptText: '',
|
|
38
|
+
transcriptText: p.captionText || '',
|
|
38
39
|
timestamp: p.timestamp,
|
|
39
40
|
likeCount: p.likeCount,
|
|
40
41
|
replyCount: p.commentCount,
|
|
@@ -65,7 +66,8 @@ async function runInner(opts) {
|
|
|
65
66
|
const allPosts = [];
|
|
66
67
|
// 1. 抓取 Facebook(含 retry)
|
|
67
68
|
try {
|
|
68
|
-
const
|
|
69
|
+
const fetchOpts = (opts.since || opts.until) ? { since: opts.since, until: opts.until } : undefined;
|
|
70
|
+
const fbPosts = await withRetry(() => fetchFacebookPosts(FB_PAGE_URL, apifyToken, opts.maxPosts, fetchOpts), { label: 'Facebook', maxRetries: 2, baseDelayMs: 5000 });
|
|
69
71
|
allPosts.push(...fbPosts.map(fromFacebook));
|
|
70
72
|
}
|
|
71
73
|
catch (err) {
|
|
@@ -81,17 +83,56 @@ async function runInner(opts) {
|
|
|
81
83
|
console.log('沒有新貼文,結束');
|
|
82
84
|
return;
|
|
83
85
|
}
|
|
84
|
-
// 2.5.
|
|
86
|
+
// 2.5. 影片轉錄(captionText 有值則跳過 Groq)
|
|
85
87
|
const transcriberType = (process.env.TRANSCRIBER ?? 'noop');
|
|
86
88
|
const transcriber = createTranscriber(transcriberType);
|
|
87
89
|
if (transcriber.name !== 'noop') {
|
|
88
|
-
const
|
|
89
|
-
|
|
90
|
-
const
|
|
91
|
-
|
|
92
|
-
|
|
90
|
+
const needsTranscribe = newPosts.filter((p) => !p.transcriptText);
|
|
91
|
+
if (needsTranscribe.length > 0) {
|
|
92
|
+
const transcripts = await transcribeVideoPosts(needsTranscribe, transcriber);
|
|
93
|
+
for (const p of needsTranscribe) {
|
|
94
|
+
const result = transcripts.get(p.id);
|
|
95
|
+
if (result)
|
|
96
|
+
p.transcriptText = result.text;
|
|
97
|
+
}
|
|
93
98
|
}
|
|
94
99
|
}
|
|
100
|
+
// 2.6. 貼文入庫
|
|
101
|
+
try {
|
|
102
|
+
const db = getDb();
|
|
103
|
+
const upsertPost = db.prepare(`
|
|
104
|
+
INSERT INTO posts (id, source, text, ocr_text, transcript_text, media_type, media_url, url, like_count, comment_count, post_timestamp, fetched_at)
|
|
105
|
+
VALUES (@id, @source, @text, @ocr_text, @transcript_text, @media_type, @media_url, @url, @like_count, @comment_count, @post_timestamp, @fetched_at)
|
|
106
|
+
ON CONFLICT(id) DO UPDATE SET
|
|
107
|
+
transcript_text = CASE WHEN excluded.transcript_text != '' THEN excluded.transcript_text ELSE posts.transcript_text END,
|
|
108
|
+
like_count = excluded.like_count,
|
|
109
|
+
comment_count = excluded.comment_count
|
|
110
|
+
`);
|
|
111
|
+
const now = new Date().toISOString();
|
|
112
|
+
const insertMany = db.transaction(() => {
|
|
113
|
+
for (const p of newPosts) {
|
|
114
|
+
upsertPost.run({
|
|
115
|
+
id: p.id,
|
|
116
|
+
source: p.source,
|
|
117
|
+
text: p.text,
|
|
118
|
+
ocr_text: p.ocrText,
|
|
119
|
+
transcript_text: p.transcriptText,
|
|
120
|
+
media_type: p.mediaType,
|
|
121
|
+
media_url: p.mediaUrl,
|
|
122
|
+
url: p.url,
|
|
123
|
+
like_count: p.likeCount,
|
|
124
|
+
comment_count: p.replyCount,
|
|
125
|
+
post_timestamp: p.timestamp,
|
|
126
|
+
fetched_at: now,
|
|
127
|
+
});
|
|
128
|
+
}
|
|
129
|
+
});
|
|
130
|
+
insertMany();
|
|
131
|
+
console.log(`[DB] 已存入 ${newPosts.length} 篇貼文`);
|
|
132
|
+
}
|
|
133
|
+
catch (err) {
|
|
134
|
+
console.error(`[DB] 貼文入庫失敗: ${err instanceof Error ? err.message : err}`);
|
|
135
|
+
}
|
|
95
136
|
// 按時間從新到舊排序
|
|
96
137
|
newPosts.sort((a, b) => new Date(b.timestamp).getTime() - new Date(a.timestamp).getTime());
|
|
97
138
|
// 標記當天貼文
|
|
@@ -226,16 +267,42 @@ async function runInner(opts) {
|
|
|
226
267
|
writeFileSync(outFile, JSON.stringify({ timestamp: new Date().toISOString(), posts: newPosts, analysis }, null, 2), 'utf-8');
|
|
227
268
|
console.log(`結果已存檔: ${outFile}`);
|
|
228
269
|
}
|
|
270
|
+
/**
|
|
271
|
+
* 產生台北時間今天指定時分的 ISO 時間戳
|
|
272
|
+
* 用於 Apify onlyPostsNewerThan 參數
|
|
273
|
+
*/
|
|
274
|
+
function taipeiToday(hours, minutes = 0) {
|
|
275
|
+
const now = new Date();
|
|
276
|
+
const taipeiStr = now.toLocaleString('en-US', { timeZone: 'Asia/Taipei' });
|
|
277
|
+
const taipeiNow = new Date(taipeiStr);
|
|
278
|
+
taipeiNow.setHours(hours, minutes, 0, 0);
|
|
279
|
+
// 轉回 UTC:台北 = UTC+8
|
|
280
|
+
const utc = new Date(taipeiNow.getTime() - 8 * 60 * 60 * 1000);
|
|
281
|
+
return utc.toISOString();
|
|
282
|
+
}
|
|
283
|
+
function taipeiYesterday(hours, minutes = 0) {
|
|
284
|
+
const now = new Date();
|
|
285
|
+
const taipeiStr = now.toLocaleString('en-US', { timeZone: 'Asia/Taipei' });
|
|
286
|
+
const taipeiNow = new Date(taipeiStr);
|
|
287
|
+
taipeiNow.setDate(taipeiNow.getDate() - 1);
|
|
288
|
+
taipeiNow.setHours(hours, minutes, 0, 0);
|
|
289
|
+
const utc = new Date(taipeiNow.getTime() - 8 * 60 * 60 * 1000);
|
|
290
|
+
return utc.toISOString();
|
|
291
|
+
}
|
|
229
292
|
// ── 入口 ────────────────────────────────────────────────────
|
|
230
293
|
if (isCronMode) {
|
|
231
|
-
//
|
|
232
|
-
|
|
294
|
+
// 早晨補漏:每天 08:00,抓前一晚 22:00 之後的貼文
|
|
295
|
+
cron.schedule('0 8 * * *', () => {
|
|
296
|
+
run({ maxPosts: 3, isDryRun: false, label: '早晨', since: taipeiYesterday(22, 0) })
|
|
297
|
+
.catch((err) => console.error('[早晨] 執行失敗:', err));
|
|
298
|
+
}, { timezone: 'Asia/Taipei' });
|
|
299
|
+
// 盤中:週一到五 09:00-13:30,每 30 分鐘,抓 08:30 之後的貼文
|
|
233
300
|
cron.schedule('7,37 9-12 * * 1-5', () => {
|
|
234
|
-
run({ maxPosts: 1, isDryRun: false, label: '盤中' })
|
|
301
|
+
run({ maxPosts: 1, isDryRun: false, label: '盤中', since: taipeiToday(8, 30) })
|
|
235
302
|
.catch((err) => console.error('[盤中] 執行失敗:', err));
|
|
236
303
|
}, { timezone: 'Asia/Taipei' });
|
|
237
304
|
cron.schedule('7 13 * * 1-5', () => {
|
|
238
|
-
run({ maxPosts: 1, isDryRun: false, label: '盤中' })
|
|
305
|
+
run({ maxPosts: 1, isDryRun: false, label: '盤中', since: taipeiToday(8, 30) })
|
|
239
306
|
.catch((err) => console.error('[盤中] 執行失敗:', err));
|
|
240
307
|
}, { timezone: 'Asia/Taipei' });
|
|
241
308
|
// 追蹤更新:週一到五 15:00(收盤後更新預測追蹤)
|
|
@@ -243,15 +310,16 @@ if (isCronMode) {
|
|
|
243
310
|
updateTracking()
|
|
244
311
|
.catch((err) => console.error('[追蹤更新] 執行失敗:', err));
|
|
245
312
|
}, { timezone: 'Asia/Taipei' });
|
|
246
|
-
// 盤後:每天晚上 23:
|
|
313
|
+
// 盤後:每天晚上 23:03,抓 13:30 之後的貼文
|
|
247
314
|
cron.schedule('3 23 * * *', () => {
|
|
248
|
-
run({ maxPosts: 3, isDryRun: false, label: '盤後' })
|
|
315
|
+
run({ maxPosts: 3, isDryRun: false, label: '盤後', since: taipeiToday(13, 30) })
|
|
249
316
|
.catch((err) => console.error('[盤後] 執行失敗:', err));
|
|
250
317
|
}, { timezone: 'Asia/Taipei' });
|
|
251
318
|
console.log('=== 巴逆逆排程已啟動 ===');
|
|
252
|
-
console.log('
|
|
319
|
+
console.log(' 早晨:每天 08:00(前晚 22:00 起,3 篇)');
|
|
320
|
+
console.log(' 盤中:週一~五 09:07/09:37/10:07/.../13:07(08:30 起,1 篇)');
|
|
253
321
|
console.log(' 追蹤更新:週一~五 15:00(預測追蹤判定)');
|
|
254
|
-
console.log(' 盤後:每天 23:03(
|
|
322
|
+
console.log(' 盤後:每天 23:03(13:30 起,3 篇)');
|
|
255
323
|
console.log(' 按 Ctrl+C 停止\n');
|
|
256
324
|
}
|
|
257
325
|
else {
|