@cablate/banini-tracker 2.0.8 → 2.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -62,6 +62,8 @@ program
62
62
  .description('抓取最新貼文(輸出 JSON 到 stdout)')
63
63
  .option('-s, --source <source>', '來源:fb', 'fb')
64
64
  .option('-n, --limit <n>', '每個來源抓幾篇', '3')
65
+ .option('--since <date>', '只抓此時間之後的貼文(YYYY-MM-DD 或 ISO 時間戳或相對時間如 "2 months")')
66
+ .option('--until <date>', '只抓此時間之前的貼文')
65
67
  .option('--no-dedup', '不做去重,抓到什麼就輸出什麼')
66
68
  .option('--mark-seen', '輸出後自動標記為已讀')
67
69
  .action(async (opts) => {
@@ -69,7 +71,8 @@ program
69
71
  const config = loadConfig();
70
72
  const limit = parseInt(opts.limit, 10);
71
73
  let posts = [];
72
- const fp = await fetchFacebookPosts(config.targets.facebookPageUrl, config.apifyToken, limit);
74
+ const fetchOpts = (opts.since || opts.until) ? { since: opts.since, until: opts.until } : undefined;
75
+ const fp = await fetchFacebookPosts(config.targets.facebookPageUrl, config.apifyToken, limit, fetchOpts);
73
76
  posts.push(...fp);
74
77
  // 按時間從新到舊
75
78
  posts.sort((a, b) => new Date(b.timestamp).getTime() - new Date(a.timestamp).getTime());
package/dist/db.js CHANGED
@@ -19,6 +19,21 @@ export function getDb() {
19
19
  }
20
20
  function migrate(db) {
21
21
  db.exec(`
22
+ CREATE TABLE IF NOT EXISTS posts (
23
+ id TEXT PRIMARY KEY,
24
+ source TEXT NOT NULL DEFAULT 'facebook',
25
+ text TEXT NOT NULL DEFAULT '',
26
+ ocr_text TEXT NOT NULL DEFAULT '',
27
+ transcript_text TEXT NOT NULL DEFAULT '',
28
+ media_type TEXT NOT NULL DEFAULT 'text',
29
+ media_url TEXT NOT NULL DEFAULT '',
30
+ url TEXT NOT NULL DEFAULT '',
31
+ like_count INTEGER NOT NULL DEFAULT 0,
32
+ comment_count INTEGER NOT NULL DEFAULT 0,
33
+ post_timestamp TEXT NOT NULL,
34
+ fetched_at TEXT NOT NULL
35
+ );
36
+
22
37
  CREATE TABLE IF NOT EXISTS predictions (
23
38
  id INTEGER PRIMARY KEY AUTOINCREMENT,
24
39
  post_id TEXT NOT NULL,
package/dist/facebook.d.ts CHANGED
@@ -3,6 +3,7 @@ export interface FacebookPost {
3
3
  source: 'facebook';
4
4
  text: string;
5
5
  ocrText: string;
6
+ captionText: string;
6
7
  timestamp: string;
7
8
  likeCount: number;
8
9
  commentCount: number;
@@ -11,4 +12,8 @@ export interface FacebookPost {
11
12
  mediaType: string;
12
13
  mediaUrl: string;
13
14
  }
14
- export declare function fetchFacebookPosts(pageUrl: string, token: string, maxPosts?: number): Promise<FacebookPost[]>;
15
/**
 * Optional time window accepted by {@link fetchFacebookPosts}.
 *
 * `since` is forwarded to the scraper request body as `onlyPostsNewerThan`
 * and `until` as `onlyPostsOlderThan`. Per the CLI help text, `since` may be
 * a `YYYY-MM-DD` date, an ISO timestamp, or a relative time such as
 * "2 months". Both fields are optional; an omitted field is simply not sent.
 */
+ export interface FetchOptions {
16
+ since?: string;
17
+ until?: string;
18
+ }
19
+ export declare function fetchFacebookPosts(pageUrl: string, token: string, maxPosts?: number, options?: FetchOptions): Promise<FacebookPost[]>;
package/dist/facebook.js CHANGED
@@ -1,16 +1,22 @@
1
- export async function fetchFacebookPosts(pageUrl, token, maxPosts = 3) {
1
+ export async function fetchFacebookPosts(pageUrl, token, maxPosts = 3, options) {
2
2
  const actorId = 'apify~facebook-posts-scraper';
3
3
  const url = `https://api.apify.com/v2/acts/${actorId}/run-sync-get-dataset-items`;
4
+ const body = {
5
+ startUrls: [{ url: pageUrl }],
6
+ resultsLimit: maxPosts,
7
+ captionText: true,
8
+ };
9
+ if (options?.since)
10
+ body.onlyPostsNewerThan = options.since;
11
+ if (options?.until)
12
+ body.onlyPostsOlderThan = options.until;
4
13
  const res = await fetch(url, {
5
14
  method: 'POST',
6
15
  headers: {
7
16
  'Content-Type': 'application/json',
8
17
  Authorization: `Bearer ${token}`,
9
18
  },
10
- body: JSON.stringify({
11
- startUrls: [{ url: pageUrl }],
12
- resultsLimit: maxPosts,
13
- }),
19
+ body: JSON.stringify(body),
14
20
  signal: AbortSignal.timeout(180_000),
15
21
  });
16
22
  if (!res.ok) {
@@ -23,11 +29,17 @@ export async function fetchFacebookPosts(pageUrl, token, maxPosts = 3) {
23
29
  const ocrTexts = (item.media ?? [])
24
30
  .map((m) => m.ocrText ?? '')
25
31
  .filter((t) => t.length > 0);
32
+ // captionText 可能在 media item 或頂層
33
+ const captionTexts = (item.media ?? [])
34
+ .map((m) => m.captionText ?? '')
35
+ .filter((t) => t.length > 0);
36
+ const captionText = captionTexts.join('\n') || item.captionText || '';
26
37
  return {
27
38
  id: `fb_${item.postId ?? item.id ?? ''}`,
28
39
  source: 'facebook',
29
40
  text: item.text ?? item.message ?? '',
30
41
  ocrText: ocrTexts.join('\n'),
42
+ captionText,
31
43
  timestamp: item.time ?? new Date().toISOString(),
32
44
  likeCount: item.likes ?? 0,
33
45
  commentCount: item.comments ?? 0,
package/dist/index.js CHANGED
@@ -18,6 +18,7 @@ import { filterNewPosts as filterNew, markPostsSeen } from './seen.js';
18
18
  import { withRetry } from './retry.js';
19
19
  import { createTranscriber, transcribeVideoPosts } from './transcribe.js';
20
20
  import { recordPredictions, updateTracking } from './tracker.js';
21
+ import { getDb } from './db.js';
21
22
  // ── Config ──────────────────────────────────────────────────
22
23
  const FB_PAGE_URL = 'https://www.facebook.com/DieWithoutBang/';
23
24
  const DATA_DIR = process.env.DATA_DIR || join(process.cwd(), 'data');
@@ -34,7 +35,7 @@ function fromFacebook(p) {
34
35
  source: 'facebook',
35
36
  text: p.text,
36
37
  ocrText: p.ocrText,
37
- transcriptText: '',
38
+ transcriptText: p.captionText || '',
38
39
  timestamp: p.timestamp,
39
40
  likeCount: p.likeCount,
40
41
  replyCount: p.commentCount,
@@ -65,7 +66,8 @@ async function runInner(opts) {
65
66
  const allPosts = [];
66
67
  // 1. 抓取 Facebook(含 retry)
67
68
  try {
68
- const fbPosts = await withRetry(() => fetchFacebookPosts(FB_PAGE_URL, apifyToken, opts.maxPosts), { label: 'Facebook', maxRetries: 2, baseDelayMs: 5000 });
69
+ const fetchOpts = (opts.since || opts.until) ? { since: opts.since, until: opts.until } : undefined;
70
+ const fbPosts = await withRetry(() => fetchFacebookPosts(FB_PAGE_URL, apifyToken, opts.maxPosts, fetchOpts), { label: 'Facebook', maxRetries: 2, baseDelayMs: 5000 });
69
71
  allPosts.push(...fbPosts.map(fromFacebook));
70
72
  }
71
73
  catch (err) {
@@ -81,17 +83,56 @@ async function runInner(opts) {
81
83
  console.log('沒有新貼文,結束');
82
84
  return;
83
85
  }
84
- // 2.5. 影片轉錄
86
+ // 2.5. 影片轉錄(captionText 有值則跳過 Groq)
85
87
  const transcriberType = (process.env.TRANSCRIBER ?? 'noop');
86
88
  const transcriber = createTranscriber(transcriberType);
87
89
  if (transcriber.name !== 'noop') {
88
- const transcripts = await transcribeVideoPosts(newPosts, transcriber);
89
- for (const p of newPosts) {
90
- const result = transcripts.get(p.id);
91
- if (result)
92
- p.transcriptText = result.text;
90
+ const needsTranscribe = newPosts.filter((p) => !p.transcriptText);
91
+ if (needsTranscribe.length > 0) {
92
+ const transcripts = await transcribeVideoPosts(needsTranscribe, transcriber);
93
+ for (const p of needsTranscribe) {
94
+ const result = transcripts.get(p.id);
95
+ if (result)
96
+ p.transcriptText = result.text;
97
+ }
93
98
  }
94
99
  }
100
+ // 2.6. 貼文入庫
101
+ try {
102
+ const db = getDb();
103
+ const upsertPost = db.prepare(`
104
+ INSERT INTO posts (id, source, text, ocr_text, transcript_text, media_type, media_url, url, like_count, comment_count, post_timestamp, fetched_at)
105
+ VALUES (@id, @source, @text, @ocr_text, @transcript_text, @media_type, @media_url, @url, @like_count, @comment_count, @post_timestamp, @fetched_at)
106
+ ON CONFLICT(id) DO UPDATE SET
107
+ transcript_text = CASE WHEN excluded.transcript_text != '' THEN excluded.transcript_text ELSE posts.transcript_text END,
108
+ like_count = excluded.like_count,
109
+ comment_count = excluded.comment_count
110
+ `);
111
+ const now = new Date().toISOString();
112
+ const insertMany = db.transaction(() => {
113
+ for (const p of newPosts) {
114
+ upsertPost.run({
115
+ id: p.id,
116
+ source: p.source,
117
+ text: p.text,
118
+ ocr_text: p.ocrText,
119
+ transcript_text: p.transcriptText,
120
+ media_type: p.mediaType,
121
+ media_url: p.mediaUrl,
122
+ url: p.url,
123
+ like_count: p.likeCount,
124
+ comment_count: p.replyCount,
125
+ post_timestamp: p.timestamp,
126
+ fetched_at: now,
127
+ });
128
+ }
129
+ });
130
+ insertMany();
131
+ console.log(`[DB] 已存入 ${newPosts.length} 篇貼文`);
132
+ }
133
+ catch (err) {
134
+ console.error(`[DB] 貼文入庫失敗: ${err instanceof Error ? err.message : err}`);
135
+ }
95
136
  // 按時間從新到舊排序
96
137
  newPosts.sort((a, b) => new Date(b.timestamp).getTime() - new Date(a.timestamp).getTime());
97
138
  // 標記當天貼文
@@ -226,16 +267,42 @@ async function runInner(opts) {
226
267
  writeFileSync(outFile, JSON.stringify({ timestamp: new Date().toISOString(), posts: newPosts, analysis }, null, 2), 'utf-8');
227
268
  console.log(`結果已存檔: ${outFile}`);
228
269
  }
270
+ /**
271
+ * 產生台北時間今天指定時分的 ISO 時間戳
272
+ * 用於 Apify onlyPostsNewerThan 參數
273
+ */
274
+ function taipeiToday(hours, minutes = 0) {
275
+ const now = new Date();
276
+ const taipeiStr = now.toLocaleString('en-US', { timeZone: 'Asia/Taipei' });
277
+ const taipeiNow = new Date(taipeiStr);
278
+ taipeiNow.setHours(hours, minutes, 0, 0);
279
+ // 轉回 UTC:台北 = UTC+8
280
+ const utc = new Date(taipeiNow.getTime() - 8 * 60 * 60 * 1000);
281
+ return utc.toISOString();
282
+ }
283
+ function taipeiYesterday(hours, minutes = 0) {
284
+ const now = new Date();
285
+ const taipeiStr = now.toLocaleString('en-US', { timeZone: 'Asia/Taipei' });
286
+ const taipeiNow = new Date(taipeiStr);
287
+ taipeiNow.setDate(taipeiNow.getDate() - 1);
288
+ taipeiNow.setHours(hours, minutes, 0, 0);
289
+ const utc = new Date(taipeiNow.getTime() - 8 * 60 * 60 * 1000);
290
+ return utc.toISOString();
291
+ }
229
292
  // ── 入口 ────────────────────────────────────────────────────
230
293
  if (isCronMode) {
231
- // 盤中:週一到五 09:00-13:30,每 30 分鐘,FB only 抓 1 篇
232
- // cron 不支援半小時結束,用 9:00-13:00 30 + 13:30 單獨一個
294
+ // 早晨補漏:每天 08:00,抓前一晚 22:00 之後的貼文
295
+ cron.schedule('0 8 * * *', () => {
296
+ run({ maxPosts: 3, isDryRun: false, label: '早晨', since: taipeiYesterday(22, 0) })
297
+ .catch((err) => console.error('[早晨] 執行失敗:', err));
298
+ }, { timezone: 'Asia/Taipei' });
299
+ // 盤中:週一到五 09:00-13:30,每 30 分鐘,抓 08:30 之後的貼文
233
300
  cron.schedule('7,37 9-12 * * 1-5', () => {
234
- run({ maxPosts: 1, isDryRun: false, label: '盤中' })
301
+ run({ maxPosts: 1, isDryRun: false, label: '盤中', since: taipeiToday(8, 30) })
235
302
  .catch((err) => console.error('[盤中] 執行失敗:', err));
236
303
  }, { timezone: 'Asia/Taipei' });
237
304
  cron.schedule('7 13 * * 1-5', () => {
238
- run({ maxPosts: 1, isDryRun: false, label: '盤中' })
305
+ run({ maxPosts: 1, isDryRun: false, label: '盤中', since: taipeiToday(8, 30) })
239
306
  .catch((err) => console.error('[盤中] 執行失敗:', err));
240
307
  }, { timezone: 'Asia/Taipei' });
241
308
  // 追蹤更新:週一到五 15:00(收盤後更新預測追蹤)
@@ -243,15 +310,16 @@ if (isCronMode) {
243
310
  updateTracking()
244
311
  .catch((err) => console.error('[追蹤更新] 執行失敗:', err));
245
312
  }, { timezone: 'Asia/Taipei' });
246
- // 盤後:每天晚上 23:00,FB 3
313
+ // 盤後:每天晚上 23:03,抓 13:30 之後的貼文
247
314
  cron.schedule('3 23 * * *', () => {
248
- run({ maxPosts: 3, isDryRun: false, label: '盤後' })
315
+ run({ maxPosts: 3, isDryRun: false, label: '盤後', since: taipeiToday(13, 30) })
249
316
  .catch((err) => console.error('[盤後] 執行失敗:', err));
250
317
  }, { timezone: 'Asia/Taipei' });
251
318
  console.log('=== 巴逆逆排程已啟動 ===');
252
- console.log(' 盤中:週一~五 09:07/09:37/10:07/.../13:07(FB, 1 篇)');
319
+ console.log(' 早晨:每天 08:00(前晚 22:00 起,3 篇)');
320
+ console.log(' 盤中:週一~五 09:07/09:37/10:07/.../13:07(08:30 起,1 篇)');
253
321
  console.log(' 追蹤更新:週一~五 15:00(預測追蹤判定)');
254
- console.log(' 盤後:每天 23:03(FB, 3 篇)');
322
+ console.log(' 盤後:每天 23:03(13:30 起,3 篇)');
255
323
  console.log(' 按 Ctrl+C 停止\n');
256
324
  }
257
325
  else {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cablate/banini-tracker",
3
- "version": "2.0.8",
3
+ "version": "2.0.10",
4
4
  "description": "巴逆逆反指標追蹤器 — 常駐排程 + CLI 雙模式",
5
5
  "type": "module",
6
6
  "bin": {