@aikeytake/social-automation 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +39 -0
- package/CLAUDE.md +256 -0
- package/CURRENT_CAPABILITIES.md +493 -0
- package/DATA_ORGANIZATION.md +416 -0
- package/IMPLEMENTATION_SUMMARY.md +287 -0
- package/INSTRUCTIONS.md +316 -0
- package/MASTER_PLAN.md +1096 -0
- package/README.md +280 -0
- package/config/sources.json +296 -0
- package/package.json +37 -0
- package/src/cli.js +197 -0
- package/src/fetchers/api.js +232 -0
- package/src/fetchers/hackernews.js +86 -0
- package/src/fetchers/linkedin.js +400 -0
- package/src/fetchers/linkedin_browser.js +167 -0
- package/src/fetchers/reddit.js +77 -0
- package/src/fetchers/rss.js +50 -0
- package/src/fetchers/twitter.js +194 -0
- package/src/index.js +346 -0
- package/src/query.js +316 -0
- package/src/utils/logger.js +74 -0
- package/src/utils/storage.js +134 -0
- package/src/writing-agents/QUICK-REFERENCE.md +111 -0
- package/src/writing-agents/WRITING-SKILLS-IMPROVEMENTS.md +273 -0
- package/src/writing-agents/utils/prompt-templates-improved.js +665 -0
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
import { chromium } from 'playwright';
|
|
2
|
+
import fs from 'fs';
|
|
3
|
+
import path from 'path';
|
|
4
|
+
import crypto from 'crypto';
|
|
5
|
+
import { fileURLToPath } from 'url';
|
|
6
|
+
import createLogger from '../utils/logger.js';
|
|
7
|
+
|
|
8
|
+
// Module-scoped logger for this fetcher.
const logger = createLogger('TwitterFetcher');

// ESM has no built-in __dirname; reconstruct it from the module URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Default location of the persistent Chrome profile used for authenticated scraping.
const DEFAULT_PROFILE_DIR = path.join(__dirname, '../../data/playwright-profile');

// Promise-based delay helper.
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
|
|
13
|
+
|
|
14
|
+
/**
 * Check that a Chromium user-data directory looks usable for scraping.
 * A valid Chromium profile always contains a `Default` subdirectory.
 *
 * @param {string} profileDir - Path to the persistent browser profile.
 * @returns {string|null} Human-readable problem description, or null when valid.
 */
function validateProfile(profileDir) {
  if (!fs.existsSync(profileDir)) {
    return 'browser profile not found';
  }
  const defaultDir = path.join(profileDir, 'Default');
  return fs.existsSync(defaultDir) ? null : 'browser profile is incomplete or empty';
}
|
|
24
|
+
|
|
25
|
+
/**
 * Fetch recent tweets from the configured accounts using a persistent
 * Chrome profile (prepared beforehand via `npm run setup:twitter`).
 *
 * @param {Object} config - Full app config; reads `config.trendingSources.twitter`.
 * @returns {Promise<Array>} Normalized tweet items, or [] when disabled or
 *   when the browser profile is missing/incomplete.
 */
export default async function twitterFetch(config) {
  const cfg = config.trendingSources?.twitter;
  if (!cfg?.enabled) return [];

  const profileDir = cfg.profileDir || DEFAULT_PROFILE_DIR;
  const profileError = validateProfile(profileDir);
  if (profileError) {
    logger.warn(`Twitter skipped: ${profileError}`);
    logger.warn('Run: npm run setup:twitter');
    return [];
  }

  // Randomise visit order with an unbiased Fisher–Yates shuffle.
  // (The previous sort(() => Math.random() - 0.5) is a biased, engine-dependent shuffle.)
  const accounts = [...(cfg.accounts ?? [])];
  for (let i = accounts.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [accounts[i], accounts[j]] = [accounts[j], accounts[i]];
  }

  // Use ?? for numeric defaults so an explicit 0 in config is respected.
  const minLikes = cfg.minLikes ?? 0;
  const maxPerAccount = cfg.maxTweetsPerAccount ?? 10;
  const maxAgeHours = cfg.maxAgeHours ?? 24;
  const cutoff = new Date(Date.now() - maxAgeHours * 3600000);

  let context;
  try {
    context = await chromium.launchPersistentContext(profileDir, {
      headless: false,
      channel: 'chrome',
      // Hide the usual automation fingerprints.
      ignoreDefaultArgs: ['--enable-automation'],
      args: ['--disable-blink-features=AutomationControlled'],
      viewport: { width: 1280, height: 800 },
    });

    const page = context.pages()[0] ?? await context.newPage();
    await sleep(5000);

    // Land on X home first so the search box is available
    await page.goto('https://x.com/home', { waitUntil: 'domcontentloaded', timeout: 20000 });
    await page.waitForSelector('[data-testid="SearchBox_Search_Input"], [aria-label="Search query"]', { timeout: 15000 });
    await sleep(2000);

    const allItems = [];

    for (let i = 0; i < accounts.length; i++) {
      const account = accounts[i];
      try {
        logger.info(`Scraping @${account}...`);
        const tweets = await scrapeAccount(page, account, maxPerAccount, minLikes, cutoff);
        allItems.push(...tweets);
        logger.debug(`  → ${tweets.length} tweets from @${account}`);
      } catch (err) {
        // One failing account must not abort the whole run.
        logger.error(`Failed @${account}: ${err.message}`);
      }

      // Rate limit: random 20-30s between accounts
      if (i < accounts.length - 1) {
        const wait = 20000 + Math.random() * 10000;
        logger.debug(`  Waiting ${Math.round(wait / 1000)}s before next account...`);
        await sleep(wait);
      }
    }

    logger.success(`Fetched ${allItems.length} tweets from ${accounts.length} accounts`);
    return allItems;
  } finally {
    // Always release the browser, even when setup or scraping throws.
    if (context) await context.close();
  }
}
|
|
89
|
+
|
|
90
|
+
/**
 * Navigate to an account's profile by typing into X's search box and
 * clicking the matching typeahead result (mimics a human visit rather
 * than loading the profile URL directly).
 *
 * @param {import('playwright').Page} page - Page already on x.com with the search box present.
 * @param {string} account - Handle to search for (without the @).
 * @throws {Error} When the typeahead shows no results for the account.
 */
async function navigateViaSearch(page, account) {
  const searchBox = '[data-testid="SearchBox_Search_Input"], [aria-label="Search query"]';
  await page.click(searchBox);
  await sleep(800 + Math.random() * 400);

  // Human-like typing cadence.
  await page.keyboard.type(account, { delay: 80 + Math.random() * 60 });
  await sleep(1500);

  // Wait for the typeahead dropdown to populate.
  await page.waitForSelector('[data-testid="TypeaheadUser"]', { timeout: 8000 });

  // Click the result whose handle matches; otherwise fall back to the first one.
  const matched = await page.evaluate((handle) => {
    const candidates = Array.from(document.querySelectorAll('[data-testid="TypeaheadUser"]'));
    const wanted = handle.toLowerCase();
    for (const candidate of candidates) {
      const text = candidate.querySelector('[tabindex="-1"] span')?.innerText?.toLowerCase() || '';
      if (text.includes(wanted)) {
        candidate.click();
        return true;
      }
    }
    if (candidates.length > 0) {
      candidates[0].click();
      return true;
    }
    return false;
  }, account);

  if (!matched) throw new Error(`No search result found for @${account}`);
  await page.waitForLoadState('domcontentloaded');
  await sleep(1500);
}
|
|
121
|
+
|
|
122
|
+
/**
 * Scroll the page in small randomized increments to trigger lazy loading,
 * pausing briefly between scrolls to look human.
 *
 * @param {import('playwright').Page} page
 * @param {number} [times=3] - Number of wheel scrolls to perform.
 */
async function scrollAndWait(page, times = 3) {
  let remaining = times;
  while (remaining > 0) {
    await page.mouse.wheel(0, 400 + Math.random() * 300);
    await sleep(600 + Math.random() * 400);
    remaining -= 1;
  }
}
|
|
128
|
+
|
|
129
|
+
/**
 * Scrape recent tweets from a single account's profile page and normalize
 * them into the shared item shape used by all fetchers.
 *
 * @param {import('playwright').Page} page
 * @param {string} account - Handle without the @.
 * @param {number} limit - Maximum tweets to read from the DOM.
 * @param {number} minLikes - Drop tweets with fewer likes than this.
 * @param {Date} cutoff - Drop tweets older than this (undated tweets are kept).
 * @returns {Promise<Array>} Normalized tweet items.
 */
async function scrapeAccount(page, account, limit, minLikes, cutoff) {
  await navigateViaSearch(page, account);
  await page.waitForSelector('article[data-testid="tweet"]', { timeout: 15000 });
  await sleep(1500);

  // Scroll a little so more tweets are in the DOM before reading it.
  await scrollAndWait(page, 3);

  const rawTweets = await page.evaluate((max) => {
    // Parse a count (likes/replies/retweets) out of a button's aria-label.
    const readCount = (article, testId) => {
      const node = article.querySelector(`[data-testid="${testId}"]`);
      const label = node?.getAttribute('aria-label') || '';
      const match = label.match(/(\d[\d,]*)/);
      return match ? parseInt(match[1].replace(/,/g, ''), 10) : 0;
    };

    const articles = Array.from(document.querySelectorAll('article[data-testid="tweet"]')).slice(0, max);
    return articles.map((article) => {
      const timeEl = article.querySelector('time');
      return {
        text: article.querySelector('[data-testid="tweetText"]')?.innerText?.trim() || '',
        link: timeEl?.closest('a')?.href || '',
        date: timeEl?.getAttribute('datetime') || '',
        likes: readCount(article, 'like'),
        replies: readCount(article, 'reply'),
        retweets: readCount(article, 'retweet'),
      };
    });
  }, limit);

  // Keep tweets with a permalink and text, enough likes, and within the age window.
  const keep = (t) =>
    t.link && t.text && t.likes >= minLikes && (!t.date || new Date(t.date) >= cutoff);

  return rawTweets.filter(keep).map((t) => ({
    // Stable id derived from the tweet permalink.
    id: crypto.createHash('md5').update(t.link).digest('hex'),
    source: 'twitter',
    sourceName: `@${account}`,
    category: 'social',
    title: t.text.length > 100 ? `${t.text.substring(0, 100)}…` : t.text,
    link: t.link,
    url: t.link,
    content: t.text,
    summary: t.text.substring(0, 200),
    author: account,
    pubDate: t.date || new Date().toISOString(),
    scraped_at: new Date().toISOString(),
    age_hours: t.date
      ? Math.floor((Date.now() - new Date(t.date).getTime()) / 3600000)
      : 0,
    tags: [],
    engagement: {
      upvotes: t.likes,
      comments: t.replies,
      retweets: t.retweets,
    },
    metadata: {
      score: t.likes,
    },
  }));
}
|
package/src/index.js
ADDED
|
@@ -0,0 +1,346 @@
|
|
|
1
|
+
import dotenv from 'dotenv';
|
|
2
|
+
import createLogger from './utils/logger.js';
|
|
3
|
+
import rssFetch from './fetchers/rss.js';
|
|
4
|
+
import redditFetch from './fetchers/reddit.js';
|
|
5
|
+
import hnFetch from './fetchers/hackernews.js';
|
|
6
|
+
import linkedinFetch from './fetchers/linkedin.js';
|
|
7
|
+
import apiFetch from './fetchers/api.js';
|
|
8
|
+
import twitterFetch from './fetchers/twitter.js';
|
|
9
|
+
import linkedinBrowserFetch from './fetchers/linkedin_browser.js';
|
|
10
|
+
import fs from 'fs';
|
|
11
|
+
import path from 'path';
|
|
12
|
+
import { fileURLToPath } from 'url';
|
|
13
|
+
|
|
14
|
+
// Load .env into process.env before any configuration is read.
dotenv.config();

// ESM equivalent of CommonJS __dirname, used for config/data paths below.
const __dirname = path.dirname(fileURLToPath(import.meta.url));

// Module-scoped logger for the orchestrator.
const logger = createLogger('Main');
|
|
18
|
+
|
|
19
|
+
/**
 * Scrape function - exports for library use.
 *
 * Runs every enabled fetcher, accumulates items and per-source counts,
 * optionally persists per-source JSON files and roll-ups, and optionally
 * mirrors everything to Supabase. One failing source never aborts the run.
 *
 * @param {Object} options
 * @param {boolean} options.toSupabase - Save results to Supabase
 * @param {string} options.supabaseUrl - Supabase URL
 * @param {string} options.supabaseKey - Supabase service key
 * @param {boolean} options.saveToFilesystem - Save to local files (default: true)
 * @returns {Promise<{date: string, scraped_at: string, sources: Object, items: Array}>}
 */
async function scrape(options = {}) {
  const {
    toSupabase = false,
    supabaseUrl,
    supabaseKey,
    saveToFilesystem = true,
  } = options;

  const config = loadConfig();
  const today = getDateString();
  const results = {
    date: today,
    scraped_at: new Date().toISOString(),
    sources: {},
    items: [],
  };

  // Initialize Supabase only when explicitly requested and fully configured.
  let supabase = null;
  if (toSupabase && supabaseUrl && supabaseKey) {
    const { createClient } = await import('@supabase/supabase-js');
    supabase = createClient(supabaseUrl, supabaseKey);
  }

  // One fetcher run: fetch, record the count, accumulate items, persist the
  // per-source file. Errors are logged and recorded as 0 items so a single
  // bad source never aborts the whole scrape. (Replaces seven copy-pasted
  // try/catch blocks.)
  const runFetcher = async (key, infoMsg, label, fetchFn) => {
    logger.info(infoMsg);
    try {
      const items = await fetchFn();
      results.sources[key] = items.length;
      results.items.push(...items);
      if (saveToFilesystem) await saveSourceData(key, items, today);
      logger.success(`✅ ${label}: ${items.length} items`);
    } catch (error) {
      logger.error(`${label} fetch failed: ${error.message}`);
      results.sources[key] = 0;
    }
  };

  // RSS Feeds
  if (config.rssFeeds && config.rssFeeds.length > 0) {
    await runFetcher('rss', '📰 Fetching from RSS feeds...', 'RSS', () => rssFetch(config));
  }

  // Reddit
  if (config.trendingSources?.reddit?.enabled) {
    await runFetcher('reddit', '📱 Fetching from Reddit...', 'Reddit', () => redditFetch(config));
  }

  // Hacker News
  if (config.trendingSources?.hackernews?.enabled) {
    await runFetcher('hackernews', '📰 Fetching from Hacker News...', 'Hacker News', () => hnFetch(config));
  }

  // Generic API sources
  for (const source of (config.apiSources || [])) {
    if (!source.enabled) continue;
    await runFetcher(source.id, `🔌 Fetching from ${source.name}...`, source.name, () => apiFetch(source));
  }

  // Browser/auth-based fetchers are skipped in Supabase (server) mode.
  if (config.linkedin_browser?.enabled && !toSupabase) {
    await runFetcher('linkedin_browser', '💼 Fetching from LinkedIn (browser)...', 'LinkedIn Browser', () => linkedinBrowserFetch(config));
  }

  if (config.trendingSources?.twitter?.enabled && !toSupabase) {
    await runFetcher('twitter', '🐦 Fetching from Twitter/X...', 'Twitter', () => twitterFetch(config));
  }

  if (config.linkedin?.enabled && !toSupabase) {
    await runFetcher('linkedin', '💼 Fetching from LinkedIn...', 'LinkedIn', () => linkedinFetch(config));
  }

  // Save to Supabase if requested
  if (supabase) {
    await saveToSupabase(supabase, results.items, today);
  }

  // Generate combined files (filesystem only)
  if (saveToFilesystem && results.items.length > 0) {
    await generateCombinedFiles(results, today);
  }

  const totalItems = Object.values(results.sources).reduce((a, b) => a + b, 0);
  logger.success(`✨ Scraping complete: ${totalItems} total items`);

  return results;
}
|
|
172
|
+
|
|
173
|
+
/**
 * Read and parse the source configuration from config/sources.json.
 *
 * @returns {Object} Parsed configuration object.
 * @throws When the file is missing or contains invalid JSON.
 */
function loadConfig() {
  const configPath = path.join(__dirname, '../config/sources.json');
  return JSON.parse(fs.readFileSync(configPath, 'utf-8'));
}
|
|
178
|
+
|
|
179
|
+
/**
 * Today's date as a YYYY-MM-DD string.
 * Note: based on toISOString(), so this is the UTC calendar date.
 */
function getDateString() {
  return new Date().toISOString().slice(0, 10);
}
|
|
182
|
+
|
|
183
|
+
/**
 * Persist one source's items as data/YYYY-MM-DD/<source>.json.
 *
 * @param {string} source - Source key (also the file name stem).
 * @param {Array} items - Normalized items for this source.
 * @param {string} today - YYYY-MM-DD date string naming the folder.
 */
async function saveSourceData(source, items, today) {
  const todayFolder = path.join(__dirname, '../data', today);
  // recursive mkdir is a no-op when the folder already exists.
  fs.mkdirSync(todayFolder, { recursive: true });

  const payload = {
    date: today,
    source: source,
    total_items: items.length,
    scraped_at: new Date().toISOString(),
    items: items,
  };
  fs.writeFileSync(path.join(todayFolder, `${source}.json`), JSON.stringify(payload, null, 2));
}
|
|
199
|
+
|
|
200
|
+
/**
 * Replace the given date's rows in the `newsletter_items` table with
 * freshly normalized items (delete-then-insert makes re-runs idempotent).
 *
 * @param {Object} supabase - Initialized Supabase client.
 * @param {Array} items - Scraped items to normalize and store.
 * @param {string} date - YYYY-MM-DD partition key for the rows.
 * @throws The Supabase error when the insert fails (after logging it).
 */
async function saveToSupabase(supabase, items, date) {
  // Normalize items to the table's column shape.
  const rows = items.map((item) => ({
    title: item.title,
    url: item.url || item.link,
    summary: item.summary || item.content?.substring(0, 300) || '',
    source: item.source,
    source_detail: item.sourceName || item.source,
    published_at: item.pubDate || item.publishedAt || new Date().toISOString(),
    category: item.category || 'general',
    engagement: item.engagement || item.metadata || { upvotes: 0, comments: 0 },
    date: date,
    scraped_at: new Date().toISOString(),
  }));

  // Clear anything previously stored for this date.
  await supabase.from('newsletter_items').delete().eq('date', date);

  const { error } = await supabase.from('newsletter_items').insert(rows);
  if (error) {
    logger.error(`Supabase insert failed: ${error.message}`);
    throw error;
  }
  logger.success(`✅ Saved ${rows.length} items to Supabase`);
}
|
|
226
|
+
|
|
227
|
+
/**
 * Build the per-day roll-up files in data/YYYY-MM-DD/: all.json (every
 * item from every per-source file) and trending.json (top 20 scored
 * items with per-source diversity).
 *
 * @param {Object} results - Run results; only `results.sources` counts are embedded.
 * @param {string} today - YYYY-MM-DD folder name.
 */
async function generateCombinedFiles(results, today) {
  const todayFolder = path.join(__dirname, '../data', today);

  // Re-read every per-source file written earlier in this run.
  const allItems = [];
  const sourceFiles = fs.readdirSync(todayFolder)
    .filter((f) => f.endsWith('.json') && f !== 'all.json' && f !== 'trending.json');
  for (const file of sourceFiles) {
    const parsed = JSON.parse(fs.readFileSync(path.join(todayFolder, file), 'utf-8'));
    allItems.push(...(parsed.items || []));
  }

  // Save all.json
  const allData = {
    date: today,
    generated_at: new Date().toISOString(),
    total_items: allItems.length,
    sources: results.sources,
    items: allItems,
  };
  fs.writeFileSync(path.join(todayFolder, 'all.json'), JSON.stringify(allData, null, 2));

  // Rank items that carry any engagement signal, highest combined score first.
  const scoredItems = allItems
    .filter((item) => item.metadata?.score || item.engagement?.upvotes || item.engagement?.points)
    .map((item) => ({ ...item, combined_score: calculateScore(item) }))
    .sort((a, b) => b.combined_score - a.combined_score);

  // Source diversity: at most 5 items per source, 20 items overall.
  const perSourceCount = {};
  const finalTrending = [];
  for (const item of scoredItems) {
    const source = item.source || 'unknown';
    perSourceCount[source] = perSourceCount[source] ?? 0;
    if (perSourceCount[source] < 5) {
      perSourceCount[source] += 1;
      finalTrending.push(item);
    }
    if (finalTrending.length >= 20) break;
  }

  const trendingData = {
    date: today,
    generated_at: new Date().toISOString(),
    total_items: finalTrending.length,
    items: finalTrending.map((item, index) => ({
      rank: index + 1,
      score: item.combined_score,
      sources: getItemSources(item),
      title: item.title,
      url: item.url || item.link,
      summary: extractSummary(item),
      keywords: extractKeywords(item),
      engagement: item.engagement || item.metadata || {},
    })),
  };

  fs.writeFileSync(path.join(todayFolder, 'trending.json'), JSON.stringify(trendingData, null, 2));
  logger.success(`✅ Generated: trending.json (${finalTrending.length} items)`);
}
|
|
289
|
+
|
|
290
|
+
/**
 * Combined engagement score: upvotes + 2x points + 0.5x comments +
 * metadata score, rounded to the nearest integer. Missing fields
 * contribute nothing.
 */
function calculateScore(item) {
  const engagement = item.engagement ?? {};
  const total =
    (engagement.upvotes || 0) +
    (engagement.points || 0) * 2 +
    (engagement.comments || 0) * 0.5 +
    (item.metadata?.score || 0);
  return Math.round(total);
}
|
|
298
|
+
|
|
299
|
+
/**
 * Source identifiers for a trending entry: the fetcher id, plus the
 * human-readable source name when present.
 */
function getItemSources(item) {
  return item.sourceName ? [item.source, item.sourceName] : [item.source];
}
|
|
304
|
+
|
|
305
|
+
/**
 * Best-available summary: explicit summary, else the first 200 chars of
 * content with a trailing ellipsis, else an empty string.
 */
function extractSummary(item) {
  if (item.summary) return item.summary;
  return item.content ? `${item.content.substring(0, 200)}...` : '';
}
|
|
310
|
+
|
|
311
|
+
/**
 * Keywords for a trending entry, preferring explicit keywords over tags;
 * falls back to an empty list.
 */
function extractKeywords(item) {
  return item.keywords || item.tags || [];
}
|
|
316
|
+
|
|
317
|
+
// CLI interface
|
|
318
|
+
async function main() {
|
|
319
|
+
const command = process.argv[2] || 'scrape';
|
|
320
|
+
|
|
321
|
+
switch (command) {
|
|
322
|
+
case 'scrape':
|
|
323
|
+
await scrape({ saveToFilesystem: true });
|
|
324
|
+
process.exit(0);
|
|
325
|
+
|
|
326
|
+
default:
|
|
327
|
+
console.log(`
|
|
328
|
+
Usage: npm run scrape OR node src/index.js scrape
|
|
329
|
+
|
|
330
|
+
Commands:
|
|
331
|
+
scrape - Scrape all sources and save to today's folder
|
|
332
|
+
|
|
333
|
+
Output will be saved to: data/YYYY-MM-DD/
|
|
334
|
+
- trending.json Top 20 trending items
|
|
335
|
+
- reddit.json All Reddit items
|
|
336
|
+
- hackernews.json All HN items
|
|
337
|
+
- rss.json All RSS items
|
|
338
|
+
- all.json All items combined
|
|
339
|
+
`);
|
|
340
|
+
}
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
main().catch(console.error);
|
|
344
|
+
|
|
345
|
+
export { scrape };
|
|
346
|
+
export default { scrape };
|