@aikeytake/social-automation 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,194 @@
1
+ import { chromium } from 'playwright';
2
+ import fs from 'fs';
3
+ import path from 'path';
4
+ import crypto from 'crypto';
5
+ import { fileURLToPath } from 'url';
6
+ import createLogger from '../utils/logger.js';
7
+
8
+ const logger = createLogger('TwitterFetcher');
9
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
10
+ const DEFAULT_PROFILE_DIR = path.join(__dirname, '../../data/playwright-profile');
11
+
12
// Pause execution for the given number of milliseconds.
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
13
+
14
/**
 * Check that a persistent Chromium profile directory looks usable.
 * @param {string} profileDir - Path to the Playwright/Chromium profile.
 * @returns {?string} Human-readable error, or null when the profile is valid.
 */
function validateProfile(profileDir) {
  const checks = [
    { target: profileDir, message: 'browser profile not found' },
    // A valid Chromium profile always contains a Default directory.
    { target: path.join(profileDir, 'Default'), message: 'browser profile is incomplete or empty' },
  ];
  for (const { target, message } of checks) {
    if (!fs.existsSync(target)) {
      return message;
    }
  }
  return null; // valid
}
24
+
25
/**
 * Fetch recent tweets for the configured accounts by driving a real Chrome
 * window with a persistent, previously logged-in profile.
 *
 * @param {Object} config - App config; reads config.trendingSources.twitter
 *   (enabled, profileDir, accounts, minLikes, maxTweetsPerAccount, maxAgeHours).
 * @returns {Promise<Array>} Normalized tweet items; [] when disabled or when
 *   the browser profile is missing/incomplete.
 */
export default async function twitterFetch(config) {
  const cfg = config.trendingSources?.twitter;
  if (!cfg?.enabled) return [];

  const profileDir = cfg.profileDir || DEFAULT_PROFILE_DIR;
  const profileError = validateProfile(profileDir);
  if (profileError) {
    logger.warn(`Twitter skipped: ${profileError}`);
    logger.warn('Run: npm run setup:twitter');
    return [];
  }

  // Randomise visit order. Fisher-Yates gives a uniform shuffle; the previous
  // sort(() => Math.random() - 0.5) comparator is biased and engine-dependent.
  const accounts = [...(cfg.accounts || [])];
  for (let i = accounts.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [accounts[i], accounts[j]] = [accounts[j], accounts[i]];
  }

  const minLikes = cfg.minLikes || 0;
  const maxPerAccount = cfg.maxTweetsPerAccount || 10;
  const maxAgeHours = cfg.maxAgeHours || 24;
  const cutoff = new Date(Date.now() - maxAgeHours * 3600000);

  let context;
  try {
    // Headed Chrome with automation fingerprints disabled to reduce bot detection.
    context = await chromium.launchPersistentContext(profileDir, {
      headless: false,
      channel: 'chrome',
      ignoreDefaultArgs: ['--enable-automation'],
      args: ['--disable-blink-features=AutomationControlled'],
      viewport: { width: 1280, height: 800 },
    });

    const page = context.pages()[0] ?? await context.newPage();
    await sleep(5000);

    // Land on X home first so the search box is available
    await page.goto('https://x.com/home', { waitUntil: 'domcontentloaded', timeout: 20000 });
    await page.waitForSelector('[data-testid="SearchBox_Search_Input"], [aria-label="Search query"]', { timeout: 15000 });
    await sleep(2000);

    const allItems = [];

    for (let i = 0; i < accounts.length; i++) {
      const account = accounts[i];
      try {
        logger.info(`Scraping @${account}...`);
        const tweets = await scrapeAccount(page, account, maxPerAccount, minLikes, cutoff);
        allItems.push(...tweets);
        logger.debug(` → ${tweets.length} tweets from @${account}`);
      } catch (err) {
        // One failed account must not abort the rest of the run.
        logger.error(`Failed @${account}: ${err.message}`);
      }

      // Rate limit: random 20-30s between accounts
      if (i < accounts.length - 1) {
        const wait = 20000 + Math.random() * 10000;
        logger.debug(` Waiting ${Math.round(wait / 1000)}s before next account...`);
        await sleep(wait);
      }
    }

    logger.success(`Fetched ${allItems.length} tweets from ${accounts.length} accounts`);
    return allItems;
  } finally {
    // Always release the browser context, even when navigation/scraping throws.
    if (context) await context.close();
  }
}
89
+
90
/**
 * Navigate to an account's profile by typing the handle into X's search box
 * and clicking the matching typeahead result (mimics human navigation rather
 * than loading the profile URL directly).
 * NOTE(review): assumes the page is already on x.com with the search box
 * rendered — confirm callers guarantee this.
 * @param {import('playwright').Page} page - Active Playwright page.
 * @param {string} account - Handle to search for (without the leading @).
 * @throws {Error} When the typeahead renders no results at all.
 */
async function navigateViaSearch(page, account) {
  // Click the search box
  await page.click('[data-testid="SearchBox_Search_Input"], [aria-label="Search query"]');
  await sleep(800 + Math.random() * 400);

  // Type account name with human-like delay
  await page.keyboard.type(account, { delay: 80 + Math.random() * 60 });
  await sleep(1500);

  // Wait for dropdown results
  await page.waitForSelector('[data-testid="TypeaheadUser"]', { timeout: 8000 });

  // Find the result whose username matches the account
  const matched = await page.evaluate((account) => {
    const results = [...document.querySelectorAll('[data-testid="TypeaheadUser"]')];
    for (const el of results) {
      // presumably this span holds the @handle text; selector may break if X
      // changes its DOM — verify periodically
      const handle = el.querySelector('[tabindex="-1"] span')?.innerText?.toLowerCase() || '';
      if (handle.includes(account.toLowerCase())) {
        el.click();
        return true;
      }
    }
    // Fall back to first result
    if (results[0]) { results[0].click(); return true; }
    return false;
  }, account);

  if (!matched) throw new Error(`No search result found for @${account}`);
  await page.waitForLoadState('domcontentloaded');
  await sleep(1500);
}
121
+
122
/**
 * Simulate human-like scrolling: several wheel ticks of random size with a
 * random pause after each one.
 * @param {import('playwright').Page} page - Active Playwright page.
 * @param {number} [times=3] - Number of wheel scrolls to perform.
 */
async function scrollAndWait(page, times = 3) {
  let remaining = times;
  while (remaining > 0) {
    const delta = 400 + Math.random() * 300;
    await page.mouse.wheel(0, delta);
    await sleep(600 + Math.random() * 400);
    remaining -= 1;
  }
}
128
+
129
/**
 * Scrape up to `limit` tweets from one account's profile page and normalize
 * them into the pipeline's common item shape.
 * @param {import('playwright').Page} page - Page already on x.com.
 * @param {string} account - Handle without the leading @.
 * @param {number} limit - Maximum tweets to extract from the DOM.
 * @param {number} minLikes - Tweets with fewer likes are dropped.
 * @param {Date} cutoff - Tweets older than this are dropped (undated kept).
 * @returns {Promise<Array>} Normalized tweet items.
 */
async function scrapeAccount(page, account, limit, minLikes, cutoff) {
  await navigateViaSearch(page, account);
  await page.waitForSelector('article[data-testid="tweet"]', { timeout: 15000 });
  await sleep(1500);

  // Scroll to load more tweets naturally
  await scrollAndWait(page, 3);

  const rawTweets = await page.evaluate((limit) => {
    const articles = [...document.querySelectorAll('article[data-testid="tweet"]')].slice(0, limit);

    return articles.map(article => {
      const textEl = article.querySelector('[data-testid="tweetText"]');
      const text = textEl?.innerText?.trim() || '';

      // The <time> element's enclosing link is the tweet's permalink.
      const timeEl = article.querySelector('time');
      const link = timeEl?.closest('a')?.href || '';
      const date = timeEl?.getAttribute('datetime') || '';

      // Read the first number out of an action button's aria-label.
      // NOTE(review): assumes the count appears first in the label — verify
      // against current X markup.
      const parseCount = (testId) => {
        const el = article.querySelector(`[data-testid="${testId}"]`);
        const label = el?.getAttribute('aria-label') || '';
        const m = label.match(/(\d[\d,]*)/);
        return m ? parseInt(m[1].replace(/,/g, ''), 10) : 0;
      };

      return {
        text,
        link,
        date,
        likes: parseCount('like'),
        replies: parseCount('reply'),
        retweets: parseCount('retweet'),
      };
    });
  }, limit);

  // Keep tweets with a permalink and text, meeting the like threshold and the
  // age window (undated tweets pass), then map into the shared item schema.
  return rawTweets
    .filter(t => t.link && t.text && t.likes >= minLikes && (!t.date || new Date(t.date) >= cutoff))
    .map(t => ({
      // Stable id from the permalink (md5 for uniqueness, not security).
      id: crypto.createHash('md5').update(t.link).digest('hex'),
      source: 'twitter',
      sourceName: `@${account}`,
      category: 'social',
      title: t.text.substring(0, 100) + (t.text.length > 100 ? '…' : ''),
      link: t.link,
      url: t.link,
      content: t.text,
      summary: t.text.substring(0, 200),
      author: account,
      pubDate: t.date || new Date().toISOString(),
      scraped_at: new Date().toISOString(),
      age_hours: t.date
        ? Math.floor((Date.now() - new Date(t.date).getTime()) / 3600000)
        : 0,
      tags: [],
      engagement: {
        upvotes: t.likes,
        comments: t.replies,
        retweets: t.retweets,
      },
      metadata: {
        score: t.likes,
      },
    }));
}
package/src/index.js ADDED
@@ -0,0 +1,346 @@
1
+ import dotenv from 'dotenv';
2
+ import createLogger from './utils/logger.js';
3
+ import rssFetch from './fetchers/rss.js';
4
+ import redditFetch from './fetchers/reddit.js';
5
+ import hnFetch from './fetchers/hackernews.js';
6
+ import linkedinFetch from './fetchers/linkedin.js';
7
+ import apiFetch from './fetchers/api.js';
8
+ import twitterFetch from './fetchers/twitter.js';
9
+ import linkedinBrowserFetch from './fetchers/linkedin_browser.js';
10
+ import fs from 'fs';
11
+ import path from 'path';
12
+ import { fileURLToPath } from 'url';
13
+
14
+ dotenv.config();
15
+
16
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
17
+ const logger = createLogger('Main');
18
+
19
/**
 * Scrape function - exports for library use.
 * @param {Object} options
 * @param {boolean} [options.toSupabase=false] - Save results to Supabase.
 * @param {string} [options.supabaseUrl] - Supabase URL.
 * @param {string} [options.supabaseKey] - Supabase service key.
 * @param {boolean} [options.saveToFilesystem=true] - Save to local files.
 * @returns {Promise<Object>} { date, scraped_at, sources, items } where
 *   sources maps each source key to its item count.
 */
async function scrape(options = {}) {
  const {
    toSupabase = false,
    supabaseUrl,
    supabaseKey,
    saveToFilesystem = true,
  } = options;

  const config = loadConfig();
  const today = getDateString();
  const results = {
    date: today,
    scraped_at: new Date().toISOString(),
    sources: {},
    items: [],
  };

  // Initialize Supabase if needed (dynamic import keeps the dependency optional)
  let supabase = null;
  if (toSupabase && supabaseUrl && supabaseKey) {
    const { createClient } = await import('@supabase/supabase-js');
    supabase = createClient(supabaseUrl, supabaseKey);
  }

  // Shared fetch wrapper: logs, fetches, records the count, persists the
  // per-source JSON, and converts any fetcher failure into a zero count so
  // one failing source never aborts the run.
  const collect = async (key, label, infoMsg, fetchFn) => {
    logger.info(infoMsg);
    try {
      const items = await fetchFn();
      results.sources[key] = items.length;
      results.items.push(...items);
      if (saveToFilesystem) await saveSourceData(key, items, today);
      logger.success(`✅ ${label}: ${items.length} items`);
    } catch (error) {
      logger.error(`${label} fetch failed: ${error.message}`);
      results.sources[key] = 0;
    }
  };

  // RSS Feeds
  if (config.rssFeeds && config.rssFeeds.length > 0) {
    await collect('rss', 'RSS', '📰 Fetching from RSS feeds...', () => rssFetch(config));
  }

  // Reddit
  if (config.trendingSources?.reddit?.enabled) {
    await collect('reddit', 'Reddit', '📱 Fetching from Reddit...', () => redditFetch(config));
  }

  // Hacker News
  if (config.trendingSources?.hackernews?.enabled) {
    await collect('hackernews', 'Hacker News', '📰 Fetching from Hacker News...', () => hnFetch(config));
  }

  // Generic API sources
  for (const source of (config.apiSources || [])) {
    if (!source.enabled) continue;
    await collect(source.id, source.name, `🔌 Fetching from ${source.name}...`, () => apiFetch(source));
  }

  // LinkedIn Browser (skip when toSupabase - requires browser)
  if (config.linkedin_browser?.enabled && !toSupabase) {
    await collect('linkedin_browser', 'LinkedIn Browser', '💼 Fetching from LinkedIn (browser)...', () => linkedinBrowserFetch(config));
  }

  // Twitter / X (skip when toSupabase - may require auth)
  if (config.trendingSources?.twitter?.enabled && !toSupabase) {
    await collect('twitter', 'Twitter', '🐦 Fetching from Twitter/X...', () => twitterFetch(config));
  }

  // LinkedIn (skip when toSupabase - requires auth)
  if (config.linkedin?.enabled && !toSupabase) {
    await collect('linkedin', 'LinkedIn', '💼 Fetching from LinkedIn...', () => linkedinFetch(config));
  }

  // Save to Supabase if requested
  if (supabase) {
    await saveToSupabase(supabase, results.items, today);
  }

  // Generate combined files (filesystem only)
  if (saveToFilesystem && results.items.length > 0) {
    await generateCombinedFiles(results, today);
  }

  const totalItems = Object.values(results.sources).reduce((a, b) => a + b, 0);
  logger.success(`✨ Scraping complete: ${totalItems} total items`);

  return results;
}
172
+
173
/**
 * Load and parse config/sources.json, resolved relative to this module.
 * @returns {Object} Parsed configuration object.
 */
function loadConfig() {
  const configFile = path.join(__dirname, '../config/sources.json');
  return JSON.parse(fs.readFileSync(configFile, 'utf-8'));
}
178
+
179
/** @returns {string} Today's date (UTC) formatted as YYYY-MM-DD. */
function getDateString() {
  return new Date().toISOString().slice(0, 10);
}
182
+
183
/**
 * Persist one source's items to data/<date>/<source>.json.
 * @param {string} source - Source key (also used as the filename stem).
 * @param {Array} items - Normalized items for that source.
 * @param {string} today - Date string (YYYY-MM-DD) naming the day folder.
 */
async function saveSourceData(source, items, today) {
  const todayFolder = path.join(__dirname, '../data', today);
  // recursive mkdir is a no-op when the folder already exists
  fs.mkdirSync(todayFolder, { recursive: true });

  const payload = {
    date: today,
    source: source,
    total_items: items.length,
    scraped_at: new Date().toISOString(),
    items: items
  };
  fs.writeFileSync(path.join(todayFolder, `${source}.json`), JSON.stringify(payload, null, 2));
}
199
+
200
/**
 * Replace the given date's rows in the newsletter_items table.
 * @param {Object} supabase - Initialized Supabase client.
 * @param {Array} items - Raw scraped items (normalized here to table columns).
 * @param {string} date - YYYY-MM-DD partition key.
 * @throws When the insert fails (delete failures are logged, not thrown).
 */
async function saveToSupabase(supabase, items, date) {
  // Normalize items to standard format
  const normalizedItems = items.map(item => ({
    title: item.title,
    url: item.url || item.link,
    summary: item.summary || item.content?.substring(0, 300) || '',
    source: item.source,
    source_detail: item.sourceName || item.source,
    published_at: item.pubDate || item.publishedAt || new Date().toISOString(),
    category: item.category || 'general',
    engagement: item.engagement || item.metadata || { upvotes: 0, comments: 0 },
    date: date,
    scraped_at: new Date().toISOString(),
  }));

  // Clear old items for this date. Best-effort: the previous code ignored the
  // result entirely; surface a failure in the logs (stale rows would otherwise
  // duplicate silently) without aborting today's insert.
  const { error: deleteError } = await supabase.from('newsletter_items').delete().eq('date', date);
  if (deleteError) {
    logger.error(`Supabase delete failed: ${deleteError.message}`);
  }

  // Insert new items
  const { error } = await supabase.from('newsletter_items').insert(normalizedItems);
  if (error) {
    logger.error(`Supabase insert failed: ${error.message}`);
    throw error;
  }
  logger.success(`✅ Saved ${normalizedItems.length} items to Supabase`);
}
226
+
227
/**
 * Build the per-day aggregate files in data/<today>/: all.json (every item
 * from every persisted per-source file) and trending.json (top 20 items by
 * combined engagement score, capped per source for diversity).
 * @param {Object} results - Scrape results; results.sources (per-source
 *   counts) is embedded into all.json.
 * @param {string} today - YYYY-MM-DD folder name under data/.
 */
async function generateCombinedFiles(results, today) {
  const todayFolder = path.join(__dirname, '../data', today);

  // Load all source files — re-read from disk (rather than results.items) so
  // the aggregate reflects exactly what was persisted; skip prior aggregates
  const allItems = [];
  const sourceFiles = fs.readdirSync(todayFolder)
    .filter(f => f.endsWith('.json') && f !== 'all.json' && f !== 'trending.json');
  for (const file of sourceFiles) {
    const filePath = path.join(todayFolder, file);
    const content = fs.readFileSync(filePath, 'utf-8');
    const data = JSON.parse(content);
    allItems.push(...(data.items || []));
  }

  // Save all.json
  const allData = {
    date: today,
    generated_at: new Date().toISOString(),
    total_items: allItems.length,
    sources: results.sources,
    items: allItems
  };
  fs.writeFileSync(path.join(todayFolder, 'all.json'), JSON.stringify(allData, null, 2));

  // Generate trending.json (top 20 by score with source diversity);
  // the filter keeps only items carrying some truthy engagement signal
  const scoredItems = allItems
    .filter(item => item.metadata?.score || item.engagement?.upvotes || item.engagement?.points || 0)
    .map(item => ({ ...item, combined_score: calculateScore(item) }))
    .sort((a, b) => b.combined_score - a.combined_score);

  // Apply source diversity: max 5 items per source, stopping at 20 total
  const trendingBySource = {};
  const finalTrending = [];
  for (const item of scoredItems) {
    const source = item.source || 'unknown';
    if (!trendingBySource[source]) trendingBySource[source] = 0;
    if (trendingBySource[source] < 5) {
      trendingBySource[source]++;
      finalTrending.push(item);
    }
    if (finalTrending.length >= 20) break;
  }

  const trendingData = {
    date: today,
    generated_at: new Date().toISOString(),
    total_items: finalTrending.length,
    items: finalTrending.map((item, index) => ({
      rank: index + 1,
      score: item.combined_score,
      sources: getItemSources(item),
      title: item.title,
      url: item.url || item.link,
      summary: extractSummary(item),
      keywords: extractKeywords(item),
      engagement: item.engagement || item.metadata || {}
    }))
  };

  fs.writeFileSync(path.join(todayFolder, 'trending.json'), JSON.stringify(trendingData, null, 2));
  logger.success(`✅ Generated: trending.json (${finalTrending.length} items)`);
}
289
+
290
/**
 * Combine engagement signals into a single integer score.
 * Weights: upvotes x1, points x2, comments x0.5, metadata.score x1.
 * @param {Object} item - Scraped item with optional engagement/metadata.
 * @returns {number} Rounded combined score (0 when no signals present).
 */
function calculateScore(item) {
  const { upvotes, points, comments } = item.engagement ?? {};
  const total =
    (upvotes ? upvotes : 0) +
    (points ? points * 2 : 0) +
    (comments ? comments * 0.5 : 0) +
    (item.metadata?.score ? item.metadata.score : 0);
  return Math.round(total);
}
298
+
299
/**
 * List the identifiers describing where an item came from.
 * @param {Object} item
 * @returns {Array} [source] or [source, sourceName] when a sourceName exists.
 */
function getItemSources(item) {
  return item.sourceName ? [item.source, item.sourceName] : [item.source];
}
304
+
305
/**
 * Best-available short text for an item: the explicit summary, else a
 * 200-character content excerpt with ellipsis, else the empty string.
 * @param {Object} item
 * @returns {string}
 */
function extractSummary(item) {
  if (item.summary) return item.summary;
  return item.content ? `${item.content.substring(0, 200)}...` : '';
}
310
+
311
/**
 * Keyword list for an item: keywords, falling back to tags, then [].
 * @param {Object} item
 * @returns {Array}
 */
function extractKeywords(item) {
  return item.keywords || item.tags || [];
}
316
+
317
// CLI interface
/**
 * CLI entry point: dispatches on process.argv[2]. "scrape" (also the default
 * when no command is given) runs a full filesystem scrape then exits; any
 * unknown command prints usage.
 */
async function main() {
  const command = process.argv[2] || 'scrape';

  switch (command) {
    case 'scrape':
      await scrape({ saveToFilesystem: true });
      // process.exit never returns, so no break is needed to avoid
      // falling through into the usage text
      process.exit(0);

    default:
      console.log(`
Usage: npm run scrape OR node src/index.js scrape

Commands:
scrape - Scrape all sources and save to today's folder

Output will be saved to: data/YYYY-MM-DD/
- trending.json Top 20 trending items
- reddit.json All Reddit items
- hackernews.json All HN items
- rss.json All RSS items
- all.json All items combined
`);
  }
}
342
+
343
+ main().catch(console.error);
344
+
345
+ export { scrape };
346
+ export default { scrape };