@aikeytake/social-automation 2.0.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@aikeytake/social-automation",
3
- "version": "2.0.0",
3
+ "version": "2.0.2",
4
4
  "description": "Content research and aggregation tool for AI agents",
5
5
  "main": "src/index.js",
6
6
  "type": "module",
@@ -28,7 +28,6 @@
28
28
  "axios": "^1.7.9",
29
29
  "cheerio": "^1.0.0",
30
30
  "dotenv": "^16.4.7",
31
- "playwright": "^1.58.2",
32
31
  "rss-parser": "^3.13.0"
33
32
  },
34
33
  "devDependencies": {
package/src/index.js CHANGED
@@ -3,10 +3,7 @@ import createLogger from './utils/logger.js';
3
3
  import rssFetch from './fetchers/rss.js';
4
4
  import redditFetch from './fetchers/reddit.js';
5
5
  import hnFetch from './fetchers/hackernews.js';
6
- import linkedinFetch from './fetchers/linkedin.js';
7
6
  import apiFetch from './fetchers/api.js';
8
- import twitterFetch from './fetchers/twitter.js';
9
- import linkedinBrowserFetch from './fetchers/linkedin_browser.js';
10
7
  import fs from 'fs';
11
8
  import path from 'path';
12
9
  import { fileURLToPath } from 'url';
@@ -109,51 +106,6 @@ async function scrape(options = {}) {
109
106
  }
110
107
  }
111
108
 
112
- // LinkedIn Browser (skip when toSupabase - requires browser)
113
- if (config.linkedin_browser?.enabled && !toSupabase) {
114
- logger.info('💼 Fetching from LinkedIn (browser)...');
115
- try {
116
- const items = await linkedinBrowserFetch(config);
117
- results.sources.linkedin_browser = items.length;
118
- results.items.push(...items);
119
- if (saveToFilesystem) await saveSourceData('linkedin_browser', items, today);
120
- logger.success(`✅ LinkedIn Browser: ${items.length} items`);
121
- } catch (error) {
122
- logger.error(`LinkedIn Browser fetch failed: ${error.message}`);
123
- results.sources.linkedin_browser = 0;
124
- }
125
- }
126
-
127
- // Twitter / X (skip when toSupabase - may require auth)
128
- if (config.trendingSources?.twitter?.enabled && !toSupabase) {
129
- logger.info('🐦 Fetching from Twitter/X...');
130
- try {
131
- const twitterItems = await twitterFetch(config);
132
- results.sources.twitter = twitterItems.length;
133
- results.items.push(...twitterItems);
134
- if (saveToFilesystem) await saveSourceData('twitter', twitterItems, today);
135
- logger.success(`✅ Twitter: ${twitterItems.length} items`);
136
- } catch (error) {
137
- logger.error(`Twitter fetch failed: ${error.message}`);
138
- results.sources.twitter = 0;
139
- }
140
- }
141
-
142
- // LinkedIn (skip when toSupabase - requires auth)
143
- if (config.linkedin?.enabled && !toSupabase) {
144
- logger.info('💼 Fetching from LinkedIn...');
145
- try {
146
- const linkedinItems = await linkedinFetch(config);
147
- results.sources.linkedin = linkedinItems.length;
148
- results.items.push(...linkedinItems);
149
- if (saveToFilesystem) await saveSourceData('linkedin', linkedinItems, today);
150
- logger.success(`✅ LinkedIn: ${linkedinItems.length} items`);
151
- } catch (error) {
152
- logger.error(`LinkedIn fetch failed: ${error.message}`);
153
- results.sources.linkedin = 0;
154
- }
155
- }
156
-
157
109
  // Save to Supabase if requested
158
110
  if (supabase) {
159
111
  await saveToSupabase(supabase, results.items, today);
@@ -1,400 +0,0 @@
1
- import axios from 'axios';
2
- import fs from 'fs';
3
- import path from 'path';
4
- import { fileURLToPath } from 'url';
5
- import crypto from 'crypto';
6
- import createLogger from '../utils/logger.js';
7
-
8
- const logger = createLogger('LinkedInFetcher');
9
-
10
- const BRIGHTDATA_API_URL = 'https://api.brightdata.com/request';
11
- const __dirname = path.dirname(fileURLToPath(import.meta.url));
12
- const STATE_FILE = path.join(__dirname, '../../data/kol-state.json');
13
-
14
- // Defaults (overridable via config/sources.json linkedin section)
15
- const DEFAULTS = {
16
- batchSize: 8, // KOLs per SERP query
17
- budgetPerRun: 25, // Max SERP API calls per run (25 × 8 = 200 KOLs per run)
18
- checkIntervalHours: 24, // Re-check each KOL every 24h
19
- timeRange: 'w', // w=week, d=day, m=month
20
- resultsPerBatch: 10, // Google results per batch query
21
- enrichContent: true, // Scrape each post URL for full content + engagement
22
- enrichConcurrency: 5, // Parallel enrichment requests
23
- };
24
-
25
/**
 * Fetch recent LinkedIn posts for the configured KOL profiles through the
 * Bright Data request API.
 *
 * Only KOLs whose `lastChecked` entry in the state file is missing or older
 * than `checkIntervalHours` are queried, capped at `budgetPerRun * batchSize`
 * KOLs per run. State is updated per successful batch and written once at the end.
 *
 * @param {object} config - Sources config; `config.linkedin` overrides DEFAULTS.
 * @returns {Promise<object[]>} New posts. Empty when the source is disabled,
 *   the API key or profiles file is missing/invalid, or no KOL is due.
 */
export default async function linkedinFetch(config) {
  if (!config.linkedin?.enabled) return [];

  const BRIGHTDATA_API_KEY = process.env.BRIGHTDATA_API_KEY;
  const BRIGHTDATA_ZONE = process.env.BRIGHTDATA_ZONE || 'mcp_unlocker';

  if (!BRIGHTDATA_API_KEY) {
    logger.warn('BRIGHTDATA_API_KEY not set, skipping LinkedIn scraping');
    return [];
  }

  const profilesFile = config.linkedin.profilesFile;
  if (!fs.existsSync(profilesFile)) {
    logger.error(`LinkedIn profiles file not found: ${profilesFile}`);
    return [];
  }

  let profiles;
  try {
    profiles = JSON.parse(fs.readFileSync(profilesFile, 'utf-8'));
  } catch (err) {
    logger.error(`Failed to parse LinkedIn profiles file: ${err.message}`);
    return [];
  }

  // Valid JSON is not necessarily a list; bail out instead of crashing on .filter().
  if (!Array.isArray(profiles)) {
    logger.error(`LinkedIn profiles file must contain a JSON array: ${profilesFile}`);
    return [];
  }

  const cfg = { ...DEFAULTS, ...config.linkedin };
  const state = loadState();
  const now = new Date();
  const cutoffMs = cfg.checkIntervalHours * 3600 * 1000;

  // Select only KOLs not checked recently
  const dueKols = profiles.filter(p => {
    const last = state[p.name]?.lastChecked;
    return !last || (now - new Date(last)) >= cutoffMs;
  });

  const maxKols = cfg.budgetPerRun * cfg.batchSize;
  const selectedKols = dueKols.slice(0, maxKols);

  if (selectedKols.length === 0) {
    logger.info(`LinkedIn: all ${profiles.length} KOLs recently checked, nothing due`);
    return [];
  }

  const numBatches = Math.ceil(selectedKols.length / cfg.batchSize);
  logger.info(`LinkedIn: checking ${selectedKols.length}/${profiles.length} KOLs in ${numBatches} batches...`);

  const allPosts = [];
  const batches = chunk(selectedKols, cfg.batchSize);
  let checked = 0; // only KOLs from batches that actually completed

  for (const batch of batches) {
    try {
      let posts = await fetchBatchWithRetry(batch, state, cfg, BRIGHTDATA_API_KEY, BRIGHTDATA_ZONE);

      // Enrich posts with full content + engagement by scraping each post URL
      if (cfg.enrichContent && posts.length > 0) {
        posts = await enrichPosts(posts, cfg, BRIGHTDATA_API_KEY, BRIGHTDATA_ZONE);
      }

      allPosts.push(...posts);

      // Update state: mark all KOLs in batch as checked, record seen post IDs
      for (const kol of batch) {
        if (!state[kol.name]) state[kol.name] = { seenPostIds: [] };
        state[kol.name].lastChecked = now.toISOString();
        const newIds = posts.filter(p => p.sourceName === kol.name).map(p => p.id);
        state[kol.name].seenPostIds = [
          ...new Set([...newIds, ...(state[kol.name].seenPostIds || [])]),
        ].slice(0, 100); // keep last 100 seen post IDs per KOL
      }
      checked += batch.length;

      // Short pause between SERP requests
      await new Promise(r => setTimeout(r, 600));
    } catch (err) {
      // A failed batch leaves its KOLs unmarked, so they stay due for the next run.
      logger.error(`Batch failed: ${err.message}`);
    }
  }

  saveState(state);

  const remaining = dueKols.length - checked;
  logger.success(`LinkedIn: ${allPosts.length} new posts (${checked} KOLs checked, ${remaining} still due)`);
  return allPosts;
}
109
-
110
/**
 * Run fetchBatch, retrying after a 2 s pause when the request timed out.
 *
 * @param {object[]} batch - KOL entries for one SERP query.
 * @param {object} state - Per-KOL state passed through to fetchBatch.
 * @param {object} cfg - Effective LinkedIn config.
 * @param {string} apiKey - Bright Data API key.
 * @param {string} zone - Bright Data zone name.
 * @param {number} [retries=1] - Remaining retry attempts.
 * @returns {Promise<object[]>} Posts returned by fetchBatch.
 * @throws Re-throws any non-timeout error, or the timeout error once retries run out.
 */
async function fetchBatchWithRetry(batch, state, cfg, apiKey, zone, retries = 1) {
  let attemptsLeft = retries;
  for (;;) {
    try {
      return await fetchBatch(batch, state, cfg, apiKey, zone);
    } catch (err) {
      const canRetry =
        attemptsLeft > 0 && (err.code === 'ECONNABORTED' || err.message?.includes('timeout'));
      if (!canRetry) throw err;

      logger.warn(`Batch timed out, retrying... (${attemptsLeft} left)`);
      await new Promise(resolve => setTimeout(resolve, 2000));
      attemptsLeft -= 1;
    }
  }
}
122
-
123
/**
 * Issue one Google SERP query (through Bright Data) covering every KOL in the
 * batch, and convert the organic results into post records.
 *
 * Results are dropped when they are not linkedin.com/posts links, cannot be
 * attributed to a KOL in the batch, carry only boilerplate text, or have an
 * ID already listed in that KOL's `seenPostIds`.
 *
 * @param {object[]} batch - KOL entries (`name`, optional `role`).
 * @param {object} state - Per-KOL state keyed by name.
 * @param {object} cfg - Effective config (`resultsPerBatch`, `timeRange`).
 * @param {string} apiKey - Bright Data API key.
 * @param {string} zone - Bright Data zone name.
 * @returns {Promise<object[]>} Unseen posts found for this batch.
 */
async function fetchBatch(batch, state, cfg, apiKey, zone) {
  // One query for the whole batch: site:linkedin.com/posts ("A" OR "B" OR ...)
  const quotedNames = batch.map(k => `"${k.name}"`);
  const searchQuery = `site:linkedin.com/posts (${quotedNames.join(' OR ')})`;
  const googleUrl =
    'https://www.google.com/search' +
    `?q=${encodeURIComponent(searchQuery)}` +
    `&num=${cfg.resultsPerBatch}` +
    `&tbs=qdr:${cfg.timeRange}` + // time filter: recent posts only
    '&brd_json=1';

  const payload = { zone, url: googleUrl, format: 'raw', data_format: 'parsed_light' };
  const requestOptions = {
    headers: { Authorization: `Bearer ${apiKey}`, 'Content-Type': 'application/json' },
    timeout: 40000,
  };
  const response = await axios.post(BRIGHTDATA_API_URL, payload, requestOptions);

  const posts = [];
  for (const item of response.data?.organic || []) {
    if (!item.link?.includes('linkedin.com/posts')) continue;

    // Attribute the result to one KOL of this batch, or drop it
    const kol = matchKol(item, batch);
    if (!kol) continue;

    // Profile-bio / UI-chrome snippets carry no post content
    const rawContent = item.description || item.snippet || '';
    if (!isUsefulContent(rawContent)) continue;

    const id = crypto.createHash('md5').update(item.link).digest('hex');

    // Already reported for this KOL in an earlier run
    if (state[kol.name]?.seenPostIds?.includes(id)) continue;

    const content = cleanContent(rawContent);
    posts.push({
      id,
      source: 'linkedin',
      sourceName: kol.name,
      category: 'linkedin-kol',
      title: cleanTitle(item.title || '', kol.name),
      link: item.link,
      url: item.link,
      content,
      summary: content.substring(0, 200),
      author: kol.name,
      role: kol.role || '',
      pubDate: extractDate(item) || new Date().toISOString(),
      scraped_at: new Date().toISOString(),
      age_hours: 0,
      engagement: { upvotes: 0, comments: 0 },
      metadata: { score: 0 },
    });
  }

  return posts;
}
185
-
186
/**
 * Enrich posts by scraping each post URL, processing them in consecutive
 * groups of `cfg.enrichConcurrency` (5 when unset or falsy).
 *
 * @param {object[]} posts - Posts produced by fetchBatch.
 * @param {object} cfg - Effective LinkedIn config.
 * @param {string} apiKey - Bright Data API key.
 * @param {string} zone - Bright Data zone name.
 * @returns {Promise<object[]>} The posts, in input order, after enrichment.
 */
async function enrichPosts(posts, cfg, apiKey, zone) {
  const groupSize = cfg.enrichConcurrency || 5;
  const enriched = [];

  let offset = 0;
  while (offset < posts.length) {
    const group = posts.slice(offset, offset + groupSize);
    // Each group runs in parallel; groups run one after another.
    const done = await Promise.all(group.map(post => enrichPost(post, apiKey, zone)));
    for (const post of done) enriched.push(post);
    offset += groupSize;
  }

  return enriched;
}
199
-
200
/**
 * Scrape one post URL as markdown and overwrite the post's content, summary,
 * engagement and pubDate with what the page provides.
 *
 * Best-effort: on any failure the post is returned untouched, keeping the
 * SERP snippet. The post object is mutated in place.
 *
 * @param {object} post - Post record with a `url`.
 * @param {string} apiKey - Bright Data API key.
 * @param {string} zone - Bright Data zone name.
 * @returns {Promise<object>} The same post object.
 */
async function enrichPost(post, apiKey, zone) {
  try {
    const { data } = await axios.post(
      BRIGHTDATA_API_URL,
      { zone, url: post.url, format: 'raw', data_format: 'markdown' },
      {
        headers: { Authorization: `Bearer ${apiKey}`, 'Content-Type': 'application/json' },
        timeout: 40000,
      }
    );

    // Non-string bodies are serialised so the text extractors can still run on them
    const markdown = typeof data === 'string' ? data : JSON.stringify(data);

    // Compute everything first, then assign, so a parse error leaves the post untouched
    const body = extractPostContent(markdown);
    const counts = extractEngagement(markdown);
    const publishedAt = extractDateFromPage(markdown) || post.pubDate;

    if (body) {
      post.content = body;
      post.summary = body.substring(0, 200);
    }
    post.engagement = counts;
    post.pubDate = publishedAt;
  } catch (err) {
    // Best-effort — keep SERP snippet if scraping fails
    logger.debug(`Enrich failed for ${post.sourceName}: ${err.message}`);
  }
  return post;
}
228
-
229
/**
 * Extract the full post text from LinkedIn page markdown.
 *
 * Expected page structure (logged-out view):
 *   [Author] [Job] [N followers]
 *   [Nd/w/h ago] •
 *   [Full post text here]
 *   Like Comment Repost Send
 *   [N reactions] • [N comments]
 *
 * The text starts on the line after the first relative-date marker
 * ("3d •", "5h •", "1mo •", "2 weeks ago") and ends before the first
 * engagement / sign-in line.
 *
 * @param {string} markdown - Page content as markdown.
 * @returns {string|null} Cleaned post text, or null when no date marker is
 *   found or the extracted text is 30 characters or shorter.
 */
function extractPostContent(markdown) {
  const lines = markdown.split('\n');

  // "mo" must be tried before the single letters, otherwise "1mo •" fails on the "o".
  const shortMarker = /^\d+(?:mo|[hdwm])\s*[•·]/;
  const longMarker = /\d+\s+(hour|day|week|month)s?\s+ago/i;

  let start = -1;
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim();
    if (shortMarker.test(line) || longMarker.test(line)) {
      start = i + 1;
      break;
    }
  }

  if (start === -1) return null;

  // Find where content ends: engagement buttons, reaction counts or login prompts
  let end = lines.length;
  for (let i = start; i < lines.length; i++) {
    const line = lines[i].trim().toLowerCase();
    if (
      line === 'like comment repost send' ||
      (line.startsWith('like') && line.includes('comment')) ||
      /^\d[\d,]*\s*reaction/.test(line) ||
      line === 'reactions' ||
      line.includes('sign in') ||
      line.includes('join now')
    ) {
      end = i;
      break;
    }
  }

  const content = lines
    .slice(start, end)
    .join('\n')
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1') // collapse markdown links to text
    .replace(/^#+\s/gm, '') // strip heading markers
    .replace(/\n{3,}/g, '\n\n')
    .trim();

  return content.length > 30 ? content : null;
}
278
-
279
/**
 * Pull reaction / comment / repost counts out of LinkedIn page markdown.
 *
 * For each metric the first "<number> <word>" occurrence is used; thousands
 * separators are removed. A metric that does not appear is reported as 0.
 *
 * @param {string} markdown - Page content as markdown.
 * @returns {{upvotes: number, comments: number, reposts: number}}
 */
function extractEngagement(markdown) {
  function countFor(pattern) {
    const found = markdown.match(pattern);
    if (!found) return 0;
    return parseInt(found[1].replace(/,/g, ''));
  }

  const upvotes = countFor(/(\d[\d,]*)\s*reaction/i);
  const comments = countFor(/(\d[\d,]*)\s*comment/i);
  const reposts = countFor(/(\d[\d,]*)\s*repost/i);
  return { upvotes, comments, reposts };
}
291
-
292
/**
 * Derive an approximate publication date from a relative-time marker in
 * LinkedIn page markdown ("3 days ago", or the short "3d •" form).
 *
 * Months are approximated as 30 days.
 *
 * @param {string} markdown - Page content as markdown.
 * @returns {string|null} ISO timestamp, or null when no marker is present.
 */
function extractDateFromPage(markdown) {
  const longForm = markdown.match(/(\d+)\s*(hour|day|week|month)s?\s*ago/i);
  const hit = longForm || markdown.match(/(\d+)(h|d|w|mo)\s*[•·]/);
  if (!hit) return null;

  const amount = parseInt(hit[1]);
  const rawUnit = hit[2].toLowerCase();

  // Expand short units to the long names used by the duration table
  const shortToLong = { h: 'hour', d: 'day', w: 'week', mo: 'month' };
  const unitName = shortToLong[rawUnit] || rawUnit;

  const unitMs = { hour: 3600000, day: 86400000, week: 604800000, month: 2592000000 };
  const span = unitMs[unitName] || 0;
  if (!span) return null;

  return new Date(Date.now() - amount * span).toISOString();
}
303
-
304
/**
 * Attribute a search result to a KOL from the batch.
 *
 * Two signals are accepted, tried per KOL in this order:
 *   1. the username in the post URL contains both the first and last name part;
 *   2. the "<Name>'s Post" prefix of the title contains both name parts.
 * Results that merely mention a KOL are rejected. KOLs whose sanitized name
 * has fewer than two parts are never matched.
 *
 * @param {object} item - SERP result (`link`, `title`).
 * @param {object[]} batch - KOL entries with a `name`.
 * @returns {object|null} The first matching KOL, or null.
 */
function matchKol(item, batch) {
  const urlUsername = extractUrlUsername(item.link || '');
  const titleMatch = (item.title || '').match(/^(.+?)'s Post/i);
  const titleAuthor = titleMatch?.[1]?.toLowerCase().trim() || '';

  const hasBoth = (text, a, b) => Boolean(text) && text.includes(a) && text.includes(b);

  const winner = batch.find(kol => {
    const nameParts = sanitizeName(kol.name).split(' ').filter(Boolean);
    if (nameParts.length < 2) return false;

    const first = nameParts[0];
    const last = nameParts[nameParts.length - 1];

    // URL username is the more reliable authorship signal; the title is the fallback
    return hasBoth(urlUsername, first, last) || hasBoth(titleAuthor, first, last);
  });

  return winner ?? null;
}
328
-
329
/**
 * Return the lower-cased author username from a LinkedIn post URL of the form
 * linkedin.com/posts/USERNAME_post-title-activity-ID.
 *
 * @param {string} url - Post URL.
 * @returns {string} Username, or '' when the URL does not match.
 */
function extractUrlUsername(url) {
  const found = /linkedin\.com\/posts\/([^_/]+)/.exec(url);
  if (!found) return '';
  return found[1].toLowerCase();
}
335
-
336
/**
 * Normalise a person's name for matching: lower-case, accents folded to
 * their base letter, everything except a-z, 0-9 and spaces removed.
 *
 * Accents are folded rather than deleted so that "José García" becomes
 * "jose garcia" instead of "jos garca".
 *
 * @param {string} name - Display name.
 * @returns {string} Sanitized name.
 */
function sanitizeName(name) {
  return name
    .toLowerCase()
    .normalize('NFD') // split accented letters into base letter + combining mark
    .replace(/[\u0300-\u036f]/g, '') // drop the combining marks
    .replace(/[^a-z0-9 ]/g, '')
    .trim();
}
339
-
340
/**
 * Decide whether a SERP snippet looks like real post text.
 *
 * @param {string} content - Snippet text.
 * @returns {boolean} false for empty text, text shorter than 40 characters,
 *   or text containing one of the known LinkedIn boilerplate phrases.
 */
function isUsefulContent(content) {
  if (!content) return false;
  if (content.length < 40) return false;

  // Common boilerplate patterns from LinkedIn search results
  const boilerplatePhrases = [
    'view profile for',
    'report this comment',
    'close menu',
    'like · reply',
    '1 reaction',
  ];

  const haystack = content.toLowerCase();
  return boilerplatePhrases.every(phrase => !haystack.includes(phrase));
}
354
-
355
/**
 * Strip LinkedIn UI chrome from a snippet and collapse whitespace.
 *
 * @param {string} text - Raw snippet.
 * @returns {string} Text without a trailing "...Read more", without
 *   "View profile for <name>." fragments, and with whitespace runs collapsed.
 */
function cleanContent(text) {
  const withoutReadMore = text.replace(/\.\.\.Read more$/i, '');
  const withoutProfileLinks = withoutReadMore.replace(/View profile for [^.]+\./gi, '');
  const singleSpaced = withoutProfileLinks.replace(/\s+/g, ' ');
  return singleSpaced.trim();
}
363
-
364
/**
 * Remove the leading "<Name>'s Post" boilerplate (and an optional dash
 * separator) from a search-result title.
 *
 * The author name is escaped before being placed in the pattern, so names
 * containing regex metacharacters such as "(", "+" or "." are matched
 * literally instead of mis-matching or throwing a SyntaxError.
 *
 * @param {string} title - Raw result title.
 * @param {string} authorName - KOL name expected at the start of the title.
 * @returns {string} Title without the prefix, or the original title when
 *   nothing would remain.
 */
function cleanTitle(title, authorName) {
  const escapedName = authorName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const cleaned = title
    .replace(new RegExp(`^${escapedName}'s Post\\s*[-–]?\\s*`, 'i'), '')
    .trim();
  return cleaned || title;
}
371
-
372
/**
 * Convert a relative date ("3 days ago") found in a SERP item's description
 * or date field into an ISO timestamp. Months are approximated as 30 days.
 *
 * @param {object} item - SERP result (`description`, `date`).
 * @returns {string|null} ISO timestamp, or null when no relative date is found.
 */
function extractDate(item) {
  const haystack = `${item.description || ''} ${item.date || ''}`;
  const found = /(\d+)\s*(hour|day|week|month)s?\s*ago/i.exec(haystack);
  if (!found) return null;

  const [, amountText, unitText] = found;
  const amount = parseInt(amountText);
  const durations = { hour: 3600000, day: 86400000, week: 604800000, month: 2592000000 };
  const unitMs = durations[unitText.toLowerCase()] || 0;

  return new Date(Date.now() - amount * unitMs).toISOString();
}
382
-
383
/**
 * Split an array into consecutive slices of at most `size` elements.
 *
 * @param {Array} arr - Source array (not modified).
 * @param {number} size - Slice length; must be a positive integer.
 * @returns {Array[]} Slices in order; the last one may be shorter.
 * @throws {RangeError} When `size` is not a positive integer. Without this
 *   guard a size of 0 or less would loop forever.
 */
function chunk(arr, size) {
  if (!Number.isInteger(size) || size <= 0) {
    throw new RangeError(`chunk size must be a positive integer, got ${size}`);
  }
  const result = [];
  for (let i = 0; i < arr.length; i += size) result.push(arr.slice(i, i + size));
  return result;
}
388
-
389
/**
 * Load the per-KOL state object from STATE_FILE.
 *
 * @returns {object} The parsed state, or an empty object when the file is
 *   missing, unreadable, not valid JSON, or does not hold a plain JSON object
 *   (e.g. `null` or an array, which would break `state[name]` lookups).
 */
function loadState() {
  try {
    if (fs.existsSync(STATE_FILE)) {
      const parsed = JSON.parse(fs.readFileSync(STATE_FILE, 'utf-8'));
      if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
        return parsed;
      }
    }
  } catch { /* start fresh */ }
  return {};
}
395
-
396
/**
 * Write the per-KOL state to STATE_FILE as pretty-printed JSON, creating the
 * containing directory first when it does not exist.
 *
 * @param {object} state - State object to persist.
 */
function saveState(state) {
  const stateDir = path.dirname(STATE_FILE);
  const dirMissing = !fs.existsSync(stateDir);
  if (dirMissing) {
    fs.mkdirSync(stateDir, { recursive: true });
  }

  const serialized = JSON.stringify(state, null, 2);
  fs.writeFileSync(STATE_FILE, serialized);
}
@@ -1,167 +0,0 @@
1
- import { chromium } from 'playwright';
2
- import fs from 'fs';
3
- import path from 'path';
4
- import crypto from 'crypto';
5
- import { fileURLToPath } from 'url';
6
- import createLogger from '../utils/logger.js';
7
-
8
- const logger = createLogger('LinkedInBrowser');
9
- const __dirname = path.dirname(fileURLToPath(import.meta.url));
10
- const DEFAULT_PROFILE_DIR = path.join(__dirname, '../../data/playwright-profile');
11
-
12
- const sleep = ms => new Promise(r => setTimeout(r, ms));
13
-
14
/**
 * Check that a browser profile directory looks usable.
 *
 * @param {string} profileDir - Path to the persistent browser profile.
 * @returns {string|null} A reason string when the directory or its `Default`
 *   sub-directory is missing; null when both exist.
 */
function validateProfile(profileDir) {
  const hasProfile = fs.existsSync(profileDir);
  if (!hasProfile) {
    return 'browser profile not found';
  }

  const defaultDir = path.join(profileDir, 'Default');
  return fs.existsSync(defaultDir) ? null : 'browser profile is incomplete';
}
19
-
20
/**
 * Scrape recent posts from the configured LinkedIn accounts using a
 * persistent (already logged-in) Chrome profile driven by Playwright.
 *
 * Accounts are visited in random order with a randomised pause between them.
 * A failing account is logged and skipped. The browser context is always closed.
 *
 * @param {object} config - Sources config; reads `config.linkedin_browser`
 *   (`enabled`, `profileDir`, `accounts`, `maxPostsPerAccount`, `maxAgeHours`,
 *   `delayBetweenAccountsMs`).
 * @returns {Promise<object[]>} Collected post records; empty when the source
 *   is disabled or the browser profile is missing/incomplete.
 */
export default async function linkedinBrowserFetch(config) {
  const cfg = config.linkedin_browser;
  if (!cfg?.enabled) return [];

  const profileDir = cfg.profileDir || DEFAULT_PROFILE_DIR;
  const profileError = validateProfile(profileDir);
  if (profileError) {
    logger.warn(`LinkedIn Browser skipped: ${profileError}`);
    logger.warn('Run: npm run setup:twitter (same profile as Twitter)');
    return [];
  }

  // Fisher–Yates shuffle of a copy. sort() with a random comparator is biased
  // and hands the sort an inconsistent ordering function.
  const accounts = [...(cfg.accounts || [])];
  for (let i = accounts.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [accounts[i], accounts[j]] = [accounts[j], accounts[i]];
  }

  const maxPerAccount = cfg.maxPostsPerAccount || 5;
  const maxAgeHours = cfg.maxAgeHours || 48;
  const cutoff = new Date(Date.now() - maxAgeHours * 3600000);
  const delay = cfg.delayBetweenAccountsMs || 10000;

  let context;
  try {
    context = await chromium.launchPersistentContext(profileDir, {
      headless: false,
      channel: 'chrome',
      ignoreDefaultArgs: ['--enable-automation'],
      args: ['--disable-blink-features=AutomationControlled'],
      viewport: { width: 1280, height: 900 },
    });

    const page = context.pages()[0] ?? await context.newPage();
    await sleep(3000);

    const allItems = [];

    for (let i = 0; i < accounts.length; i++) {
      const account = accounts[i];
      try {
        logger.info(`Scraping linkedin.com/in/${account}...`);
        const posts = await scrapeAccount(page, account, maxPerAccount, cutoff);
        allItems.push(...posts);
        logger.debug(`  → ${posts.length} posts from ${account}`);
      } catch (err) {
        logger.error(`Failed ${account}: ${err.message}`);
      }

      // Pause between accounts (configured delay plus up to 5 s of jitter)
      if (i < accounts.length - 1) {
        const wait = delay + Math.random() * 5000;
        logger.debug(`  Waiting ${Math.round(wait / 1000)}s...`);
        await sleep(wait);
      }
    }

    logger.success(`Fetched ${allItems.length} posts from ${accounts.length} LinkedIn accounts`);
    return allItems;
  } finally {
    if (context) await context.close();
  }
}
77
-
78
/**
 * Scrape the recent-activity feed of one LinkedIn profile.
 *
 * @param {object} page - Playwright page to navigate.
 * @param {string} slug - Profile slug as used in linkedin.com/in/<slug>.
 * @param {number} limit - Maximum number of feed entries to read.
 * @param {Date} cutoff - Posts with a parsed date older than this are dropped;
 *   posts whose age cannot be parsed are kept and dated "now".
 * @returns {Promise<object[]>} Normalised post records.
 */
async function scrapeAccount(page, slug, limit, cutoff) {
  const url = `https://www.linkedin.com/in/${slug}/recent-activity/all/`;
  await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });

  // Wait for posts feed
  await page.waitForSelector('div[data-urn]', { timeout: 15000 });
  await sleep(2000);

  // Scroll to load more posts
  for (let i = 0; i < 3; i++) {
    await page.mouse.wheel(0, 500 + Math.random() * 300);
    await sleep(700 + Math.random() * 400);
  }

  // Runs in the browser: must not reference anything from this module's scope.
  const rawPosts = await page.evaluate((limit) => {
    const containers = [...document.querySelectorAll('div[data-urn^="urn:li:activity"]')]
      .slice(0, limit);

    return containers.map(el => {
      const urn = el.getAttribute('data-urn');

      // Post text
      const textEl = el.querySelector('.update-components-text, [class*="commentary"]');
      const text = textEl?.innerText?.trim() || '';

      // Construct post URL directly from urn
      const link = urn ? `https://www.linkedin.com/feed/update/${urn}/` : '';

      // Time ago — take first segment before " •" or newline
      const timeEl = el.querySelector('.update-components-actor__sub-description');
      const timeAgo = timeEl?.innerText?.trim().split(/\s*[•\n]/)[0].trim() || '';

      // Reactions count
      const reactionsEl = el.querySelector('.social-details-social-counts__reactions-count');
      const reactions = parseInt(reactionsEl?.innerText?.replace(/[^0-9]/g, '') || '0', 10);

      // Comments count — accept thousands separators so "1,234 comments" is not read as 234
      const countsEl = el.querySelector('[class*="social-counts"]');
      const commentsMatch = countsEl?.innerText?.match(/(\d[\d,]*)\s+comment/);
      const comments = commentsMatch ? parseInt(commentsMatch[1].replace(/,/g, ''), 10) : 0;

      return { text, link, timeAgo, reactions, comments };
    });
  }, limit);

  return rawPosts
    .filter(p => p.link && p.text)
    // Parse the relative age once and carry it along for both the filter and the record
    .map(p => ({ p, parsed: parseTimeAgo(p.timeAgo) }))
    .filter(({ parsed }) => !parsed || parsed >= cutoff)
    .map(({ p, parsed }) => {
      const pubDate = parsed || new Date();
      return {
        id: crypto.createHash('md5').update(p.link).digest('hex'),
        source: 'linkedin_browser',
        sourceName: slug,
        category: 'linkedin',
        title: p.text.substring(0, 100) + (p.text.length > 100 ? '…' : ''),
        link: p.link,
        url: p.link,
        content: p.text,
        summary: p.text.substring(0, 200),
        author: slug,
        pubDate: pubDate.toISOString(),
        scraped_at: new Date().toISOString(),
        age_hours: Math.floor((Date.now() - pubDate.getTime()) / 3600000),
        tags: [],
        engagement: {
          upvotes: p.reactions,
          comments: p.comments,
        },
        metadata: {
          score: p.reactions,
          timeAgo: p.timeAgo,
        },
      };
    });
}
158
-
159
/**
 * Convert a short relative-age label ("5m", "3h", "2d", "1w", "2mo", "1yr")
 * into an absolute Date. Months count as 30 days and years as 365 days.
 *
 * Multi-letter units are listed before the single letters in the pattern;
 * otherwise "2mo" is consumed as "2m" and read as two minutes.
 *
 * @param {string} str - Relative-age label.
 * @returns {Date|null} Approximate timestamp, or null when `str` is empty or
 *   contains no recognised "<number><unit>" pair.
 */
function parseTimeAgo(str) {
  if (!str) return null;
  const m = str.match(/(\d+)\s*(mo|yr|s|m|h|d|w)/i);
  if (!m) return null;
  const n = parseInt(m[1], 10);
  const unit = m[2].toLowerCase();
  const ms = {
    s: 1000,
    m: 60000,
    h: 3600000,
    d: 86400000,
    w: 604800000,
    mo: 2592000000,
    yr: 31536000000,
  }[unit] || 0;
  return new Date(Date.now() - n * ms);
}
@@ -1,194 +0,0 @@
1
- import { chromium } from 'playwright';
2
- import fs from 'fs';
3
- import path from 'path';
4
- import crypto from 'crypto';
5
- import { fileURLToPath } from 'url';
6
- import createLogger from '../utils/logger.js';
7
-
8
- const logger = createLogger('TwitterFetcher');
9
- const __dirname = path.dirname(fileURLToPath(import.meta.url));
10
- const DEFAULT_PROFILE_DIR = path.join(__dirname, '../../data/playwright-profile');
11
-
12
- const sleep = ms => new Promise(r => setTimeout(r, ms));
13
-
14
/**
 * Check that a browser profile directory looks usable.
 *
 * @param {string} profileDir - Path to the persistent browser profile.
 * @returns {string|null} A reason string when the directory or its `Default`
 *   sub-directory is missing; null when the profile is valid.
 */
function validateProfile(profileDir) {
  const profileExists = fs.existsSync(profileDir);
  // A valid Chromium profile always contains a Default directory
  const hasDefaultDir = profileExists && fs.existsSync(path.join(profileDir, 'Default'));

  if (!profileExists) return 'browser profile not found';
  if (!hasDefaultDir) return 'browser profile is incomplete or empty';
  return null;
}
24
-
25
/**
 * Scrape recent tweets from the configured X/Twitter accounts using a
 * persistent (already logged-in) Chrome profile driven by Playwright.
 *
 * Accounts are visited in random order with a 20-30 s pause between them.
 * A failing account is logged and skipped. The browser context is always closed.
 *
 * @param {object} config - Sources config; reads `config.trendingSources.twitter`
 *   (`enabled`, `profileDir`, `accounts`, `minLikes`, `maxTweetsPerAccount`,
 *   `maxAgeHours`).
 * @returns {Promise<object[]>} Collected tweet records; empty when the source
 *   is disabled or the browser profile is missing/incomplete.
 */
export default async function twitterFetch(config) {
  const cfg = config.trendingSources?.twitter;
  if (!cfg?.enabled) return [];

  const profileDir = cfg.profileDir || DEFAULT_PROFILE_DIR;
  const profileError = validateProfile(profileDir);
  if (profileError) {
    logger.warn(`Twitter skipped: ${profileError}`);
    logger.warn('Run: npm run setup:twitter');
    return [];
  }

  // Randomise visit order with a Fisher–Yates shuffle of a copy. sort() with a
  // random comparator is biased and hands the sort an inconsistent ordering function.
  const accounts = [...(cfg.accounts || [])];
  for (let i = accounts.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1));
    [accounts[i], accounts[j]] = [accounts[j], accounts[i]];
  }

  const minLikes = cfg.minLikes || 0;
  const maxPerAccount = cfg.maxTweetsPerAccount || 10;
  const maxAgeHours = cfg.maxAgeHours || 24;
  const cutoff = new Date(Date.now() - maxAgeHours * 3600000);

  let context;
  try {
    context = await chromium.launchPersistentContext(profileDir, {
      headless: false,
      channel: 'chrome',
      ignoreDefaultArgs: ['--enable-automation'],
      args: ['--disable-blink-features=AutomationControlled'],
      viewport: { width: 1280, height: 800 },
    });

    const page = context.pages()[0] ?? await context.newPage();
    await sleep(5000);

    // Land on X home first so the search box is available
    await page.goto('https://x.com/home', { waitUntil: 'domcontentloaded', timeout: 20000 });
    await page.waitForSelector('[data-testid="SearchBox_Search_Input"], [aria-label="Search query"]', { timeout: 15000 });
    await sleep(2000);

    const allItems = [];

    for (let i = 0; i < accounts.length; i++) {
      const account = accounts[i];
      try {
        logger.info(`Scraping @${account}...`);
        const tweets = await scrapeAccount(page, account, maxPerAccount, minLikes, cutoff);
        allItems.push(...tweets);
        logger.debug(`  → ${tweets.length} tweets from @${account}`);
      } catch (err) {
        logger.error(`Failed @${account}: ${err.message}`);
      }

      // Rate limit: random 20-30s between accounts
      if (i < accounts.length - 1) {
        const wait = 20000 + Math.random() * 10000;
        logger.debug(`  Waiting ${Math.round(wait / 1000)}s before next account...`);
        await sleep(wait);
      }
    }

    logger.success(`Fetched ${allItems.length} tweets from ${accounts.length} accounts`);
    return allItems;
  } finally {
    if (context) await context.close();
  }
}
89
-
90
/**
 * Open an account's profile by typing its handle into the X search box and
 * clicking a typeahead suggestion.
 *
 * The suggestion whose handle text contains the account name is preferred;
 * if none does, the first suggestion is clicked.
 *
 * @param {object} page - Playwright page, already on a page showing the search box.
 * @param {string} account - Account handle without the leading "@".
 * @throws {Error} When the typeahead shows no user suggestion to click.
 */
async function navigateViaSearch(page, account) {
  const searchBox = '[data-testid="SearchBox_Search_Input"], [aria-label="Search query"]';
  const suggestion = '[data-testid="TypeaheadUser"]';

  // Focus the search box
  await page.click(searchBox);
  await sleep(800 + Math.random() * 400);

  // Type the handle with a per-key delay so it resembles human input
  await page.keyboard.type(account, { delay: 80 + Math.random() * 60 });
  await sleep(1500);

  // Wait for dropdown results
  await page.waitForSelector(suggestion, { timeout: 8000 });

  // Runs in the browser: must not reference anything from this module's scope.
  const clicked = await page.evaluate((wanted) => {
    const candidates = Array.from(document.querySelectorAll('[data-testid="TypeaheadUser"]'));
    const byHandle = candidates.find((el) => {
      const handle = el.querySelector('[tabindex="-1"] span')?.innerText?.toLowerCase() || '';
      return handle.includes(wanted.toLowerCase());
    });

    // Fall back to first result
    const target = byHandle || candidates[0];
    if (!target) return false;
    target.click();
    return true;
  }, account);

  if (!clicked) throw new Error(`No search result found for @${account}`);
  await page.waitForLoadState('domcontentloaded');
  await sleep(1500);
}
121
-
122
/**
 * Scroll the page down in several randomised steps, pausing after each one.
 *
 * @param {object} page - Playwright page.
 * @param {number} [times=3] - Number of scroll steps.
 */
async function scrollAndWait(page, times = 3) {
  let step = 0;
  while (step < times) {
    const distance = 400 + Math.random() * 300;
    await page.mouse.wheel(0, distance);

    const pause = 600 + Math.random() * 400;
    await sleep(pause);

    step += 1;
  }
}
128
-
129
/**
 * Open one account's profile through search and collect its visible tweets.
 *
 * @param {object} page - Playwright page.
 * @param {string} account - Account handle without the leading "@".
 * @param {number} limit - Maximum number of tweet articles to read.
 * @param {number} minLikes - Tweets with fewer likes are dropped.
 * @param {Date} cutoff - Tweets with a timestamp older than this are dropped;
 *   tweets without a timestamp are kept.
 * @returns {Promise<object[]>} Normalised tweet records.
 */
async function scrapeAccount(page, account, limit, minLikes, cutoff) {
  await navigateViaSearch(page, account);
  await page.waitForSelector('article[data-testid="tweet"]', { timeout: 15000 });
  await sleep(1500);

  // Scroll to load more tweets naturally
  await scrollAndWait(page, 3);

  // Runs in the browser: must not reference anything from this module's scope.
  const rawTweets = await page.evaluate((limit) => {
    const articles = [...document.querySelectorAll('article[data-testid="tweet"]')].slice(0, limit);

    return articles.map((article) => {
      // Counts come from the first number in the button's aria-label
      const countOf = (testId) => {
        const button = article.querySelector(`[data-testid="${testId}"]`);
        const label = button?.getAttribute('aria-label') || '';
        const digits = label.match(/(\d[\d,]*)/);
        return digits ? parseInt(digits[1].replace(/,/g, ''), 10) : 0;
      };

      const timeEl = article.querySelector('time');
      return {
        text: article.querySelector('[data-testid="tweetText"]')?.innerText?.trim() || '',
        link: timeEl?.closest('a')?.href || '',
        date: timeEl?.getAttribute('datetime') || '',
        likes: countOf('like'),
        replies: countOf('reply'),
        retweets: countOf('retweet'),
      };
    });
  }, limit);

  const items = [];
  for (const tweet of rawTweets) {
    const { text, link, date, likes, replies, retweets } = tweet;

    const recentEnough = !date || new Date(date) >= cutoff;
    if (!(link && text && likes >= minLikes && recentEnough)) continue;

    const ageHours = date
      ? Math.floor((Date.now() - new Date(date).getTime()) / 3600000)
      : 0;

    items.push({
      id: crypto.createHash('md5').update(link).digest('hex'),
      source: 'twitter',
      sourceName: `@${account}`,
      category: 'social',
      title: text.substring(0, 100) + (text.length > 100 ? '…' : ''),
      link,
      url: link,
      content: text,
      summary: text.substring(0, 200),
      author: account,
      pubDate: date || new Date().toISOString(),
      scraped_at: new Date().toISOString(),
      age_hours: ageHours,
      tags: [],
      engagement: {
        upvotes: likes,
        comments: replies,
        retweets,
      },
      metadata: {
        score: likes,
      },
    });
  }
  return items;
}