@aikeytake/social-automation 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,400 @@
1
+ import axios from 'axios';
2
+ import fs from 'fs';
3
+ import path from 'path';
4
+ import { fileURLToPath } from 'url';
5
+ import crypto from 'crypto';
6
+ import createLogger from '../utils/logger.js';
7
+
8
+ const logger = createLogger('LinkedInFetcher');
9
+
10
+ const BRIGHTDATA_API_URL = 'https://api.brightdata.com/request';
11
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
12
+ const STATE_FILE = path.join(__dirname, '../../data/kol-state.json');
13
+
14
// Defaults (overridable via config/sources.json linkedin section)
const DEFAULTS = {
  batchSize: 8, // KOLs per SERP query
  budgetPerRun: 25, // Max SERP API calls per run (25 × 8 = 200 KOLs per run)
  checkIntervalHours: 24, // Re-check each KOL every 24h
  timeRange: 'w', // w=week, d=day, m=month (passed to Google as tbs=qdr:<value>)
  resultsPerBatch: 10, // Google results per batch query
  enrichContent: true, // Scrape each post URL for full content + engagement
  enrichConcurrency: 5, // Parallel enrichment requests
};
24
+
25
/**
 * Fetch recent LinkedIn posts for configured KOLs via batched Bright Data
 * SERP queries.
 *
 * Budgeted crawl: only KOLs not checked within `checkIntervalHours` are due,
 * capped at `budgetPerRun × batchSize` per run. Per-KOL state (lastChecked,
 * seen post IDs) is persisted across runs in data/kol-state.json.
 *
 * @param {object} config - full app config; reads config.linkedin
 * @returns {Promise<object[]>} new (unseen) normalized posts; empty array when
 *   disabled, unconfigured, or nothing is due
 */
export default async function linkedinFetch(config) {
  if (!config.linkedin?.enabled) return [];

  const BRIGHTDATA_API_KEY = process.env.BRIGHTDATA_API_KEY;
  const BRIGHTDATA_ZONE = process.env.BRIGHTDATA_ZONE || 'mcp_unlocker';

  if (!BRIGHTDATA_API_KEY) {
    logger.warn('BRIGHTDATA_API_KEY not set, skipping LinkedIn scraping');
    return [];
  }

  const profilesFile = config.linkedin.profilesFile;
  if (!fs.existsSync(profilesFile)) {
    logger.error(`LinkedIn profiles file not found: ${profilesFile}`);
    return [];
  }

  let profiles;
  try {
    profiles = JSON.parse(fs.readFileSync(profilesFile, 'utf-8'));
  } catch (err) {
    logger.error(`Failed to parse LinkedIn profiles file: ${err.message}`);
    return [];
  }

  const cfg = { ...DEFAULTS, ...config.linkedin };
  const state = loadState();
  const now = new Date();
  const cutoffMs = cfg.checkIntervalHours * 3600 * 1000;

  // Select only KOLs not checked recently
  const dueKols = profiles.filter(p => {
    const last = state[p.name]?.lastChecked;
    return !last || (now - new Date(last)) >= cutoffMs;
  });

  // Hard cap per run so SERP spend stays within budgetPerRun queries
  const maxKols = cfg.budgetPerRun * cfg.batchSize;
  const selectedKols = dueKols.slice(0, maxKols);

  if (selectedKols.length === 0) {
    logger.info(`LinkedIn: all ${profiles.length} KOLs recently checked, nothing due`);
    return [];
  }

  const numBatches = Math.ceil(selectedKols.length / cfg.batchSize);
  logger.info(`LinkedIn: checking ${selectedKols.length}/${profiles.length} KOLs in ${numBatches} batches...`);

  const allPosts = [];
  const batches = chunk(selectedKols, cfg.batchSize);

  for (const batch of batches) {
    try {
      let posts = await fetchBatchWithRetry(batch, state, cfg, BRIGHTDATA_API_KEY, BRIGHTDATA_ZONE);

      // Enrich posts with full content + engagement by scraping each post URL
      if (cfg.enrichContent && posts.length > 0) {
        posts = await enrichPosts(posts, cfg, BRIGHTDATA_API_KEY, BRIGHTDATA_ZONE);
      }

      allPosts.push(...posts);

      // Update state: mark all KOLs in batch as checked, record seen post IDs.
      // Note: a batch that throws above is NOT marked checked, so it stays due
      // for the next run.
      for (const kol of batch) {
        if (!state[kol.name]) state[kol.name] = { seenPostIds: [] };
        state[kol.name].lastChecked = now.toISOString();
        const newIds = posts.filter(p => p.sourceName === kol.name).map(p => p.id);
        state[kol.name].seenPostIds = [
          ...new Set([...newIds, ...(state[kol.name].seenPostIds || [])]),
        ].slice(0, 100); // keep last 100 seen post IDs per KOL
      }

      // Small pause between SERP batches to avoid hammering the proxy
      await new Promise(r => setTimeout(r, 600));
    } catch (err) {
      logger.error(`Batch failed: ${err.message}`);
    }
  }

  saveState(state);

  const checked = selectedKols.length;
  const remaining = dueKols.length - checked;
  logger.success(`LinkedIn: ${allPosts.length} new posts (${checked} KOLs checked, ${remaining} still due)`);
  return allPosts;
}
109
+
110
/**
 * Run fetchBatch, retrying once (by default) on timeout-style failures.
 * Any other error — or a timeout with no retries left — propagates.
 */
async function fetchBatchWithRetry(batch, state, cfg, apiKey, zone, retries = 1) {
  let attemptsLeft = retries;
  for (;;) {
    try {
      return await fetchBatch(batch, state, cfg, apiKey, zone);
    } catch (err) {
      const isTimeout = err.code === 'ECONNABORTED' || err.message?.includes('timeout');
      if (!isTimeout || attemptsLeft <= 0) throw err;
      logger.warn(`Batch timed out, retrying... (${attemptsLeft} left)`);
      await new Promise(r => setTimeout(r, 2000));
      attemptsLeft -= 1;
    }
  }
}
122
+
123
/**
 * Run one batched SERP query for a group of KOLs and normalize the hits.
 *
 * Builds a single Google query ORing all KOL names, restricted to
 * site:linkedin.com/posts, sends it through the Bright Data proxy, then maps
 * each organic result back to a KOL and into the shared post shape.
 *
 * @param {Array<{name: string, role?: string}>} batch - KOLs covered by this query
 * @param {object} state - per-KOL state; seenPostIds is read here for dedupe
 * @param {object} cfg - merged DEFAULTS + config.linkedin
 * @param {string} apiKey - Bright Data API key
 * @param {string} zone - Bright Data zone name
 * @returns {Promise<object[]>} new (unseen) posts attributed to batch KOLs
 */
async function fetchBatch(batch, state, cfg, apiKey, zone) {
  // Batch multiple KOL names into one SERP query
  const nameList = batch.map(k => `"${k.name}"`).join(' OR ');
  const searchQuery = `site:linkedin.com/posts (${nameList})`;
  const googleUrl = [
    'https://www.google.com/search',
    `?q=${encodeURIComponent(searchQuery)}`,
    `&num=${cfg.resultsPerBatch}`,
    `&tbs=qdr:${cfg.timeRange}`, // time filter: recent posts only
    '&brd_json=1', // ask Bright Data for parsed JSON instead of raw HTML
  ].join('');

  const response = await axios.post(
    BRIGHTDATA_API_URL,
    { zone, url: googleUrl, format: 'raw', data_format: 'parsed_light' },
    {
      headers: { Authorization: `Bearer ${apiKey}`, 'Content-Type': 'application/json' },
      timeout: 40000,
    }
  );

  const organicResults = response.data?.organic || [];
  const posts = [];

  for (const item of organicResults) {
    if (!item.link?.includes('linkedin.com/posts')) continue;

    // Match result back to a specific KOL from this batch
    const kol = matchKol(item, batch);
    if (!kol) continue;

    // Skip boilerplate / profile-bio-only snippets — these have no post content
    const rawContent = item.description || item.snippet || '';
    if (!isUsefulContent(rawContent)) continue;

    // Stable id derived from the post URL; also keys the seen-post dedupe
    const id = crypto.createHash('md5').update(item.link).digest('hex');

    // Skip posts we've already seen for this KOL
    if (state[kol.name]?.seenPostIds?.includes(id)) continue;

    posts.push({
      id,
      source: 'linkedin',
      sourceName: kol.name,
      category: 'linkedin-kol',
      title: cleanTitle(item.title || '', kol.name),
      link: item.link,
      url: item.link,
      content: cleanContent(rawContent),
      summary: cleanContent(rawContent).substring(0, 200),
      author: kol.name,
      role: kol.role || '',
      pubDate: extractDate(item) || new Date().toISOString(),
      scraped_at: new Date().toISOString(),
      age_hours: 0,
      engagement: { upvotes: 0, comments: 0 }, // filled in by enrichPost when enrichment is enabled
      metadata: { score: 0 },
    });
  }

  return posts;
}
185
+
186
/**
 * Scrape each post URL for full content + engagement, running at most
 * `enrichConcurrency` requests in parallel.
 */
async function enrichPosts(posts, cfg, apiKey, zone) {
  const limit = cfg.enrichConcurrency || 5;
  const out = [];
  let offset = 0;

  while (offset < posts.length) {
    const slice = posts.slice(offset, offset + limit);
    const settled = await Promise.all(slice.map(p => enrichPost(p, apiKey, zone)));
    out.push(...settled);
    offset += limit;
  }

  return out;
}
199
+
200
/**
 * Scrape one post URL (rendered to markdown via Bright Data) and fill in the
 * post's full content, engagement counts, and a more accurate pubDate.
 *
 * Best-effort: on any failure the post is returned unchanged, keeping the
 * SERP snippet. Mutates and returns the same post object.
 */
async function enrichPost(post, apiKey, zone) {
  try {
    const response = await axios.post(
      BRIGHTDATA_API_URL,
      { zone, url: post.url, format: 'raw', data_format: 'markdown' },
      {
        headers: { Authorization: `Bearer ${apiKey}`, 'Content-Type': 'application/json' },
        timeout: 40000,
      }
    );

    // Response may be a string or structured data; normalize to text first
    const markdown = typeof response.data === 'string' ? response.data : JSON.stringify(response.data);
    const fullContent = extractPostContent(markdown);
    const engagement = extractEngagement(markdown);
    const pubDate = extractDateFromPage(markdown) || post.pubDate;

    // Only overwrite the snippet when real post text was extracted
    if (fullContent) {
      post.content = fullContent;
      post.summary = fullContent.substring(0, 200);
    }
    post.engagement = engagement;
    post.pubDate = pubDate;
  } catch (err) {
    // Best-effort — keep SERP snippet if scraping fails
    logger.debug(`Enrich failed for ${post.sourceName}: ${err.message}`);
  }
  return post;
}
228
+
229
// Extract full post content from LinkedIn page markdown.
// LinkedIn page structure (logged-out view):
//   [Author] [Job] [N followers]
//   [Nd/w/h ago] •
//   [Full post text here]
//   Like Comment Repost Send
//   [N reactions] • [N comments]
//
// Returns the post body text, or null when no date marker is found or the
// extracted text is too short to be real content.
function extractPostContent(markdown) {
  const lines = markdown.split('\n');
  let start = -1;
  let end = lines.length;

  // Find the line after the relative-date indicator (e.g. "3h •", "3d •",
  // "2w •", "1mo •"). "mo" is tried before the single-letter units so months
  // parse correctly — the previous [dwm] class missed "h" entirely and could
  // not match "mo" followed by the bullet.
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i].trim();
    if (/^\d+(?:mo|[hdwm])\s*[•·]/.test(line) || /\d+\s+(hour|day|week|month)s?\s+ago/i.test(line)) {
      start = i + 1;
      break;
    }
  }

  if (start === -1) return null;

  // Find where content ends: engagement buttons, reaction counts, or the
  // logged-out login-wall text.
  for (let i = start; i < lines.length; i++) {
    const line = lines[i].trim().toLowerCase();
    if (
      line === 'like comment repost send' ||
      (line.startsWith('like') && line.includes('comment')) ||
      /^\d[\d,]*\s*reaction/.test(line) ||
      line === 'reactions' ||
      line.includes('sign in') ||
      line.includes('join now')
    ) {
      end = i;
      break;
    }
  }

  const content = lines
    .slice(start, end)
    .join('\n')
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1') // collapse markdown links to text
    .replace(/^#+\s/gm, '') // strip heading markers
    .replace(/\n{3,}/g, '\n\n')
    .trim();

  // Anything this short is almost certainly leftover page chrome, not a post
  return content.length > 30 ? content : null;
}
278
+
279
// Extract like/comment/repost counts from LinkedIn page markdown.
// Counts may carry thousands separators ("1,234 reactions"); missing counts
// default to 0. parseInt is given an explicit base-10 radix.
function extractEngagement(markdown) {
  // Helper: first capture group of `pattern`, commas stripped, as an integer
  const parse = (pattern) => {
    const m = markdown.match(pattern);
    return m ? parseInt(m[1].replace(/,/g, ''), 10) : 0;
  };
  return {
    upvotes: parse(/(\d[\d,]*)\s*reaction/i),
    comments: parse(/(\d[\d,]*)\s*comment/i),
    reposts: parse(/(\d[\d,]*)\s*repost/i),
  };
}
291
+
292
// Derive an ISO timestamp from relative-time text on the LinkedIn page
// itself — either the long form ("3 days ago") or the compact form
// ("3d •"). More accurate than the Google snippet. Returns null when no
// recognizable relative time is present.
function extractDateFromPage(markdown) {
  const match =
    markdown.match(/(\d+)\s*(hour|day|week|month)s?\s*ago/i) ||
    markdown.match(/(\d+)(h|d|w|mo)\s*[•·]/);
  if (!match) return null;

  const count = parseInt(match[1], 10);
  const abbrev = { h: 'hour', d: 'day', w: 'week', mo: 'month' };
  const rawUnit = match[2].toLowerCase();
  const unit = abbrev[rawUnit] || rawUnit;
  const unitMs = { hour: 3600000, day: 86400000, week: 604800000, month: 2592000000 }[unit] || 0;
  if (!unitMs) return null;
  return new Date(Date.now() - count * unitMs).toISOString();
}
303
+
304
// Attribute a search result to one of this batch's KOLs.
// Two signals, in order of reliability:
//   1. the post URL's username contains the KOL's first AND last name;
//   2. the title uses LinkedIn's "[Name]'s Post" own-post format.
// Results that merely mention a KOL satisfy neither signal and are rejected.
function matchKol(item, batch) {
  // linkedin.com/posts/USERNAME_post-title-activity-ID → USERNAME
  const usernameMatch = (item.link || '').match(/linkedin\.com\/posts\/([^_/]+)/);
  const urlUsername = usernameMatch?.[1]?.toLowerCase() || '';
  const titleAuthor = (item.title || '').match(/^(.+?)'s Post/i)?.[1]?.toLowerCase().trim() || '';

  for (const kol of batch) {
    const nameParts = kol.name.toLowerCase().replace(/[^a-z0-9 ]/g, '').trim().split(' ').filter(Boolean);
    if (nameParts.length < 2) continue;

    const first = nameParts[0];
    const last = nameParts[nameParts.length - 1];
    const containsBoth = (s) => s.includes(first) && s.includes(last);

    // Signal 1: URL username contains both name parts → reliable authorship
    if (urlUsername && containsBoth(urlUsername)) return kol;
    // Signal 2: "Name's Post" title matches this KOL
    if (titleAuthor && containsBoth(titleAuthor)) return kol;
  }

  return null;
}
328
+
329
// Pull the author username out of a LinkedIn post URL of the form
// linkedin.com/posts/USERNAME_post-title-activity-ID.
// Returns '' when the URL does not match.
function extractUrlUsername(url) {
  const match = /linkedin\.com\/posts\/([^_/]+)/.exec(url);
  if (!match) return '';
  return match[1].toLowerCase();
}
335
+
336
// Normalize a KOL name for matching: lowercase, keep only letters, digits
// and spaces, and trim surrounding whitespace.
function sanitizeName(name) {
  const lowered = name.toLowerCase();
  return lowered.replace(/[^a-z0-9 ]/g, '').trim();
}
339
+
340
// Returns false for snippets that are too short or look like LinkedIn
// profile-bio / UI boilerplate rather than actual post content.
function isUsefulContent(content) {
  if (!content || content.length < 40) return false;

  const lower = content.toLowerCase();
  // Common boilerplate patterns from LinkedIn search results
  const markers = [
    'view profile for',
    'report this comment',
    'close menu',
    'like · reply',
    '1 reaction',
  ];

  for (const marker of markers) {
    if (lower.includes(marker)) return false;
  }
  return true;
}
354
+
355
// Strip LinkedIn UI chrome ("...Read more" suffix, "View profile for …"
// banners) from a snippet and collapse all whitespace runs to single spaces.
function cleanContent(text) {
  const withoutReadMore = text.replace(/\.\.\.Read more$/i, '');
  const withoutProfile = withoutReadMore.replace(/View profile for [^.]+\./gi, '');
  return withoutProfile.replace(/\s+/g, ' ').trim();
}
363
+
364
// Remove "Name's Post" boilerplate from title.
// The author name is escaped before being embedded in the RegExp — names
// containing regex metacharacters (e.g. "A (B) C", "J. Doe") previously
// produced a broken pattern or a RegExp SyntaxError.
function cleanTitle(title, authorName) {
  const escapedName = authorName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const cleaned = title
    .replace(new RegExp(`^${escapedName}'s Post\\s*[-–]?\\s*`, 'i'), '')
    .trim();
  // Fall back to the original title if stripping removed everything
  return cleaned || title;
}
371
+
372
// Parse relative "N <unit> ago" phrases from Google snippets into ISO dates.
// Returns null when no relative-date phrase is present.
function extractDate(item) {
  const haystack = `${item.description || ''} ${item.date || ''}`;
  const match = haystack.match(/(\d+)\s*(hour|day|week|month)s?\s*ago/i);
  if (!match) return null;

  const UNIT_MS = { hour: 3600000, day: 86400000, week: 604800000, month: 2592000000 };
  const amount = parseInt(match[1], 10);
  const unitMs = UNIT_MS[match[2].toLowerCase()] || 0;
  return new Date(Date.now() - amount * unitMs).toISOString();
}
382
+
383
// Split an array into consecutive slices of at most `size` elements.
function chunk(arr, size) {
  const out = [];
  let offset = 0;
  while (offset < arr.length) {
    out.push(arr.slice(offset, offset + size));
    offset += size;
  }
  return out;
}
388
+
389
// Read the persisted per-KOL state from disk. A missing, unreadable, or
// corrupted state file simply yields a fresh empty state.
function loadState() {
  try {
    if (fs.existsSync(STATE_FILE)) {
      return JSON.parse(fs.readFileSync(STATE_FILE, 'utf-8'));
    }
  } catch {
    // corrupted or unreadable state file — start fresh
  }
  return {};
}
395
+
396
// Persist the per-KOL state as pretty-printed JSON, creating the data
// directory on first run (recursive mkdir is a no-op when it exists).
function saveState(state) {
  fs.mkdirSync(path.dirname(STATE_FILE), { recursive: true });
  fs.writeFileSync(STATE_FILE, JSON.stringify(state, null, 2));
}
@@ -0,0 +1,167 @@
1
+ import { chromium } from 'playwright';
2
+ import fs from 'fs';
3
+ import path from 'path';
4
+ import crypto from 'crypto';
5
+ import { fileURLToPath } from 'url';
6
+ import createLogger from '../utils/logger.js';
7
+
8
+ const logger = createLogger('LinkedInBrowser');
9
+ const __dirname = path.dirname(fileURLToPath(import.meta.url));
10
+ const DEFAULT_PROFILE_DIR = path.join(__dirname, '../../data/playwright-profile');
11
+
12
+ const sleep = ms => new Promise(r => setTimeout(r, ms));
13
+
14
// Sanity-check a persistent Chrome profile directory. Chrome creates a
// "Default" subdirectory on first launch, so its absence means the profile
// was never initialised. Returns an error string, or null when usable.
function validateProfile(profileDir) {
  if (!fs.existsSync(profileDir)) return 'browser profile not found';
  const defaultDir = path.join(profileDir, 'Default');
  return fs.existsSync(defaultDir) ? null : 'browser profile is incomplete';
}
19
+
20
/**
 * Scrape recent posts from configured LinkedIn accounts using a real,
 * persistent Chrome profile (shared with the Twitter setup) via Playwright.
 *
 * Accounts are shuffled each run and visited sequentially with randomized
 * delays to reduce detection risk. Per-account failures are logged and
 * skipped. The browser context is always closed, even on error.
 *
 * @param {object} config - full app config; reads config.linkedin_browser
 * @returns {Promise<object[]>} normalized post items; empty when disabled or
 *   the browser profile is missing/incomplete
 */
export default async function linkedinBrowserFetch(config) {
  const cfg = config.linkedin_browser;
  if (!cfg?.enabled) return [];

  const profileDir = cfg.profileDir || DEFAULT_PROFILE_DIR;
  const profileError = validateProfile(profileDir);
  if (profileError) {
    logger.warn(`LinkedIn Browser skipped: ${profileError}`);
    logger.warn('Run: npm run setup:twitter (same profile as Twitter)');
    return [];
  }

  // Shuffle the visit order each run to avoid a predictable crawl pattern
  const accounts = [...(cfg.accounts || [])].sort(() => Math.random() - 0.5);
  const maxPerAccount = cfg.maxPostsPerAccount || 5;
  const maxAgeHours = cfg.maxAgeHours || 48;
  const cutoff = new Date(Date.now() - maxAgeHours * 3600000);
  const delay = cfg.delayBetweenAccountsMs || 10000;

  let context;
  try {
    // headless:false + automation flags disabled to present a human-like session
    context = await chromium.launchPersistentContext(profileDir, {
      headless: false,
      channel: 'chrome',
      ignoreDefaultArgs: ['--enable-automation'],
      args: ['--disable-blink-features=AutomationControlled'],
      viewport: { width: 1280, height: 900 },
    });

    const page = context.pages()[0] ?? await context.newPage();
    await sleep(3000);

    const allItems = [];

    for (let i = 0; i < accounts.length; i++) {
      const account = accounts[i];
      try {
        logger.info(`Scraping linkedin.com/in/${account}...`);
        const posts = await scrapeAccount(page, account, maxPerAccount, cutoff);
        allItems.push(...posts);
        logger.debug(` → ${posts.length} posts from ${account}`);
      } catch (err) {
        logger.error(`Failed ${account}: ${err.message}`);
      }

      // Randomized pause between accounts (skipped after the last one)
      if (i < accounts.length - 1) {
        const wait = delay + Math.random() * 5000;
        logger.debug(` Waiting ${Math.round(wait / 1000)}s...`);
        await sleep(wait);
      }
    }

    logger.success(`Fetched ${allItems.length} posts from ${accounts.length} LinkedIn accounts`);
    return allItems;
  } finally {
    if (context) await context.close();
  }
}
77
+
78
/**
 * Scrape up to `limit` recent posts from one LinkedIn profile's activity feed.
 *
 * Navigates to /in/<slug>/recent-activity/all/, scrolls a few times to load
 * more posts, extracts text and engagement inside the page context, then
 * filters by `cutoff` and maps into the shared post shape.
 *
 * @param {object} page - already-open Playwright page (logged-in profile)
 * @param {string} slug - profile slug (linkedin.com/in/<slug>)
 * @param {number} limit - maximum posts to extract
 * @param {Date} cutoff - drop posts older than this; unknown ages are kept
 * @returns {Promise<object[]>} normalized post items
 */
async function scrapeAccount(page, slug, limit, cutoff) {
  const url = `https://www.linkedin.com/in/${slug}/recent-activity/all/`;
  await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });

  // Wait for posts feed
  await page.waitForSelector('div[data-urn]', { timeout: 15000 });
  await sleep(2000);

  // Scroll to load more posts (randomized wheel deltas/pauses to look human)
  for (let i = 0; i < 3; i++) {
    await page.mouse.wheel(0, 500 + Math.random() * 300);
    await sleep(700 + Math.random() * 400);
  }

  // Runs in the browser context — only serializable data crosses back
  const rawPosts = await page.evaluate((limit) => {
    const containers = [...document.querySelectorAll('div[data-urn^="urn:li:activity"]')]
      .slice(0, limit);

    return containers.map(el => {
      const urn = el.getAttribute('data-urn');

      // Post text
      const textEl = el.querySelector('.update-components-text, [class*="commentary"]');
      const text = textEl?.innerText?.trim() || '';

      // Construct post URL directly from urn
      const link = urn ? `https://www.linkedin.com/feed/update/${urn}/` : '';

      // Time ago — take first segment before " •" or newline
      const timeEl = el.querySelector('.update-components-actor__sub-description');
      const timeAgo = timeEl?.innerText?.trim().split(/\s*[•\n]/)[0].trim() || '';

      // Reactions count
      const reactionsEl = el.querySelector('.social-details-social-counts__reactions-count');
      const reactions = parseInt(reactionsEl?.innerText?.replace(/[^0-9]/g, '') || '0', 10);

      // Comments count — parse from social counts block
      const countsEl = el.querySelector('[class*="social-counts"]');
      const commentsMatch = countsEl?.innerText?.match(/(\d+)\s+comment/);
      const comments = commentsMatch ? parseInt(commentsMatch[1]) : 0;

      return { text, link, timeAgo, reactions, comments };
    });
  }, limit);

  return rawPosts
    .filter(p => p.link && p.text)
    .filter(p => {
      // Keep posts with unknown age; drop those confidently older than cutoff
      if (!p.timeAgo) return true;
      const pubDate = parseTimeAgo(p.timeAgo);
      return !pubDate || pubDate >= cutoff;
    })
    .map(p => {
      const pubDate = parseTimeAgo(p.timeAgo) || new Date();
      return {
        id: crypto.createHash('md5').update(p.link).digest('hex'),
        source: 'linkedin_browser',
        sourceName: slug,
        category: 'linkedin',
        title: p.text.substring(0, 100) + (p.text.length > 100 ? '…' : ''),
        link: p.link,
        url: p.link,
        content: p.text,
        summary: p.text.substring(0, 200),
        author: slug,
        pubDate: pubDate.toISOString(),
        scraped_at: new Date().toISOString(),
        age_hours: Math.floor((Date.now() - pubDate.getTime()) / 3600000),
        tags: [],
        engagement: {
          upvotes: p.reactions,
          comments: p.comments,
        },
        metadata: {
          score: p.reactions,
          timeAgo: p.timeAgo,
        },
      };
    });
}
158
+
159
// Convert LinkedIn's compact relative-time strings ("30s", "5m", "3h", "2d",
// "1w", "3mo") into a Date. "mo" is placed first in the alternation because
// the regex engine tries alternatives left-to-right — with the old
// (s|m|h|d|w|mo) order, "3mo" matched the "m" branch and months were
// misread as minutes. Returns null for empty/unrecognized input.
function parseTimeAgo(str) {
  if (!str) return null;
  const m = str.match(/(\d+)\s*(mo|s|m|h|d|w)/i);
  if (!m) return null;
  const n = parseInt(m[1], 10);
  const unit = m[2].toLowerCase();
  const ms = { s: 1000, m: 60000, h: 3600000, d: 86400000, w: 604800000, mo: 2592000000 }[unit] || 0;
  return new Date(Date.now() - n * ms);
}
@@ -0,0 +1,77 @@
1
+ import axios from 'axios';
2
+ import createLogger from '../utils/logger.js';
3
+
4
+ const logger = createLogger('RedditFetcher');
5
+
6
/**
 * Fetch hot posts from configured subreddits via Reddit's public JSON API.
 *
 * Drops posts older than `maxAge` hours, below `minScore`, or flagged NSFW.
 * Per-subreddit failures are logged and skipped so one bad subreddit does
 * not abort the run.
 *
 * @param {object} config - full app config; reads config.trendingSources.reddit
 * @returns {Promise<object[]>} normalized post items (empty when disabled)
 */
export default async function redditFetch(config) {
  const redditConfig = config.trendingSources?.reddit;
  if (!redditConfig?.enabled) {
    return [];
  }

  const subreddits = redditConfig.subreddits || [];
  // ?? rather than || so an explicit minScore of 0 is honored
  const minScore = redditConfig.minScore ?? 100;
  const maxAge = parseInt(redditConfig.maxAge, 10) || 24; // hours
  const cutoff = new Date(Date.now() - maxAge * 60 * 60 * 1000);

  logger.info(`Fetching top posts from ${subreddits.length} subreddits...`);

  const allItems = [];

  for (const subreddit of subreddits) {
    try {
      const response = await axios.get(
        `https://www.reddit.com/r/${subreddit}/hot.json?limit=50`,
        { headers: { 'User-Agent': 'AI-Keytake-Scraper/1.0' } }
      );

      const posts = response.data.data.children;

      for (const post of posts) {
        const data = post.data;
        const created = new Date(data.created_utc * 1000);

        // Skip old posts
        if (created < cutoff) continue;

        // Skip low-score posts
        if (data.score < minScore) continue;

        // Skip NSFW
        if (data.over_18) continue;

        allItems.push({
          id: `reddit_${data.id}`,
          source: 'reddit',
          sourceName: `r/${subreddit}`,
          title: data.title,
          content: data.selftext || '',
          summary: (data.selftext || '').substring(0, 200),
          url: `https://reddit.com${data.permalink}`,
          external_url: data.url,
          author: data.author,
          posted_at: created.toISOString(), // reuse `created` rather than rebuilding the Date
          scraped_at: new Date().toISOString(),
          age_hours: Math.floor((Date.now() - created.getTime()) / (1000 * 60 * 60)),
          engagement: {
            upvotes: data.score,
            comments: data.num_comments,
            ratio: data.upvote_ratio
          },
          metadata: {
            score: data.score,
            is_self: data.is_self,
            is_video: data.is_video
          }
        });
      }

      logger.debug(`Fetched from r/${subreddit}`);
    } catch (error) {
      logger.error(`Error fetching r/${subreddit}: ${error.message}`);
    }
  }

  logger.success(`Fetched ${allItems.length} total posts from Reddit`);
  return allItems;
}
@@ -0,0 +1,50 @@
1
+ import Parser from 'rss-parser';
2
+ import crypto from 'crypto';
3
+ import createLogger from '../utils/logger.js';
4
+
5
+ const logger = createLogger('RSSFetcher');
6
+ const parser = new Parser();
7
+
8
/**
 * Fetch recent items from all enabled RSS feeds.
 *
 * Items older than filtering.maxAgeHours (default 48h) are dropped. Items
 * with a missing/unparseable pubDate or no link are also dropped — they
 * previously leaked NaN into age_hours or crashed the md5 id hash.
 * Per-feed failures are logged and skipped.
 *
 * @param {object} config - full app config; reads config.rssFeeds and
 *   config.filtering.maxAgeHours
 * @returns {Promise<object[]>} normalized feed items
 */
export default async function rssFetch(config) {
  const feeds = (config.rssFeeds || []).filter(f => f.enabled);
  logger.info(`Fetching from ${feeds.length} RSS feeds`);

  const allItems = [];
  const maxAge = config.filtering?.maxAgeHours || 48;
  const cutoff = new Date(Date.now() - maxAge * 60 * 60 * 1000);

  for (const feed of feeds) {
    try {
      const parsed = await parser.parseURL(feed.url);

      for (const item of parsed.items) {
        // A link is required for a stable item id
        if (!item.link) continue;

        const pubDate = new Date(item.pubDate);
        // Invalid/missing dates produce NaN timestamps; such items cannot be
        // age-filtered meaningfully, so skip them
        if (Number.isNaN(pubDate.getTime())) continue;

        // Skip old items
        if (pubDate < cutoff) continue;

        allItems.push({
          id: crypto.createHash('md5').update(item.link).digest('hex'),
          source: 'rss',
          sourceName: feed.name,
          category: feed.category,
          title: item.title,
          link: item.link,
          content: item.contentSnippet || item.content || '',
          summary: (item.contentSnippet || item.content || '').substring(0, 200),
          pubDate: item.pubDate,
          author: item.creator || item.author || feed.name,
          scraped_at: new Date().toISOString(),
          age_hours: Math.floor((Date.now() - pubDate.getTime()) / (1000 * 60 * 60))
        });
      }

      logger.debug(`Fetched items from ${feed.name}`);
    } catch (error) {
      logger.error(`Failed to fetch ${feed.name}: ${error.message}`);
    }
  }

  logger.success(`Fetched ${allItems.length} total items`);
  return allItems;
}