@aikeytake/social-automation 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +39 -0
- package/CLAUDE.md +256 -0
- package/CURRENT_CAPABILITIES.md +493 -0
- package/DATA_ORGANIZATION.md +416 -0
- package/IMPLEMENTATION_SUMMARY.md +287 -0
- package/INSTRUCTIONS.md +316 -0
- package/MASTER_PLAN.md +1096 -0
- package/README.md +280 -0
- package/config/sources.json +296 -0
- package/package.json +37 -0
- package/src/cli.js +197 -0
- package/src/fetchers/api.js +232 -0
- package/src/fetchers/hackernews.js +86 -0
- package/src/fetchers/linkedin.js +400 -0
- package/src/fetchers/linkedin_browser.js +167 -0
- package/src/fetchers/reddit.js +77 -0
- package/src/fetchers/rss.js +50 -0
- package/src/fetchers/twitter.js +194 -0
- package/src/index.js +346 -0
- package/src/query.js +316 -0
- package/src/utils/logger.js +74 -0
- package/src/utils/storage.js +134 -0
- package/src/writing-agents/QUICK-REFERENCE.md +111 -0
- package/src/writing-agents/WRITING-SKILLS-IMPROVEMENTS.md +273 -0
- package/src/writing-agents/utils/prompt-templates-improved.js +665 -0
|
@@ -0,0 +1,400 @@
|
|
|
1
|
+
import axios from 'axios';
|
|
2
|
+
import fs from 'fs';
|
|
3
|
+
import path from 'path';
|
|
4
|
+
import { fileURLToPath } from 'url';
|
|
5
|
+
import crypto from 'crypto';
|
|
6
|
+
import createLogger from '../utils/logger.js';
|
|
7
|
+
|
|
8
|
+
const logger = createLogger('LinkedInFetcher');
|
|
9
|
+
|
|
10
|
+
const BRIGHTDATA_API_URL = 'https://api.brightdata.com/request';
|
|
11
|
+
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
12
|
+
const STATE_FILE = path.join(__dirname, '../../data/kol-state.json');
|
|
13
|
+
|
|
14
|
+
// Defaults (overridable via config/sources.json linkedin section)
|
|
15
|
+
const DEFAULTS = {
|
|
16
|
+
batchSize: 8, // KOLs per SERP query
|
|
17
|
+
budgetPerRun: 25, // Max SERP API calls per run (25 × 8 = 200 KOLs per run)
|
|
18
|
+
checkIntervalHours: 24, // Re-check each KOL every 24h
|
|
19
|
+
timeRange: 'w', // w=week, d=day, m=month
|
|
20
|
+
resultsPerBatch: 10, // Google results per batch query
|
|
21
|
+
enrichContent: true, // Scrape each post URL for full content + engagement
|
|
22
|
+
enrichConcurrency: 5, // Parallel enrichment requests
|
|
23
|
+
};
|
|
24
|
+
|
|
25
|
+
// Fetch recent LinkedIn posts for a list of KOL (key opinion leader) profiles
// via Bright Data SERP queries against Google (`site:linkedin.com/posts`).
// Honors a per-run SERP budget (budgetPerRun × batchSize KOLs max) and a
// per-KOL re-check interval, both persisted via loadState()/saveState().
//
// @param {object} config - full app config; reads config.linkedin:
//   enabled, profilesFile (JSON array of {name, role?}), plus optional
//   overrides for any DEFAULTS key (batchSize, budgetPerRun, timeRange, ...).
// @returns {Promise<Array>} normalized post objects; [] when disabled,
//   misconfigured, the API key is absent, or nothing is due.
export default async function linkedinFetch(config) {
  if (!config.linkedin?.enabled) return [];

  const BRIGHTDATA_API_KEY = process.env.BRIGHTDATA_API_KEY;
  const BRIGHTDATA_ZONE = process.env.BRIGHTDATA_ZONE || 'mcp_unlocker';

  // No credentials → soft skip (warn, don't throw) so other fetchers still run.
  if (!BRIGHTDATA_API_KEY) {
    logger.warn('BRIGHTDATA_API_KEY not set, skipping LinkedIn scraping');
    return [];
  }

  const profilesFile = config.linkedin.profilesFile;
  if (!fs.existsSync(profilesFile)) {
    logger.error(`LinkedIn profiles file not found: ${profilesFile}`);
    return [];
  }

  let profiles;
  try {
    profiles = JSON.parse(fs.readFileSync(profilesFile, 'utf-8'));
  } catch (err) {
    logger.error(`Failed to parse LinkedIn profiles file: ${err.message}`);
    return [];
  }

  // Config layering: file-level linkedin section overrides DEFAULTS.
  const cfg = { ...DEFAULTS, ...config.linkedin };
  const state = loadState();
  const now = new Date();
  const cutoffMs = cfg.checkIntervalHours * 3600 * 1000;

  // Select only KOLs not checked recently
  const dueKols = profiles.filter(p => {
    const last = state[p.name]?.lastChecked;
    return !last || (now - new Date(last)) >= cutoffMs;
  });

  // Cap this run at the SERP budget; the rest stay "due" for the next run.
  const maxKols = cfg.budgetPerRun * cfg.batchSize;
  const selectedKols = dueKols.slice(0, maxKols);

  if (selectedKols.length === 0) {
    logger.info(`LinkedIn: all ${profiles.length} KOLs recently checked, nothing due`);
    return [];
  }

  const numBatches = Math.ceil(selectedKols.length / cfg.batchSize);
  logger.info(`LinkedIn: checking ${selectedKols.length}/${profiles.length} KOLs in ${numBatches} batches...`);

  const allPosts = [];
  const batches = chunk(selectedKols, cfg.batchSize);

  for (const batch of batches) {
    try {
      let posts = await fetchBatchWithRetry(batch, state, cfg, BRIGHTDATA_API_KEY, BRIGHTDATA_ZONE);

      // Enrich posts with full content + engagement by scraping each post URL
      if (cfg.enrichContent && posts.length > 0) {
        posts = await enrichPosts(posts, cfg, BRIGHTDATA_API_KEY, BRIGHTDATA_ZONE);
      }

      allPosts.push(...posts);

      // Update state: mark all KOLs in batch as checked, record seen post IDs
      // NOTE: this runs only when the batch succeeds — a failed batch leaves
      // its KOLs "due" so they are retried on the next run.
      for (const kol of batch) {
        if (!state[kol.name]) state[kol.name] = { seenPostIds: [] };
        state[kol.name].lastChecked = now.toISOString();
        const newIds = posts.filter(p => p.sourceName === kol.name).map(p => p.id);
        state[kol.name].seenPostIds = [
          ...new Set([...newIds, ...(state[kol.name].seenPostIds || [])]),
        ].slice(0, 100); // keep last 100 seen post IDs per KOL
      }

      // Small pause between SERP calls to avoid hammering the API.
      await new Promise(r => setTimeout(r, 600));
    } catch (err) {
      logger.error(`Batch failed: ${err.message}`);
    }
  }

  saveState(state);

  const checked = selectedKols.length;
  const remaining = dueKols.length - checked;
  logger.success(`LinkedIn: ${allPosts.length} new posts (${checked} KOLs checked, ${remaining} still due)`);
  return allPosts;
}
|
|
109
|
+
|
|
110
|
+
// Run fetchBatch, retrying once (by default) when the failure looks like a
// timeout — axios ECONNABORTED or a message containing "timeout". Any other
// error, or a timeout after retries are exhausted, propagates to the caller.
async function fetchBatchWithRetry(batch, state, cfg, apiKey, zone, retries = 1) {
  try {
    return await fetchBatch(batch, state, cfg, apiKey, zone);
  } catch (err) {
    const isTimeout = err.code === 'ECONNABORTED' || err.message?.includes('timeout');
    if (!isTimeout || retries <= 0) throw err;

    logger.warn(`Batch timed out, retrying... (${retries} left)`);
    await new Promise((resolve) => setTimeout(resolve, 2000));
    return fetchBatchWithRetry(batch, state, cfg, apiKey, zone, retries - 1);
  }
}
|
|
122
|
+
|
|
123
|
+
// Run one batched SERP query for a group of KOLs and normalize the organic
// results into post objects. One Bright Data request covers the whole batch
// (names OR-ed together), then each result is attributed back to a single KOL
// via matchKol. Deduplicates against state[kol].seenPostIds.
//
// @param {Array<{name, role?}>} batch - KOLs covered by this single query.
// @param {object} state - per-KOL state (read-only here; caller updates it).
// @param {object} cfg - merged DEFAULTS + config.linkedin.
// @param {string} apiKey - Bright Data API key.
// @param {string} zone - Bright Data zone name.
// @returns {Promise<Array>} normalized posts (engagement zeroed; enrichment
//   fills real counts later). Throws on HTTP/timeout errors (caller retries).
async function fetchBatch(batch, state, cfg, apiKey, zone) {
  // Batch multiple KOL names into one SERP query
  const nameList = batch.map(k => `"${k.name}"`).join(' OR ');
  const searchQuery = `site:linkedin.com/posts (${nameList})`;
  const googleUrl = [
    'https://www.google.com/search',
    `?q=${encodeURIComponent(searchQuery)}`,
    `&num=${cfg.resultsPerBatch}`,
    `&tbs=qdr:${cfg.timeRange}`, // time filter: recent posts only
    '&brd_json=1',
  ].join('');

  const response = await axios.post(
    BRIGHTDATA_API_URL,
    { zone, url: googleUrl, format: 'raw', data_format: 'parsed_light' },
    {
      headers: { Authorization: `Bearer ${apiKey}`, 'Content-Type': 'application/json' },
      timeout: 40000,
    }
  );

  const organicResults = response.data?.organic || [];
  const posts = [];

  for (const item of organicResults) {
    if (!item.link?.includes('linkedin.com/posts')) continue;

    // Match result back to a specific KOL from this batch
    const kol = matchKol(item, batch);
    if (!kol) continue;

    // Skip boilerplate / profile-bio-only snippets — these have no post content
    const rawContent = item.description || item.snippet || '';
    if (!isUsefulContent(rawContent)) continue;

    // Post identity = md5 of the canonical post URL (stable across runs).
    const id = crypto.createHash('md5').update(item.link).digest('hex');

    // Skip posts we've already seen for this KOL
    if (state[kol.name]?.seenPostIds?.includes(id)) continue;

    posts.push({
      id,
      source: 'linkedin',
      sourceName: kol.name,
      category: 'linkedin-kol',
      title: cleanTitle(item.title || '', kol.name),
      link: item.link,
      url: item.link,
      content: cleanContent(rawContent),
      summary: cleanContent(rawContent).substring(0, 200),
      author: kol.name,
      role: kol.role || '',
      // SERP snippet date when available; "now" as a fallback so sorting works.
      pubDate: extractDate(item) || new Date().toISOString(),
      scraped_at: new Date().toISOString(),
      age_hours: 0,
      // Placeholder counts — enrichPost overwrites these with real values.
      engagement: { upvotes: 0, comments: 0 },
      metadata: { score: 0 },
    });
  }

  return posts;
}
|
|
185
|
+
|
|
186
|
+
// Scrape each post URL for full content + engagement, with concurrency limit
|
|
187
|
+
// Enrich every post by scraping its LinkedIn URL, running at most
// cfg.enrichConcurrency (default 5) requests in parallel per wave.
// Individual failures are absorbed inside enrichPost, so the wave never rejects.
async function enrichPosts(posts, cfg, apiKey, zone) {
  const waveSize = cfg.enrichConcurrency || 5;
  const enriched = [];

  for (let offset = 0; offset < posts.length; offset += waveSize) {
    const wave = posts.slice(offset, offset + waveSize);
    const settled = await Promise.all(wave.map((post) => enrichPost(post, apiKey, zone)));
    enriched.push(...settled);
  }

  return enriched;
}
|
|
199
|
+
|
|
200
|
+
// Scrape a single post URL (as markdown via Bright Data) and mutate the post
// in place with the full text, real engagement counts, and a page-derived
// date. Best-effort: on any failure the original SERP-snippet fields are kept
// and the same post object is still returned.
//
// @param {object} post - post produced by fetchBatch; mutated in place.
// @param {string} apiKey - Bright Data API key.
// @param {string} zone - Bright Data zone name.
// @returns {Promise<object>} the same post object (enriched or unchanged).
async function enrichPost(post, apiKey, zone) {
  try {
    const response = await axios.post(
      BRIGHTDATA_API_URL,
      { zone, url: post.url, format: 'raw', data_format: 'markdown' },
      {
        headers: { Authorization: `Bearer ${apiKey}`, 'Content-Type': 'application/json' },
        timeout: 40000,
      }
    );

    // The API usually returns a markdown string; stringify defensively if not.
    const markdown = typeof response.data === 'string' ? response.data : JSON.stringify(response.data);
    const fullContent = extractPostContent(markdown);
    const engagement = extractEngagement(markdown);
    const pubDate = extractDateFromPage(markdown) || post.pubDate;

    // Only replace the snippet when real post text was actually found.
    if (fullContent) {
      post.content = fullContent;
      post.summary = fullContent.substring(0, 200);
    }
    post.engagement = engagement;
    post.pubDate = pubDate;
  } catch (err) {
    // Best-effort — keep SERP snippet if scraping fails
    logger.debug(`Enrich failed for ${post.sourceName}: ${err.message}`);
  }
  return post;
}
|
|
228
|
+
|
|
229
|
+
// Extract full post content from LinkedIn page markdown.
|
|
230
|
+
// LinkedIn page structure (logged-out view):
|
|
231
|
+
// [Author] [Job] [N followers]
|
|
232
|
+
// [Nd/w/h ago] •
|
|
233
|
+
// [Full post text here]
|
|
234
|
+
// Like Comment Repost Send
|
|
235
|
+
// [N reactions] • [N comments]
|
|
236
|
+
// Extract the body text of a LinkedIn post from its logged-out page markdown.
// Page layout: author header → relative-age line ("3d •" / "2 days ago") →
// post text → Like/Comment buttons, reaction counts, sign-in prompts.
// We take everything between the age line and the first engagement/login
// marker, strip markdown chrome, and require > 30 chars of real text.
// Returns null when no age marker is found or the extracted text is too short.
function extractPostContent(markdown) {
  const lines = markdown.split('\n');

  const isAgeLine = (s) =>
    /^\d+[dwm]\s*[•·]/.test(s) || /\d+\s+(hour|day|week|month)s?\s+ago/i.test(s);

  const ageIdx = lines.findIndex((raw) => isAgeLine(raw.trim()));
  if (ageIdx === -1) return null;
  const start = ageIdx + 1;

  // Lines that signal the end of the post body (UI chrome, counts, login nags).
  const isTerminator = (s) =>
    s === 'like comment repost send' ||
    (s.startsWith('like') && s.includes('comment')) ||
    /^\d[\d,]*\s*reaction/.test(s) ||
    s === 'reactions' ||
    s.includes('sign in') ||
    s.includes('join now');

  let end = lines.length;
  for (let i = start; i < lines.length; i++) {
    if (isTerminator(lines[i].trim().toLowerCase())) {
      end = i;
      break;
    }
  }

  const body = lines
    .slice(start, end)
    .join('\n')
    .replace(/\[([^\]]+)\]\([^)]+\)/g, '$1') // collapse markdown links to text
    .replace(/^#+\s/gm, '')                  // strip heading markers
    .replace(/\n{3,}/g, '\n\n')              // squeeze runs of blank lines
    .trim();

  return body.length > 30 ? body : null;
}
|
|
278
|
+
|
|
279
|
+
// Extract like/comment/repost counts from LinkedIn page markdown
|
|
280
|
+
// Pull reaction / comment / repost counts out of LinkedIn page markdown.
// Counts may carry thousands separators ("1,234 reactions"); missing counts
// default to 0.
function extractEngagement(markdown) {
  const count = (pattern) => {
    const hit = markdown.match(pattern);
    if (!hit) return 0;
    return Number.parseInt(hit[1].replace(/,/g, ''), 10);
  };

  return {
    upvotes: count(/(\d[\d,]*)\s*reaction/i),
    comments: count(/(\d[\d,]*)\s*comment/i),
    reposts: count(/(\d[\d,]*)\s*repost/i),
  };
}
|
|
291
|
+
|
|
292
|
+
// Extract post date from LinkedIn page markdown (more accurate than SERP snippet)
|
|
293
|
+
// Derive an ISO timestamp from a relative age found on the LinkedIn page
// itself — either long form ("3 days ago") or LinkedIn's compact form
// ("3d •"). More accurate than the SERP snippet. Returns null when no
// relative age is present or the unit is unrecognized.
function extractDateFromPage(markdown) {
  const hit =
    markdown.match(/(\d+)\s*(hour|day|week|month)s?\s*ago/i) ||
    markdown.match(/(\d+)(h|d|w|mo)\s*[•·]/);
  if (!hit) return null;

  const amount = Number.parseInt(hit[1], 10);
  // Normalize compact units (h/d/w/mo) onto the long-form names.
  const longUnit = { h: 'hour', d: 'day', w: 'week', mo: 'month' }[hit[2].toLowerCase()] || hit[2].toLowerCase();
  const unitMs = { hour: 3600000, day: 86400000, week: 604800000, month: 2592000000 }[longUnit] || 0;

  if (!unitMs) return null;
  return new Date(Date.now() - amount * unitMs).toISOString();
}
|
|
303
|
+
|
|
304
|
+
// Match a search result to a KOL in the batch.
|
|
305
|
+
// We use two signals, in order of reliability:
|
|
306
|
+
// 1. URL username contains BOTH first+last name parts of the KOL → post is by them
|
|
307
|
+
// 2. Title is "[KOL Name]'s Post" format → LinkedIn's own-post title format
|
|
308
|
+
// Anything else (posts merely mentioning the KOL) is rejected.
|
|
309
|
+
// Decide which KOL (if any) authored a given search result.
// Only two signals count as reliable authorship, checked in order:
//   1. the post URL's username contains BOTH the KOL's first and last name;
//   2. the result title follows LinkedIn's "<Name>'s Post" own-post format.
// Results that merely *mention* a KOL satisfy neither and are rejected (null).
function matchKol(item, batch) {
  const username = extractUrlUsername(item.link || '');
  const titleHit = (item.title || '').match(/^(.+?)'s Post/i);
  const titleAuthor = titleHit?.[1]?.toLowerCase().trim() || '';

  for (const kol of batch) {
    const nameParts = sanitizeName(kol.name).split(' ').filter(Boolean);
    if (nameParts.length < 2) continue; // need first + last to match safely

    const first = nameParts[0];
    const last = nameParts[nameParts.length - 1];

    // Signal 1: URL username contains both first AND last name.
    if (username && username.includes(first) && username.includes(last)) {
      return kol;
    }

    // Signal 2: "Name's Post" title author matches this KOL.
    if (titleAuthor && titleAuthor.includes(first) && titleAuthor.includes(last)) {
      return kol;
    }
  }

  return null;
}
|
|
328
|
+
|
|
329
|
+
// Extract the author username from a LinkedIn post URL
|
|
330
|
+
// linkedin.com/posts/USERNAME_post-title-activity-ID
|
|
331
|
+
// Pull the author username out of a LinkedIn post URL, lowercased.
// URL shape: linkedin.com/posts/USERNAME_post-slug-activity-ID
// Returns '' when the URL is not a recognizable post link.
function extractUrlUsername(url) {
  const hit = /linkedin\.com\/posts\/([^_/]+)/.exec(url);
  if (!hit) return '';
  return hit[1].toLowerCase();
}
|
|
335
|
+
|
|
336
|
+
// Normalize a KOL name for fuzzy matching: lowercase, keep only letters,
// digits and spaces, trim the ends.
function sanitizeName(name) {
  const lowered = name.toLowerCase();
  const stripped = lowered.replace(/[^a-z0-9 ]/g, '');
  return stripped.trim();
}
|
|
339
|
+
|
|
340
|
+
// Returns false for snippets that are just LinkedIn profile bios / boilerplate
|
|
341
|
+
// Heuristic filter for SERP snippets: reject anything too short (< 40 chars)
// or containing LinkedIn UI boilerplate / profile-bio fragments, which means
// the snippet carries no actual post text.
function isUsefulContent(content) {
  if (!content || content.length < 40) return false;

  const lower = content.toLowerCase();
  const boilerplateMarkers = [
    'view profile for',
    'report this comment',
    'close menu',
    'like · reply',
    '1 reaction',
  ];

  for (const marker of boilerplateMarkers) {
    if (lower.includes(marker)) return false;
  }
  return true;
}
|
|
354
|
+
|
|
355
|
+
// Remove LinkedIn UI chrome from content
|
|
356
|
+
// Strip LinkedIn UI chrome from snippet text: trailing "...Read more" tails,
// "View profile for X." fragments, then collapse all whitespace runs.
function cleanContent(text) {
  let out = text.replace(/\.\.\.Read more$/i, '');
  out = out.replace(/View profile for [^.]+\./gi, '');
  out = out.replace(/\s+/g, ' ');
  return out.trim();
}
|
|
363
|
+
|
|
364
|
+
// Remove "Name's Post" boilerplate from title
|
|
365
|
+
// Remove LinkedIn's "<Name>'s Post" prefix boilerplate from a SERP title,
// falling back to the original title when stripping would leave it empty.
//
// Fix: the author name is interpolated into a RegExp, so regex metacharacters
// in real names — "John (Jack) Smith", "A.B. Corp" — previously threw a
// SyntaxError (unbalanced parens) or matched the wrong span. Escape them first.
//
// @param {string} title - raw SERP result title.
// @param {string} authorName - KOL display name, treated as a literal string.
// @returns {string} cleaned title, or the original when nothing remains.
function cleanTitle(title, authorName) {
  // Escape regex metacharacters so the name is matched literally.
  const escapedName = authorName.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const cleaned = title
    .replace(new RegExp(`^${escapedName}'s Post\\s*[-–]?\\s*`, 'i'), '')
    .trim();
  return cleaned || title;
}
|
|
371
|
+
|
|
372
|
+
// Parse relative date strings from Google snippets into ISO dates
|
|
373
|
+
// Parse a relative "N unit ago" phrase out of a Google SERP result
// (description and/or date fields) into an ISO timestamp.
// Returns null when no such phrase is present.
function extractDate(item) {
  const haystack = `${item.description || ''} ${item.date || ''}`;
  const hit = haystack.match(/(\d+)\s*(hour|day|week|month)s?\s*ago/i);
  if (!hit) return null;

  const amount = Number.parseInt(hit[1], 10);
  const unitMs = {
    hour: 3600000,
    day: 86400000,
    week: 604800000,
    month: 2592000000,
  }[hit[2].toLowerCase()] || 0;

  return new Date(Date.now() - amount * unitMs).toISOString();
}
|
|
382
|
+
|
|
383
|
+
// Split an array into consecutive sub-arrays of at most `size` elements.
function chunk(arr, size) {
  const pieces = Math.ceil(arr.length / size);
  return Array.from({ length: pieces }, (_, idx) =>
    arr.slice(idx * size, idx * size + size),
  );
}
|
|
388
|
+
|
|
389
|
+
// Load the persisted per-KOL state ({ [name]: { lastChecked, seenPostIds } })
// from STATE_FILE. Any read or parse failure — or a missing file — yields an
// empty object so a corrupt state file can never block a run.
function loadState() {
  try {
    if (fs.existsSync(STATE_FILE)) return JSON.parse(fs.readFileSync(STATE_FILE, 'utf-8'));
  } catch { /* start fresh */ }
  return {};
}
|
|
395
|
+
|
|
396
|
+
// Persist per-KOL state to STATE_FILE as pretty-printed JSON, creating the
// data directory on demand so a fresh checkout works without setup.
function saveState(state) {
  const dir = path.dirname(STATE_FILE);
  if (!fs.existsSync(dir)) fs.mkdirSync(dir, { recursive: true });
  fs.writeFileSync(STATE_FILE, JSON.stringify(state, null, 2));
}
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
import { chromium } from 'playwright';
|
|
2
|
+
import fs from 'fs';
|
|
3
|
+
import path from 'path';
|
|
4
|
+
import crypto from 'crypto';
|
|
5
|
+
import { fileURLToPath } from 'url';
|
|
6
|
+
import createLogger from '../utils/logger.js';
|
|
7
|
+
|
|
8
|
+
const logger = createLogger('LinkedInBrowser');
|
|
9
|
+
const __dirname = path.dirname(fileURLToPath(import.meta.url));
|
|
10
|
+
const DEFAULT_PROFILE_DIR = path.join(__dirname, '../../data/playwright-profile');
|
|
11
|
+
|
|
12
|
+
const sleep = ms => new Promise(r => setTimeout(r, ms));
|
|
13
|
+
|
|
14
|
+
// Sanity-check a persistent Chrome profile directory before launching
// Playwright with it. Returns a human-readable problem string, or null when
// the profile looks usable (dir exists and contains a "Default" subdir).
function validateProfile(profileDir) {
  if (!fs.existsSync(profileDir)) {
    return 'browser profile not found';
  }
  if (!fs.existsSync(path.join(profileDir, 'Default'))) {
    return 'browser profile is incomplete';
  }
  return null;
}
|
|
19
|
+
|
|
20
|
+
// Scrape recent posts from a configured list of LinkedIn accounts using a
// logged-in persistent Chrome profile (shared with the Twitter fetcher).
// Accounts are visited in random order with jittered delays to look less
// bot-like. The browser context is always closed, even on failure.
//
// @param {object} config - reads config.linkedin_browser: enabled, accounts
//   (profile slugs), profileDir, maxPostsPerAccount, maxAgeHours,
//   delayBetweenAccountsMs.
// @returns {Promise<Array>} normalized post objects; [] when disabled or the
//   browser profile is missing/incomplete.
export default async function linkedinBrowserFetch(config) {
  const cfg = config.linkedin_browser;
  if (!cfg?.enabled) return [];

  const profileDir = cfg.profileDir || DEFAULT_PROFILE_DIR;
  const profileError = validateProfile(profileDir);
  if (profileError) {
    logger.warn(`LinkedIn Browser skipped: ${profileError}`);
    logger.warn('Run: npm run setup:twitter (same profile as Twitter)');
    return [];
  }

  // Shuffle account order each run so no account is always hit first.
  const accounts = [...(cfg.accounts || [])].sort(() => Math.random() - 0.5);
  const maxPerAccount = cfg.maxPostsPerAccount || 5;
  const maxAgeHours = cfg.maxAgeHours || 48;
  const cutoff = new Date(Date.now() - maxAgeHours * 3600000);
  const delay = cfg.delayBetweenAccountsMs || 10000;

  let context;
  try {
    // Persistent context = reuses the saved login session; headful Chrome with
    // automation flags stripped to reduce bot detection.
    context = await chromium.launchPersistentContext(profileDir, {
      headless: false,
      channel: 'chrome',
      ignoreDefaultArgs: ['--enable-automation'],
      args: ['--disable-blink-features=AutomationControlled'],
      viewport: { width: 1280, height: 900 },
    });

    const page = context.pages()[0] ?? await context.newPage();
    await sleep(3000);

    const allItems = [];

    for (let i = 0; i < accounts.length; i++) {
      const account = accounts[i];
      try {
        logger.info(`Scraping linkedin.com/in/${account}...`);
        const posts = await scrapeAccount(page, account, maxPerAccount, cutoff);
        allItems.push(...posts);
        logger.debug(` → ${posts.length} posts from ${account}`);
      } catch (err) {
        // One failing account must not abort the rest.
        logger.error(`Failed ${account}: ${err.message}`);
      }

      // Jittered pause between accounts (skipped after the last one).
      if (i < accounts.length - 1) {
        const wait = delay + Math.random() * 5000;
        logger.debug(` Waiting ${Math.round(wait / 1000)}s...`);
        await sleep(wait);
      }
    }

    logger.success(`Fetched ${allItems.length} posts from ${accounts.length} LinkedIn accounts`);
    return allItems;
  } finally {
    // Always release the browser, even if launch or scraping threw.
    if (context) await context.close();
  }
}
|
|
77
|
+
|
|
78
|
+
// Scrape up to `limit` recent posts from one LinkedIn profile's activity feed
// and normalize them. Posts older than `cutoff` (when a relative age is
// parseable) are dropped; posts with no parseable age are kept.
//
// @param {import('playwright').Page} page - shared logged-in page.
// @param {string} slug - profile slug (linkedin.com/in/<slug>).
// @param {number} limit - max posts to read from the DOM.
// @param {Date} cutoff - oldest acceptable post date.
// @returns {Promise<Array>} normalized post objects.
// @throws when navigation or the feed selector times out (caller logs it).
async function scrapeAccount(page, slug, limit, cutoff) {
  const url = `https://www.linkedin.com/in/${slug}/recent-activity/all/`;
  await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 });

  // Wait for posts feed
  await page.waitForSelector('div[data-urn]', { timeout: 15000 });
  await sleep(2000);

  // Scroll to load more posts
  for (let i = 0; i < 3; i++) {
    await page.mouse.wheel(0, 500 + Math.random() * 300);
    await sleep(700 + Math.random() * 400);
  }

  // Runs in the browser context — only DOM APIs and the passed `limit` are
  // available here; selectors track LinkedIn's current feed markup and may
  // need updating when the site changes.
  const rawPosts = await page.evaluate((limit) => {
    const containers = [...document.querySelectorAll('div[data-urn^="urn:li:activity"]')]
      .slice(0, limit);

    return containers.map(el => {
      const urn = el.getAttribute('data-urn');

      // Post text
      const textEl = el.querySelector('.update-components-text, [class*="commentary"]');
      const text = textEl?.innerText?.trim() || '';

      // Construct post URL directly from urn
      const link = urn ? `https://www.linkedin.com/feed/update/${urn}/` : '';

      // Time ago — take first segment before " •" or newline
      const timeEl = el.querySelector('.update-components-actor__sub-description');
      const timeAgo = timeEl?.innerText?.trim().split(/\s*[•\n]/)[0].trim() || '';

      // Reactions count
      const reactionsEl = el.querySelector('.social-details-social-counts__reactions-count');
      const reactions = parseInt(reactionsEl?.innerText?.replace(/[^0-9]/g, '') || '0', 10);

      // Comments count — parse from social counts block
      const countsEl = el.querySelector('[class*="social-counts"]');
      const commentsMatch = countsEl?.innerText?.match(/(\d+)\s+comment/);
      const comments = commentsMatch ? parseInt(commentsMatch[1]) : 0;

      return { text, link, timeAgo, reactions, comments };
    });
  }, limit);

  return rawPosts
    // Drop items missing a link or any text (can't identify or rank them).
    .filter(p => p.link && p.text)
    // Age filter: unknown age passes; parseable ages must be newer than cutoff.
    .filter(p => {
      if (!p.timeAgo) return true;
      const pubDate = parseTimeAgo(p.timeAgo);
      return !pubDate || pubDate >= cutoff;
    })
    .map(p => {
      const pubDate = parseTimeAgo(p.timeAgo) || new Date();
      return {
        // Post identity = md5 of the feed-update URL (stable across runs).
        id: crypto.createHash('md5').update(p.link).digest('hex'),
        source: 'linkedin_browser',
        sourceName: slug,
        category: 'linkedin',
        title: p.text.substring(0, 100) + (p.text.length > 100 ? '…' : ''),
        link: p.link,
        url: p.link,
        content: p.text,
        summary: p.text.substring(0, 200),
        author: slug,
        pubDate: pubDate.toISOString(),
        scraped_at: new Date().toISOString(),
        age_hours: Math.floor((Date.now() - pubDate.getTime()) / 3600000),
        tags: [],
        engagement: {
          upvotes: p.reactions,
          comments: p.comments,
        },
        metadata: {
          score: p.reactions,
          timeAgo: p.timeAgo,
        },
      };
    });
}
|
|
158
|
+
|
|
159
|
+
// Convert a LinkedIn relative timestamp ("3h", "2d", "1mo", "45m", "2w ago")
// into an absolute Date. Returns null for empty or unrecognized input.
//
// Fix: the unit alternation previously listed `m` before `mo`, so "3mo"
// matched the minutes unit and month-old posts were dated minutes ago —
// which also defeated the caller's cutoff filter. `mo` must be tried first.
//
// @param {string} str - relative age text scraped from the feed.
// @returns {Date|null} approximate publication time.
function parseTimeAgo(str) {
  if (!str) return null;

  // `mo` before `m`: regex alternation is ordered, and "3mo" must not
  // short-circuit on the minutes unit.
  const hit = str.match(/(\d+)\s*(mo|s|m|h|d|w)/i);
  if (!hit) return null;

  const amount = Number.parseInt(hit[1], 10);
  const unit = hit[2].toLowerCase();
  const unitMs = {
    s: 1000,
    m: 60000,
    h: 3600000,
    d: 86400000,
    w: 604800000,
    mo: 2592000000, // ~30 days
  }[unit] || 0;

  return new Date(Date.now() - amount * unitMs);
}
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import axios from 'axios';
|
|
2
|
+
import createLogger from '../utils/logger.js';
|
|
3
|
+
|
|
4
|
+
const logger = createLogger('RedditFetcher');
|
|
5
|
+
|
|
6
|
+
// Fetch hot posts from configured subreddits via Reddit's public JSON API
// (no auth, custom User-Agent) and normalize them. Filters out posts that are
// too old, below the score threshold, or NSFW. Per-subreddit failures are
// logged and skipped so one bad subreddit never aborts the run.
//
// @param {object} config - reads config.trendingSources.reddit:
//   enabled, subreddits (names without "r/"), minScore (default 100),
//   maxAge in hours (default 24).
// @returns {Promise<Array>} normalized post objects.
export default async function redditFetch(config) {
  const redditConfig = config.trendingSources?.reddit;
  if (!redditConfig?.enabled) {
    return [];
  }

  const subreddits = redditConfig.subreddits || [];
  const minScore = redditConfig.minScore || 100;
  // maxAge may arrive as a string from JSON config — coerce to hours.
  const maxAge = parseInt(redditConfig.maxAge) || 24;
  const cutoff = new Date(Date.now() - maxAge * 60 * 60 * 1000);

  logger.info(`Fetching top posts from ${subreddits.length} subreddits...`);

  const allItems = [];

  for (const subreddit of subreddits) {
    try {
      const response = await axios.get(
        `https://www.reddit.com/r/${subreddit}/hot.json?limit=50`,
        { headers: { 'User-Agent': 'AI-Keytake-Scraper/1.0' } }
      );

      const posts = response.data.data.children;

      for (const post of posts) {
        const data = post.data;
        // Reddit timestamps are epoch seconds.
        const created = new Date(data.created_utc * 1000);

        // Skip old posts
        if (created < cutoff) continue;

        // Skip low-score posts
        if (data.score < minScore) continue;

        // Skip NSFW
        if (data.over_18) continue;

        allItems.push({
          id: `reddit_${data.id}`,
          source: 'reddit',
          sourceName: `r/${subreddit}`,
          title: data.title,
          content: data.selftext || '',
          summary: (data.selftext || '').substring(0, 200),
          // Permalink to the Reddit thread; external_url is the linked page
          // (identical to the permalink for self posts).
          url: `https://reddit.com${data.permalink}`,
          external_url: data.url,
          author: data.author,
          posted_at: new Date(data.created_utc * 1000).toISOString(),
          scraped_at: new Date().toISOString(),
          age_hours: Math.floor((Date.now() - created.getTime()) / (1000 * 60 * 60)),
          engagement: {
            upvotes: data.score,
            comments: data.num_comments,
            ratio: data.upvote_ratio
          },
          metadata: {
            score: data.score,
            is_self: data.is_self,
            is_video: data.is_video
          }
        });
      }

      logger.debug(`Fetched from r/${subreddit}`);
    } catch (error) {
      // One failing subreddit (rate limit, ban, network) shouldn't stop the rest.
      logger.error(`Error fetching r/${subreddit}: ${error.message}`);
    }
  }

  logger.success(`Fetched ${allItems.length} total posts from Reddit`);
  return allItems;
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import Parser from 'rss-parser';
|
|
2
|
+
import crypto from 'crypto';
|
|
3
|
+
import createLogger from '../utils/logger.js';
|
|
4
|
+
|
|
5
|
+
const logger = createLogger('RSSFetcher');
|
|
6
|
+
const parser = new Parser();
|
|
7
|
+
|
|
8
|
+
// Fetch items from every enabled RSS feed and normalize them into the shared
// item shape used across fetchers. Per-feed failures are logged and skipped.
//
// Fixes over the original:
//  - tolerates a missing config.rssFeeds (previously threw on .filter);
//  - skips items without a link (md5 of undefined threw and killed the
//    whole feed inside the catch);
//  - skips items whose pubDate cannot be parsed instead of emitting
//    NaN age_hours (Invalid Date also bypassed the age cutoff).
//
// @param {object} config - expects config.rssFeeds:
//   [{ url, name, category, enabled }], and optional
//   config.filtering.maxAgeHours (default 48).
// @returns {Promise<Array>} normalized feed items.
export default async function rssFetch(config) {
  const feeds = (config.rssFeeds || []).filter(f => f.enabled);
  logger.info(`Fetching from ${feeds.length} RSS feeds`);

  const allItems = [];
  const maxAge = config.filtering?.maxAgeHours || 48;
  const cutoff = new Date(Date.now() - maxAge * 60 * 60 * 1000);

  for (const feed of feeds) {
    try {
      const parsed = await parser.parseURL(feed.url);

      for (const item of parsed.items) {
        // No link → no stable id (id is md5 of the link); skip the item.
        if (!item.link) continue;

        const pubDate = new Date(item.pubDate);

        // Skip items with unparseable dates or older than the cutoff.
        if (Number.isNaN(pubDate.getTime()) || pubDate < cutoff) continue;

        allItems.push({
          id: crypto.createHash('md5').update(item.link).digest('hex'),
          source: 'rss',
          sourceName: feed.name,
          category: feed.category,
          title: item.title,
          link: item.link,
          content: item.contentSnippet || item.content || '',
          summary: (item.contentSnippet || item.content || '').substring(0, 200),
          pubDate: item.pubDate,
          author: item.creator || item.author || feed.name,
          scraped_at: new Date().toISOString(),
          age_hours: Math.floor((Date.now() - pubDate.getTime()) / (1000 * 60 * 60))
        });
      }

      logger.debug(`Fetched items from ${feed.name}`);
    } catch (error) {
      // One broken feed (DNS, malformed XML) must not abort the others.
      logger.error(`Failed to fetch ${feed.name}: ${error.message}`);
    }
  }

  logger.success(`Fetched ${allItems.length} total items`);
  return allItems;
}
|