heyhank 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +83 -10
- package/bin/cli.ts +7 -7
- package/bin/ctl.ts +42 -42
- package/dist/assets/{AgentsPage-BPhirnCe.js → AgentsPage-B-AAmsMK.js} +3 -3
- package/dist/assets/AssistantPage-BV1Mfwdt.js +2 -0
- package/dist/assets/BusinessPage-tLpNEz19.js +1 -0
- package/dist/assets/{CronManager-DDbz-yiT.js → CronManager-B-K_n3Jg.js} +1 -1
- package/dist/assets/HelpPage-Bhf_j6Xr.js +1 -0
- package/dist/assets/{IntegrationsPage-CrOitCmJ.js → IntegrationsPage-DAMjs9tM.js} +1 -1
- package/dist/assets/JarvisHUD-C_TGXCCn.js +120 -0
- package/dist/assets/MediaPage-C48HTTrt.js +1 -0
- package/dist/assets/MemoryPage-JkC-qtgp.js +1 -0
- package/dist/assets/{PlatformDashboard-Do6F0O2p.js → PlatformDashboard-AUo7tNnE.js} +1 -1
- package/dist/assets/{Playground-Fc5cdc5p.js → Playground-AzNMsRBL.js} +1 -1
- package/dist/assets/{ProcessPanel-CslEiZkI.js → ProcessPanel-DpE_2sX3.js} +1 -1
- package/dist/assets/{PromptsPage-D2EhsdNO.js → PromptsPage-C2RQOs6p.js} +2 -2
- package/dist/assets/RunsPage-B9UOyO79.js +1 -0
- package/dist/assets/{SandboxManager-a1AVI5q2.js → SandboxManager-jHvYjwfh.js} +1 -1
- package/dist/assets/SettingsPage-BBJax6gt.js +51 -0
- package/dist/assets/SkillsMarketplace-IjmjfdjD.js +1 -0
- package/dist/assets/SocialMediaPage-DoPZHhr2.js +10 -0
- package/dist/assets/{TailscalePage-CHiFhZXF.js → TailscalePage-DDEY7ckO.js} +1 -1
- package/dist/assets/TelephonyPage-OPNBZYKt.js +9 -0
- package/dist/assets/{TerminalPage-Drwyrnfd.js → TerminalPage-BjMbHHW3.js} +1 -1
- package/dist/assets/{gemini-live-client-C7rqAW7G.js → gemini-live-client-C70FEtX2.js} +11 -8
- package/dist/assets/{index-CEqZnThB.js → index-BgYM4wXw.js} +94 -93
- package/dist/assets/index-BkjSoVgn.css +32 -0
- package/dist/assets/sw-register-C7NOHtIu.js +1 -0
- package/dist/assets/text-chat-client-BSbLJerZ.js +2 -0
- package/dist/index.html +2 -2
- package/dist/sw.js +1 -1
- package/package.json +6 -1
- package/server/agent-executor.ts +37 -2
- package/server/agent-store.ts +3 -3
- package/server/agent-types.ts +11 -0
- package/server/assistant-store.ts +232 -6
- package/server/auth-manager.ts +9 -0
- package/server/cache-headers.ts +1 -1
- package/server/calendar-service.ts +10 -0
- package/server/ceo/document-store.ts +129 -0
- package/server/ceo/finance-store.ts +343 -0
- package/server/ceo/kpi-store.ts +208 -0
- package/server/ceo/memory-import.ts +277 -0
- package/server/ceo/news-store.ts +208 -0
- package/server/ceo/template-store.ts +134 -0
- package/server/ceo/time-tracking-store.ts +227 -0
- package/server/claude-auth-monitor.ts +128 -0
- package/server/claude-code-worker.ts +86 -0
- package/server/claude-session-discovery.ts +74 -1
- package/server/cli-launcher.ts +32 -10
- package/server/codex-adapter.ts +2 -2
- package/server/codex-ws-proxy.cjs +1 -1
- package/server/container-manager.ts +4 -4
- package/server/content-intelligence/content-engine.ts +1112 -0
- package/server/content-intelligence/platform-knowledge.ts +870 -0
- package/server/cron-store.ts +3 -3
- package/server/embedding-service.ts +49 -0
- package/server/event-bus-types.ts +13 -0
- package/server/federation/node-store.ts +5 -4
- package/server/fs-utils.ts +28 -1
- package/server/hank-notifications-store.ts +91 -0
- package/server/hank-tool-executor.ts +1835 -0
- package/server/hank-tools.ts +2107 -0
- package/server/image-pull-manager.ts +2 -2
- package/server/index.ts +25 -2
- package/server/llm-providers-streaming.ts +541 -0
- package/server/llm-providers.ts +12 -0
- package/server/marketplace.ts +249 -0
- package/server/mcp-registry.ts +158 -0
- package/server/memory-service.ts +296 -0
- package/server/obsidian-sync.ts +184 -0
- package/server/provider-manager.ts +5 -2
- package/server/provider-registry.ts +12 -0
- package/server/reminder-scheduler.ts +37 -1
- package/server/routes/agent-routes.ts +2 -1
- package/server/routes/assistant-routes.ts +198 -5
- package/server/routes/ceo-finance-kpi-routes.ts +167 -0
- package/server/routes/ceo-news-time-routes.ts +137 -0
- package/server/routes/ceo-routes.ts +99 -0
- package/server/routes/content-routes.ts +116 -0
- package/server/routes/email-routes.ts +147 -0
- package/server/routes/env-routes.ts +3 -3
- package/server/routes/fs-routes.ts +12 -9
- package/server/routes/hank-chat-routes.ts +592 -0
- package/server/routes/llm-routes.ts +12 -0
- package/server/routes/marketplace-routes.ts +63 -0
- package/server/routes/media-routes.ts +1 -1
- package/server/routes/memory-routes.ts +127 -0
- package/server/routes/platform-routes.ts +14 -675
- package/server/routes/sandbox-routes.ts +1 -1
- package/server/routes/settings-routes.ts +51 -1
- package/server/routes/socialmedia-routes.ts +152 -2
- package/server/routes/system-routes.ts +2 -2
- package/server/routes/team-routes.ts +71 -0
- package/server/routes/telephony-routes.ts +98 -18
- package/server/routes.ts +36 -9
- package/server/session-creation-service.ts +2 -2
- package/server/session-orchestrator.ts +54 -2
- package/server/session-types.ts +2 -0
- package/server/settings-manager.ts +50 -2
- package/server/skill-discovery.ts +68 -0
- package/server/socialmedia/adapters/browser-adapter.ts +179 -0
- package/server/socialmedia/adapters/postiz-adapter.ts +291 -14
- package/server/socialmedia/manager.ts +234 -15
- package/server/socialmedia/store.ts +51 -1
- package/server/socialmedia/types.ts +35 -2
- package/server/socialview/browser-manager.ts +150 -0
- package/server/socialview/extractors.ts +1298 -0
- package/server/socialview/image-describe.ts +188 -0
- package/server/socialview/library.ts +119 -0
- package/server/socialview/poster.ts +276 -0
- package/server/socialview/routes.ts +371 -0
- package/server/socialview/style-analyzer.ts +187 -0
- package/server/socialview/style-profiles.ts +67 -0
- package/server/socialview/types.ts +166 -0
- package/server/socialview/vision.ts +127 -0
- package/server/socialview/vnc-manager.ts +110 -0
- package/server/style-injector.ts +135 -0
- package/server/team-service.ts +239 -0
- package/server/team-store.ts +75 -0
- package/server/team-types.ts +52 -0
- package/server/telephony/audio-bridge.ts +281 -35
- package/server/telephony/audio-recorder.ts +132 -0
- package/server/telephony/call-manager.ts +803 -104
- package/server/telephony/call-types.ts +67 -1
- package/server/telephony/esl-client.ts +319 -0
- package/server/telephony/freeswitch-sync.ts +155 -0
- package/server/telephony/phone-utils.ts +63 -0
- package/server/telephony/telephony-store.ts +9 -8
- package/server/url-validator.ts +82 -0
- package/server/vault-markdown.ts +317 -0
- package/server/vault-migration.ts +121 -0
- package/server/vault-store.ts +466 -0
- package/server/vault-watcher.ts +59 -0
- package/server/vector-store.ts +210 -0
- package/server/voice-pipeline/gemini-live-adapter.ts +97 -0
- package/server/voice-pipeline/greeting-cache.ts +200 -0
- package/server/voice-pipeline/manager.ts +249 -0
- package/server/voice-pipeline/pipeline.ts +335 -0
- package/server/voice-pipeline/providers/index.ts +47 -0
- package/server/voice-pipeline/providers/llm-internal.ts +527 -0
- package/server/voice-pipeline/providers/stt-google.ts +157 -0
- package/server/voice-pipeline/providers/tts-google.ts +126 -0
- package/server/voice-pipeline/types.ts +247 -0
- package/server/ws-bridge-types.ts +6 -1
- package/dist/assets/AssistantPage-DJ-cMQfb.js +0 -1
- package/dist/assets/HelpPage-DMfkzERp.js +0 -1
- package/dist/assets/MediaPage-CE5rdvkC.js +0 -1
- package/dist/assets/RunsPage-C5BZF5Rx.js +0 -1
- package/dist/assets/SettingsPage-DirhjQrJ.js +0 -51
- package/dist/assets/SocialMediaPage-DBuM28vD.js +0 -1
- package/dist/assets/TelephonyPage-x0VV0fOo.js +0 -1
- package/dist/assets/index-C8M_PUmX.css +0 -32
- package/dist/assets/sw-register-LSSpj6RU.js +0 -1
- package/server/socialmedia/adapters/ayrshare-adapter.ts +0 -169
|
@@ -0,0 +1,1298 @@
|
|
|
1
|
+
// ─── Post Extractors ─────────────────────────────────────────────────────────
|
|
2
|
+
// Per-platform DOM extraction via Playwright. Phase 2 implements Instagram
|
|
3
|
+
// and Facebook; other platforms return a "not implemented" error until the
|
|
4
|
+
// end-to-end flow is validated.
|
|
5
|
+
|
|
6
|
+
import type { Page } from "playwright";
|
|
7
|
+
import { randomUUID } from "node:crypto";
|
|
8
|
+
import { writeFileSync } from "node:fs";
|
|
9
|
+
import { join } from "node:path";
|
|
10
|
+
import { MEDIA_ROOT, ensureDirs } from "./library.js";
|
|
11
|
+
import { describeImageByUrl } from "./vision.js";
|
|
12
|
+
import type { LibraryPost, SocialPlatform } from "./types.js";
|
|
13
|
+
|
|
14
|
+
/** Inputs shared by every extractor in this module. */
export interface ExtractOptions {
  /** Platform whose extractor should handle the page. */
  platform: SocialPlatform;
  /** Playwright page the post data is read from. */
  page: Page;
  /** Origin label copied onto each extracted post's `source` field. */
  source: "own" | "role-model";
  /** Optional progress callback; receives human-readable status lines. */
  onLog?: (msg: string) => void;
}
|
|
21
|
+
|
|
22
|
+
/**
 * Outcome of an extraction run. Both lists may be populated at the same time:
 * multi-post extractors keep the posts that succeeded and report the rest.
 */
export interface ExtractResult {
  /** Posts that were extracted successfully. */
  posts: LibraryPost[];
  /** Human-readable messages for everything that could not be extracted. */
  errors: string[];
}
|
|
26
|
+
|
|
27
|
+
/**
 * Look at the URL of the currently open page and run the matching extractor
 * for `opts.platform` (Instagram or Facebook; single post or profile/feed).
 *
 * @param opts - Page, platform and source label; `onLog` receives progress lines.
 * @returns The extractor's result. URLs that match no known page type, and
 *   platforms without an extractor, yield an empty `posts` list and a single
 *   explanatory entry in `errors`.
 */
export async function extractCurrentPage(opts: ExtractOptions): Promise<ExtractResult> {
  // Upper bound on posts pulled from a profile / feed page. Defined once so
  // the progress log always reports the limit that is actually applied.
  const maxProfilePosts = 25;

  const url = opts.page.url();
  opts.onLog?.(`URL: ${url}`);

  if (opts.platform === "instagram") {
    if (/\/p\/[^/]+\/?/.test(url) || /\/reel\/[^/]+\/?/.test(url)) {
      opts.onLog?.("Detected: Instagram single post");
      return await extractInstagramSinglePost(opts);
    }
    // Profile pages look like https://www.instagram.com/<handle>/
    if (/instagram\.com\/[^/]+\/?$/.test(url) && !/\/(accounts|explore|direct)/.test(url)) {
      opts.onLog?.(`Detected: Instagram profile (up to ${maxProfilePosts} posts)`);
      return await extractInstagramProfile(opts, maxProfilePosts);
    }
    return { posts: [], errors: [`Instagram URL not recognized for extraction: ${url}`] };
  }

  if (opts.platform === "facebook") {
    // Single-post permalinks: /<handle>/posts/<id>, /permalink.php, /share/p/<id>,
    // /<handle>/videos/<id>, /reel/<id>, /story.php, /watch/?v=<id>
    if (
      /\/posts\//.test(url) ||
      /\/permalink\.php/.test(url) ||
      /\/share\/p\//.test(url) ||
      /\/share\/v\//.test(url) ||
      /\/share\/r\//.test(url) ||
      /\/videos\//.test(url) ||
      /\/reel\//.test(url) ||
      /\/story\.php/.test(url) ||
      /\/watch\//.test(url)
    ) {
      opts.onLog?.("Detected: Facebook single post");
      return await extractFacebookSinglePost(opts);
    }
    // Profile / page feed: facebook.com/<handle> or facebook.com/profile.php?id=...
    // Desktop FB 2026 moved post wrappers OFF `role='article'` (which is now
    // used for comments + side widgets). Posts live under elements carrying
    // `[data-ad-rendering-role='story_message']` / `[data-ad-comet-preview='message']`
    // for the body and `[data-ad-rendering-role='profile_name']` for the author
    // header. mbasic.facebook.com redirects to www in 2026 so the mobile path
    // is dead. We anchor on the message nodes and walk up to find each post
    // wrapper.
    if (
      /facebook\.com\/[^/?#]+\/?$/.test(url) ||
      /facebook\.com\/profile\.php/.test(url) ||
      /facebook\.com\/pages\//.test(url)
    ) {
      opts.onLog?.("Detected: Facebook profile feed — desktop message-anchor extractor");
      return await extractFacebookFeedDesktop(opts, maxProfilePosts);
    }
    return { posts: [], errors: [`Facebook URL not recognized for extraction: ${url}`] };
  }

  return { posts: [], errors: [`Extractor not yet implemented for platform: ${opts.platform}`] };
}
|
|
83
|
+
|
|
84
|
+
/**
 * Extract the single Instagram post currently open at /p/<shortcode>/ or
 * /reel/<shortcode>/.
 *
 * Reads the author handle, caption, large images and video sources from the
 * first <article> on the page, derives hashtags / mentions / hook / CTA from
 * the caption, and requests a vision description for the first image only.
 *
 * @param opts - `page` must already show the post; `source` is copied onto the post.
 * @returns One post on success. Errors raised during extraction are caught
 *   and reported through `errors` with an empty `posts` list.
 */
async function extractInstagramSinglePost(opts: ExtractOptions): Promise<ExtractResult> {
  const { page, source, onLog } = opts;
  try {
    onLog?.("Waiting for post content to render…");
    // Instagram's post pages wrap the article in <article role="presentation">.
    // A timeout here is deliberately ignored: the evaluate step below reports
    // the missing <article> with a clearer message.
    await page.waitForSelector("article", { timeout: 10_000 }).catch(() => {});

    // NOTE: this callback runs in the browser context, so it must not
    // reference anything from the enclosing Node scope.
    const data = await page.evaluate(() => {
      const article = document.querySelector("article");
      if (!article) return null;

      // Author handle: usually the first <a href="/xyz/"> inside header.
      let handle = "";
      const headerLink = article.querySelector("header a[href^='/']") as HTMLAnchorElement | null;
      if (headerLink) {
        handle = headerLink.getAttribute("href")?.replace(/^\//, "").replace(/\/$/, "") || "";
      }

      // Post text: the caption often lives in <h1> or a <span> in the second part of the article.
      let text = "";
      const h1 = article.querySelector("h1");
      if (h1) text = h1.textContent?.trim() || "";
      if (!text) {
        // Fallback: longest <span> text of more than 30 characters.
        const spans = Array.from(article.querySelectorAll("span"));
        const rich = spans
          .map((s) => s.textContent?.trim() || "")
          .filter((t) => t.length > 30)
          .sort((a, b) => b.length - a.length);
        if (rich[0]) text = rich[0];
      }

      // Media: all <img> with an http(s) src, filter out tiny profile pics.
      const imgs = Array.from(article.querySelectorAll("img")) as HTMLImageElement[];
      const mediaUrls = imgs
        .filter((img) => img.naturalWidth > 200 || img.width > 200)
        .map((img) => img.src)
        .filter((src) => src.startsWith("http"));

      // Video sources (reels)
      const videos = Array.from(article.querySelectorAll("video")) as HTMLVideoElement[];
      const videoUrls = videos
        .map((v) => v.src || v.currentSrc)
        .filter((u) => !!u && u.startsWith("http"));

      // Post type heuristic: more than one large image wins over "reel".
      let postType = "image";
      if (videoUrls.length > 0) postType = "reel";
      if (mediaUrls.length > 1) postType = "carousel";

      return {
        handle,
        text,
        mediaUrls,
        videoUrls,
        postType,
        url: window.location.href,
      };
    });

    if (!data) {
      return { posts: [], errors: ["Could not find post <article> on page"] };
    }

    // Extract hashtags + mentions from text.
    // Hashtags may contain any Unicode letter/mark/digit (e.g. "#Gefällt"),
    // which plain `\w` would truncate at the first non-ASCII character.
    const hashtags = Array.from(data.text.matchAll(/#([\p{L}\p{M}\p{N}_]+)/gu)).map((m) => m[1]);
    // Handles may contain inner dots ("@john.doe") but never end with one, so
    // sentence punctuation is left out. The lookbehind skips e-mail addresses
    // ("contact@example.com"), which are not mentions.
    const mentions = Array.from(
      data.text.matchAll(/(?<![\p{L}\p{N}_.])@([A-Za-z0-9_]+(?:\.[A-Za-z0-9_]+)*)/gu),
    ).map((m) => m[1]);
    const cta = detectCta(data.text);
    const hook = extractHook(data.text);

    // Claude Vision for the first image only (cheap, fast; the rest can be done on-demand).
    let visionDescription = "";
    if (data.mediaUrls.length > 0) {
      onLog?.("Requesting Claude Vision description…");
      visionDescription = await describeImageByUrl(data.mediaUrls[0]);
    }

    const id = `ig-${Date.now()}-${randomUUID().slice(0, 8)}`;

    const post: LibraryPost = {
      id,
      platform: "instagram",
      source,
      url: data.url,
      author: { handle: data.handle },
      text: data.text,
      hook,
      cta,
      hashtags,
      mentions,
      media: [
        ...data.mediaUrls.map((remoteUrl, i) => ({
          type: "image" as const,
          localPath: null,
          remoteUrl,
          // Only the first image was sent to the vision service.
          description: i === 0 ? visionDescription : "",
        })),
        ...data.videoUrls.map((remoteUrl) => ({
          type: "video" as const,
          localPath: null,
          remoteUrl,
          description: "",
        })),
      ],
      // Engagement numbers are not read from the Instagram DOM here.
      engagement: { likes: null, comments: null, shares: null, views: null, saves: null },
      engagementRate: null,
      postType: data.postType as LibraryPost["postType"],
      postedAt: null,
      tags: [],
      isGold: false,
      extractedAt: new Date().toISOString(),
      notes: "",
    };

    return { posts: [post], errors: [] };
  } catch (e) {
    return {
      posts: [],
      errors: [`Extraction failed: ${e instanceof Error ? e.message : String(e)}`],
    };
  }
}
|
|
207
|
+
|
|
208
|
+
/**
 * Walk an Instagram profile grid: collect up to `maxPosts` distinct post/reel
 * links, then visit each one in turn and run the single-post extractor on it.
 *
 * @param opts - Shared extraction options; `opts.page` is navigated to every post.
 * @param maxPosts - Maximum number of grid links to follow.
 * @returns All extracted posts plus every error met along the way. A failure
 *   on one post is recorded and does not stop the remaining posts.
 */
async function extractInstagramProfile(
  opts: ExtractOptions,
  maxPosts: number,
): Promise<ExtractResult> {
  const collected: LibraryPost[] = [];
  const failures: string[] = [];
  const browserPage = opts.page;
  const log = opts.onLog;

  try {
    await browserPage.waitForSelector("main a[href*='/p/'], main a[href*='/reel/']", {
      timeout: 10_000,
    });

    // IG lazy-loads the grid, so scroll down a few screens before reading it …
    let scrollsLeft = 8;
    while (scrollsLeft > 0) {
      await browserPage.evaluate(() => window.scrollBy(0, 1500));
      await browserPage.waitForTimeout(700);
      scrollsLeft -= 1;
    }
    // … and jump back to the top afterwards.
    await browserPage.evaluate(() => window.scrollTo(0, 0));
    await browserPage.waitForTimeout(400);

    // Runs in the browser: gather unique post/reel hrefs in document order.
    const links = await browserPage.evaluate((limit) => {
      const unique = new Set<string>();
      for (const anchor of Array.from(document.querySelectorAll("main a"))) {
        const target = anchor.getAttribute("href") || "";
        if (/\/p\/[^/]+\/?$/.test(target) || /\/reel\/[^/]+\/?$/.test(target)) {
          unique.add(target);
        }
      }
      return Array.from(unique).slice(0, limit);
    }, maxPosts);

    log?.(`Found ${links.length} posts on profile — extracting…`);

    for (let i = 0; i < links.length; i++) {
      const href = links[i];
      const position = i + 1; // 1-based counter for the progress log
      try {
        const postUrl = new URL(href, "https://www.instagram.com").toString();
        log?.(`${position}/${links.length}: ${postUrl}`);
        await browserPage.goto(postUrl, { waitUntil: "domcontentloaded" });

        const outcome = await extractInstagramSinglePost(opts);
        collected.push(...outcome.posts);
        failures.push(...outcome.errors);

        if (outcome.posts.length === 0) {
          log?.(`${position}/${links.length}: ✗ no post extracted`);
        } else {
          log?.(`${position}/${links.length}: ✓ saved "${(outcome.posts[0].text || "").slice(0, 60)}"`);
        }
      } catch (e) {
        const reason = e instanceof Error ? e.message : String(e);
        failures.push(`Failed on ${href}: ${reason}`);
        log?.(`${position}/${links.length}: ✗ ${reason}`);
      }
    }
  } catch (e) {
    failures.push(`Profile scan failed: ${e instanceof Error ? e.message : String(e)}`);
  }

  return { posts: collected, errors: failures };
}
|
|
265
|
+
|
|
266
|
+
// ─── Facebook ───────────────────────────────────────────────────────────────
|
|
267
|
+
// Facebook's DOM is extremely dynamic: class names are obfuscated and rotate
|
|
268
|
+
// every few weeks. The strategy is to lean on structural roles (role="article"),
|
|
269
|
+
// stable aria-labels, and heuristic scoring of candidate text/image nodes rather
|
|
270
|
+
// than brittle class selectors. The extractor expands "See more" if present so
|
|
271
|
+
// the caption is captured in full.
|
|
272
|
+
|
|
273
|
+
/**
 * Extract the Facebook post currently open in `opts.page`.
 *
 * Picks the most likely post container, then reads author, caption, large
 * images, video sources, the timestamp label and engagement counts from it.
 * Hashtags / mentions / hook / CTA are derived from the caption and a vision
 * description is requested for the first image only.
 *
 * @param opts - `page` must already show the post; `source` is copied onto the post.
 * @returns One post on success. If neither text nor media is found, or an
 *   error is raised during extraction, `posts` is empty and `errors` explains why.
 */
async function extractFacebookSinglePost(opts: ExtractOptions): Promise<ExtractResult> {
  const { page, source, onLog } = opts;
  try {
    onLog?.("Waiting for Facebook post to render…");
    // FB lazy-loads stories. Wait for any article-role container. A timeout
    // is ignored on purpose: the evaluate step falls back to document.body.
    await page
      .waitForSelector("div[role='article'], [data-pagelet*='FeedUnit']", {
        state: "attached",
        timeout: 10_000,
      })
      .catch(() => {});

    // Expand "See more" / "Mehr anzeigen" so we capture full text (best effort).
    await expandFacebookSeeMore(page).catch(() => {});

    // NOTE: this callback runs in the browser context, so it must not
    // reference anything from the enclosing Node scope.
    const data = await page.evaluate(() => {
      // Prefer the first article that contains a permalink-style link.
      const articles = Array.from(
        document.querySelectorAll("div[role='article']"),
      ) as HTMLElement[];
      const article = articles.find((a) => a.querySelector("a[href*='/posts/'], a[href*='/videos/'], a[href*='/permalink'], a[href*='/share/']"))
        || articles[0]
        || document.body;

      // Author handle: first link inside article pointing to a profile
      // (href starts with "/" and isn't a media or reaction link).
      let handle = "";
      let displayName = "";
      const authorLinks = Array.from(article.querySelectorAll("a[href^='/']")) as HTMLAnchorElement[];
      for (const a of authorLinks) {
        const href = a.getAttribute("href") || "";
        if (/^\/(photo|video|reel|share|hashtag|stories|groups|events|marketplace|watch)/.test(href))
          continue;
        if (/\/(posts|permalink|comments)/.test(href)) continue;
        // Normalize: /<handle>/ or /<handle>?
        const m = href.match(/^\/([^/?#]+)/);
        if (!m) continue;
        const text = (a.textContent || "").trim();
        // Links without text, or with very long text, are not author names.
        if (!text || text.length > 80) continue;
        handle = m[1];
        displayName = text;
        break;
      }

      // Post text: FB wraps the caption in a container marked via data-ad-comet-preview
      // or data-ad-preview="message". Fallback: longest text block inside article that
      // isn't an author name or reaction count.
      let text = "";
      const msgNode = article.querySelector(
        "[data-ad-comet-preview='message'], [data-ad-preview='message'], [data-testid='post_message']",
      );
      if (msgNode) text = (msgNode.textContent || "").trim();
      if (!text) {
        // Fallback: divs with dir="auto" whose text is longer than 40 chars
        // and is not just a number-like counter.
        const candidates = Array.from(article.querySelectorAll("div[dir='auto']"))
          .map((d) => (d.textContent || "").trim())
          .filter((t) => t.length > 40 && !/^[\d.,KM\s]+$/.test(t));
        // Sort by length desc and pick the longest.
        candidates.sort((a, b) => b.length - a.length);
        if (candidates[0]) text = candidates[0];
      }

      // Media: images larger than 200px, exclude the tiny author avatar.
      const imgs = Array.from(article.querySelectorAll("img")) as HTMLImageElement[];
      const mediaUrls = imgs
        .filter((img) => (img.naturalWidth || img.width) > 200)
        .map((img) => img.src)
        .filter((src) => src.startsWith("http"));

      // Videos — Facebook uses <video> elements for reels and video posts.
      const videos = Array.from(article.querySelectorAll("video")) as HTMLVideoElement[];
      const videoUrls = videos
        .map((v) => v.src || v.currentSrc)
        .filter((u) => !!u && u.startsWith("http"));

      // Timestamp: the permalink anchor often has an aria-label with the date.
      // The label is stored verbatim; it is not parsed into a date here.
      let postedAt: string | null = null;
      const timeLink = article.querySelector(
        "a[href*='/posts/'] span[aria-label], a[href*='/permalink'] span[aria-label], a[href*='/videos/'] span[aria-label]",
      );
      if (timeLink) {
        const label = timeLink.getAttribute("aria-label") || "";
        if (label) postedAt = label;
      }

      // Engagement: FB hides exact numbers behind aria-labels.
      // "X reactions" / "Y comments" / "Z shares".
      //
      // parseCount turns the first number in such a label into an integer.
      // It accepts both English ("1,234", "1.2K") and German ("1.234",
      // "1,5 Tsd.") notation.
      const parseCount = (s: string | null): number | null => {
        if (!s) return null;
        // - The number must start with a digit, so stray punctuation earlier
        //   in the label is not mistaken for it.
        // - The unit must not be followed by another letter/digit; otherwise
        //   the "K" of "12 Kommentare" or the "M" of "3 Mal geteilt" would be
        //   read as thousand / million.
        const m = s.match(
          /(\d[\d.,]*)(?:\s*(k|m|tsd|mio|million|thousand)(?![\p{L}\p{N}]))?/iu,
        );
        if (!m) return null;
        const unit = (m[2] || "").toLowerCase();
        // Drop sentence punctuation directly after the number ("… 12.").
        const digits = m[1].replace(/[.,]+$/, "");

        let n: number;
        if (unit) {
          // Abbreviated count: the last separator is the decimal mark in
          // either locale ("1.2K", "1,5 Tsd."); earlier ones are grouping.
          const cut = Math.max(digits.lastIndexOf("."), digits.lastIndexOf(","));
          n =
            cut === -1
              ? parseFloat(digits)
              : parseFloat(`${digits.slice(0, cut).replace(/[.,]/g, "")}.${digits.slice(cut + 1)}`);
        } else {
          // Full count: separators are thousands grouping ("1,234" / "1.234").
          n = parseFloat(digits.replace(/[.,]/g, ""));
        }
        if (!Number.isFinite(n)) return null;

        if (unit === "k" || unit === "tsd" || unit === "thousand") n *= 1_000;
        else if (unit) n *= 1_000_000; // "m", "mio", "million"
        return Math.round(n);
      };
      let likes: number | null = null;
      let comments: number | null = null;
      let shares: number | null = null;
      const reactionNode = article.querySelector(
        "[aria-label*='reaction'], [aria-label*='Reaktion'], [aria-label*='Gefällt']",
      );
      if (reactionNode) likes = parseCount(reactionNode.getAttribute("aria-label"));
      const commentNode = article.querySelector(
        "[aria-label*='comment'], [aria-label*='Kommentar']",
      );
      if (commentNode) comments = parseCount(commentNode.getAttribute("aria-label"));
      const shareNode = article.querySelector(
        "[aria-label*='share'], [aria-label*='Teilen'], [aria-label*='geteilt']",
      );
      if (shareNode) shares = parseCount(shareNode.getAttribute("aria-label"));

      // Post type: video beats images; several images count as a carousel.
      let postType = "text";
      if (videoUrls.length > 0) {
        postType = /\/reel\//.test(window.location.href) ? "reel" : "video";
      } else if (mediaUrls.length > 1) {
        postType = "carousel";
      } else if (mediaUrls.length === 1) {
        postType = "image";
      }

      return {
        handle,
        displayName,
        text,
        mediaUrls,
        videoUrls,
        postType,
        postedAt,
        likes,
        comments,
        shares,
        url: window.location.href,
      };
    });

    // The in-page callback always returns an object (it falls back to
    // document.body), so an empty result is the only "nothing found" signal.
    if (!data.text && data.mediaUrls.length === 0 && data.videoUrls.length === 0) {
      return {
        posts: [],
        errors: ["Facebook post appears empty (no text/media found) — DOM may have changed"],
      };
    }

    // Hashtags may contain any Unicode letter/mark/digit (e.g. "#Gefällt"),
    // which plain `\w` would truncate at the first non-ASCII character.
    const hashtags = Array.from(data.text.matchAll(/#([\p{L}\p{M}\p{N}_]+)/gu)).map((m) => m[1]);
    // Usernames may contain inner dots ("@john.doe") but never end with one.
    // The lookbehind skips e-mail addresses, which are not mentions.
    const mentions = Array.from(
      data.text.matchAll(/(?<![\p{L}\p{N}_.])@([A-Za-z0-9_]+(?:\.[A-Za-z0-9_]+)*)/gu),
    ).map((m) => m[1]);
    const cta = detectCta(data.text);
    const hook = extractHook(data.text);

    // Vision description for the first image only.
    let visionDescription = "";
    if (data.mediaUrls.length > 0) {
      onLog?.("Requesting Claude Vision description…");
      visionDescription = await describeImageByUrl(data.mediaUrls[0]);
    }

    const id = `fb-${Date.now()}-${randomUUID().slice(0, 8)}`;

    const post: LibraryPost = {
      id,
      platform: "facebook",
      source,
      url: data.url,
      author: {
        handle: data.handle,
        displayName: data.displayName || undefined,
      },
      text: data.text,
      hook,
      cta,
      hashtags,
      mentions,
      media: [
        ...data.mediaUrls.map((remoteUrl, i) => ({
          type: "image" as const,
          localPath: null,
          remoteUrl,
          // Only the first image was sent to the vision service.
          description: i === 0 ? visionDescription : "",
        })),
        ...data.videoUrls.map((remoteUrl) => ({
          type: "video" as const,
          localPath: null,
          remoteUrl,
          description: "",
        })),
      ],
      engagement: {
        likes: data.likes,
        comments: data.comments,
        shares: data.shares,
        views: null,
        saves: null,
      },
      engagementRate: null,
      postType: data.postType as LibraryPost["postType"],
      postedAt: data.postedAt,
      tags: [],
      isGold: false,
      extractedAt: new Date().toISOString(),
      notes: "",
    };

    return { posts: [post], errors: [] };
  } catch (e) {
    return {
      posts: [],
      errors: [`Facebook extraction failed: ${e instanceof Error ? e.message : String(e)}`],
    };
  }
}
|
|
487
|
+
|
|
488
|
+
// ─── Facebook profile feed — desktop (2026 layout) ─────────────────────────
|
|
489
|
+
// Posts on desktop FB 2026 are NOT wrapped in `role='article'` at top level
|
|
490
|
+
// (those are comments + side widgets). The reliable anchors are:
|
|
491
|
+
// - [data-ad-comet-preview='message'] — post body container
|
|
492
|
+
// - [data-ad-preview='message'] — same body, dual-attributed
|
|
493
|
+
// - [data-ad-rendering-role='story_message'] — message wrapper
|
|
494
|
+
// - [data-ad-rendering-role='profile_name'] — author header
|
|
495
|
+
// - [data-ad-rendering-role='like_button' | 'comment_button' | 'share_button']
|
|
496
|
+
// Strategy: collect every message anchor in the DOM, walk up to the closest
|
|
497
|
+
// ancestor that ALSO contains a profile_name AND a like_button — that's the
|
|
498
|
+
// post wrapper. Dedupe wrappers (we drop wrappers that contain other wrappers
|
|
499
|
+
// to keep only the inner-most one per post).
|
|
500
|
+
/**
 * Scrape posts from the Facebook desktop feed currently open in `opts.page`.
 *
 * Steps, in order:
 *  1. wait for a post-message marker, scroll to load more posts, expand
 *     "See more" toggles;
 *  2. inside the page, locate post wrappers, work out the dominant author,
 *     and extract text / media / engagement per wrapper;
 *  3. back in Node, append the author's own comments (fetched from each
 *     permalink), run vision on the first image, and build `LibraryPost`s.
 *
 * @param opts     Supplies `page` (the page to scan), `source` (copied onto
 *                 every post) and the optional `onLog` progress callback.
 * @param maxPosts Upper bound on the number of posts extracted from the DOM.
 * @returns `{ posts, errors }`. Failures inside the scan are appended to
 *          `errors` instead of being thrown.
 */
async function extractFacebookFeedDesktop(
  opts: ExtractOptions,
  maxPosts: number,
): Promise<ExtractResult> {
  const { page, source, onLog } = opts;
  const posts: LibraryPost[] = [];
  const errors: string[] = [];

  try {
    // Wait for at least one message marker to be in the DOM. These appear
    // once FB has hydrated the first post tile. Best-effort: a timeout is
    // tolerated here and reported later through the "no containers" error.
    await page
      .waitForSelector(
        "[data-ad-comet-preview='message'], [data-ad-preview='message'], [data-ad-rendering-role='story_message']",
        { state: "attached", timeout: 12_000 },
      )
      .catch(() => {});

    // Scroll to populate more posts (FB virtualises the feed).
    for (let i = 0; i < 18; i++) {
      await page.evaluate(() => window.scrollBy(0, 1500));
      await page.waitForTimeout(900);
    }
    await page.evaluate(() => window.scrollTo(0, 0));
    await page.waitForTimeout(400);

    // Click any "See more" / "Mehr anzeigen" toggles page-wide so captions
    // aren't truncated. The button is sometimes a sibling of the message
    // container, not a descendant — so we search the whole document. Repeat
    // a few times because expanded text may itself contain another toggle
    // (long posts get folded twice).
    for (let pass = 0; pass < 3; pass++) {
      const clicked = await page
        .evaluate(() => {
          const isToggle = (s: string) => {
            const t = s.trim().toLowerCase();
            return (
              t === "see more" ||
              t === "mehr anzeigen" ||
              t === "weiterlesen" ||
              t === "...mehr" ||
              t === "… mehr" ||
              t === "show more"
            );
          };
          const candidates = Array.from(
            document.querySelectorAll("div[role='button'], span[role='button'], span"),
          ) as HTMLElement[];
          let count = 0;
          for (const el of candidates) {
            if (!el.isConnected) continue;
            // The toggle is usually a leaf — the length cap skips elements
            // whose text merely ends with the trigger (e.g. a message body
            // that ends with "… Mehr anzeigen").
            const label = (el.textContent || "").trim();
            if (label.length > 30) continue;
            if (!isToggle(label)) continue;
            try {
              el.click();
              count++;
            } catch {
              /* noop */
            }
          }
          return count;
        })
        .catch(() => 0);
      if (!clicked) break;
      await page.waitForTimeout(700);
    }

    // Derive the handle we expect from the URL's first path segment, unless
    // that segment is one of FB's structural routes.
    const currentUrl = page.url();
    const handleMatch = currentUrl.match(/facebook\.com\/([^/?#]+)/);
    const expectedHandle =
      handleMatch &&
      !/^(profile\.php|pages|watch|story\.php|groups|marketplace|events)$/.test(handleMatch[1])
        ? handleMatch[1].toLowerCase()
        : null;
    if (expectedHandle) onLog?.(`Expected handle: ${expectedHandle}`);

    // NOTE: everything inside this callback runs in the browser page, so all
    // helpers it needs must be declared inside it.
    const raw = await page.evaluate(({ max, expectedHandle }) => {
      /**
       * Parse a count out of a label such as "3 Kommentare", "1.5K",
       * "1,5 Tsd." or "1.234". Returns null when no number is present.
       *
       * - A unit is only accepted when it is not the start of a longer word,
       *   so "3 Kommentare" is 3 (not 3 000) and "5 Mal geteilt" is 5.
       * - With a unit, a separator is a decimal mark ("1.5K" / "1,5 Tsd.").
       * - Without a unit, separators are thousands groupings
       *   ("1.234" / "1,234" → 1234).
       */
      const parseCount = (s: string | null): number | null => {
        if (!s) return null;
        const mm = s.match(
          /(\d[\d.,]*)(?:\s*(thousand|million|tsd|mio|k|m)(?![a-zäöüß]))?/i,
        );
        if (!mm) return null;
        // Drop sentence punctuation captured after the digits ("12." / "3,").
        const digits = mm[1].replace(/[.,]+$/, "");
        const unit = (mm[2] || "").toLowerCase();
        let n = unit
          ? parseFloat(digits.replace(",", "."))
          : parseInt(digits.replace(/[.,]/g, ""), 10);
        if (!Number.isFinite(n)) return null;
        if (unit === "k" || unit === "tsd" || unit === "thousand") n *= 1_000;
        else if (unit) n *= 1_000_000; // m / mio / million
        return Math.round(n);
      };

      /** Extract a profile handle (slug or numeric id) from an href, or null. */
      const extractHandleFromHref = (href: string): string | null => {
        if (/comment_id=/.test(href)) return null;
        const profIdMatch = href.match(/profile\.php\?id=(\d+)/);
        if (profIdMatch) return profIdMatch[1];
        // Strip protocol+host first so we always match against the path.
        let path = href;
        const httpMatch = href.match(/^https?:\/\/[^/]+(\/.*)$/);
        if (httpMatch) path = httpMatch[1];
        const slugMatch = path.match(/^\/([^/?#]+)/);
        if (!slugMatch) return null;
        const slug = slugMatch[1];
        if (
          /^(photo|video|reel|share|hashtag|stories|groups|events|marketplace|watch|posts|permalink|comments|browse|story\.php|home\.php|notifications|profile\.php|p|reels)$/i.test(
            slug,
          )
        ) {
          return null;
        }
        return slug;
      };

      /** Turn a site-relative href into an absolute facebook.com URL. */
      const toAbsolute = (href: string): string =>
        href.startsWith("http") ? href : `https://www.facebook.com${href}`;

      // 1) Collect all message anchors in document order.
      const messageNodes = Array.from(
        document.querySelectorAll(
          "[data-ad-comet-preview='message'], [data-ad-preview='message'], [data-ad-rendering-role='story_message']",
        ),
      ) as HTMLElement[];

      // 2) For each message, walk up to find the closest ancestor that
      //    contains profile_name + like_button + a post-permalink anchor
      //    (timestamp link). The permalink requirement is what stops us from
      //    picking a too-narrow wrapper that excludes the header.
      const wrapperSet = new Set<HTMLElement>();
      const messageByWrapper = new Map<HTMLElement, HTMLElement>();
      const hasPermalinkAnchor = (el: HTMLElement): boolean => {
        for (const a of Array.from(el.querySelectorAll("a[href]")) as HTMLAnchorElement[]) {
          const h = a.getAttribute("href") || "";
          if (/comment_id=/.test(h)) continue;
          if (/\/(posts|videos|reel|permalink|share\/p|share\/r|share\/v)\//.test(h)) return true;
          if (/\/(photo|photos)(\/|\?fbid=)/.test(h)) return true;
        }
        return false;
      };
      for (const msg of messageNodes) {
        let cur: HTMLElement | null = msg;
        for (let depth = 0; depth < 25 && cur; depth++) {
          cur = cur.parentElement;
          if (!cur) break;
          const hasProfile = !!cur.querySelector("[data-ad-rendering-role='profile_name']");
          const hasLike = !!cur.querySelector("[data-ad-rendering-role='like_button']");
          if (hasProfile && hasLike && hasPermalinkAnchor(cur)) {
            // Prefer inner-most wrapper: drop any already-collected wrapper
            // that contains this one, and skip this one if it contains an
            // already-collected (smaller) wrapper.
            let containsExisting = false;
            for (const w of Array.from(wrapperSet)) {
              if (w !== cur && w.contains(cur)) {
                wrapperSet.delete(w);
                messageByWrapper.delete(w);
              } else if (w !== cur && cur.contains(w)) {
                containsExisting = true;
              }
            }
            if (!containsExisting && !wrapperSet.has(cur)) {
              wrapperSet.add(cur);
              messageByWrapper.set(cur, msg);
            }
            break;
          }
        }
      }
      const wrappers = Array.from(wrapperSet);

      // 2.5) Detect the actual profile owner. FB does NOT necessarily use the
      //      URL slug (vanity) as the handle in author hrefs — a profile
      //      reachable via /aitrendz.xyz1 might be linked internally as
      //      /profile.php?id=NNN, /someother.slug, or display "René Remsik".
      //      So: collect the (handle, displayName) of every wrapper's
      //      profile_name link, count frequencies, and treat the dominant
      //      one as the page owner.
      const wrapperMeta = wrappers.map((wrap) => {
        const profile = wrap.querySelector(
          "[data-ad-rendering-role='profile_name']",
        ) as HTMLElement | null;
        let h = "";
        let dn = "";
        if (profile) {
          const links = Array.from(profile.querySelectorAll("a[href]")) as HTMLAnchorElement[];
          for (const a of links) {
            const slug = extractHandleFromHref(a.getAttribute("href") || "");
            if (!slug) continue;
            const t = (a.textContent || "").trim();
            if (!t || t.length > 80) continue;
            h = slug;
            dn = t;
            break;
          }
        }
        // Find permalink owner. Search ONLY within the wrapper itself —
        // walking ancestors picks up links from sibling posts (especially
        // the shared /stories/ links FB uses for the feed-virtualization
        // container) which would assign the same wrong permalink to multiple
        // posts.
        let permalinkOwner: string | null = null;
        let permalink: string | null = null;
        const allHrefs: string[] = [];
        for (const a of Array.from(wrap.querySelectorAll("a[href]")) as HTMLAnchorElement[]) {
          const href = a.getAttribute("href") || "";
          if (allHrefs.length < 12) allHrefs.push(href.slice(0, 200));
          if (/comment_id=/.test(href)) continue;
          const ownerMatch =
            href.match(/^\/([^/?#]+)\/(?:posts|videos|reel)\//) ||
            href.match(/facebook\.com\/([^/?#]+)\/(?:posts|videos|reel)\//);
          if (ownerMatch) {
            // An owner-qualified link is the strongest signal: it overrides
            // any weaker permalink found earlier and ends the search.
            permalinkOwner = ownerMatch[1].toLowerCase();
            permalink = toAbsolute(href);
            break;
          }
          if (/\/(permalink|share\/p|share\/r|share\/v)\//.test(href) && !permalink) {
            permalink = toAbsolute(href);
          }
          if (/\/(photo|photos)(\/|\?fbid=)/.test(href) && !permalink) {
            permalink = toAbsolute(href);
          }
        }
        return { wrap, handle: h, displayName: dn, permalinkOwner, permalink, debugHrefs: allHrefs };
      });

      // Tally handle frequency. Most-frequent wins as page owner — but only
      // if it dominates (≥ 30% of wrappers AND ≥ 2 occurrences). Otherwise
      // we treat the page as a heterogeneous feed (recommendations etc.) and
      // fall back to the URL-slug filter.
      const handleCounts = new Map<string, { count: number; displayName: string }>();
      for (const w of wrapperMeta) {
        if (!w.handle) continue;
        const key = w.handle.toLowerCase();
        const ex = handleCounts.get(key);
        if (ex) ex.count++;
        else handleCounts.set(key, { count: 1, displayName: w.displayName });
      }
      const ownerCounts = Array.from(handleCounts.entries()).sort(
        (a, b) => b[1].count - a[1].count,
      );
      const dominant = ownerCounts[0];
      let detectedOwner: string | null = null;
      let detectedOwnerName = "";
      if (dominant && dominant[1].count >= 2 && dominant[1].count / wrappers.length >= 0.3) {
        detectedOwner = dominant[0];
        detectedOwnerName = dominant[1].displayName;
      }

      // The "effective owner" used for filtering: detected owner if we have
      // one, else the URL-slug expectedHandle.
      const effectiveOwner =
        detectedOwner || (expectedHandle ? expectedHandle.toLowerCase() : null);

      const results: Array<{
        handle: string;
        displayName: string;
        text: string;
        mediaUrls: string[];
        videoUrls: string[];
        postType: string;
        postedAt: string | null;
        likes: number | null;
        comments: number | null;
        shares: number | null;
        permalink: string | null;
      }> = [];
      let skippedWrongHandle = 0;
      let skippedEmpty = 0;
      let skippedPinned = 0;
      const seenSig = new Set<string>();

      for (const meta of wrapperMeta) {
        if (results.length >= max) break;
        const wrap = meta.wrap;
        let handle = meta.handle;
        let displayName = meta.displayName;
        const permalink = meta.permalink;
        const permalinkOwner = meta.permalinkOwner;

        // Skip pinned posts. FB shows them at the top of profile feeds and
        // they can be months/years old. Look for the pin marker near the
        // header — usually a sibling of profile_name with text "Angeheftet"
        // / "Pinned post" / "Featured", or an aria-label hinting at it.
        const profileEl = wrap.querySelector(
          "[data-ad-rendering-role='profile_name']",
        ) as HTMLElement | null;
        let isPinned = false;
        if (profileEl) {
          const headerArea = profileEl.parentElement || profileEl;
          const headerText = (headerArea.innerText || "").toLowerCase();
          if (
            /^(angeheftet|pinned post|pinned|angeheftete? beitr|featured)$/m.test(headerText) ||
            /\b(angeheftet|pinned post|pinned|angepinnt|featured post)\b/.test(
              headerText.split("\n")[0] || "",
            )
          ) {
            isPinned = true;
          }
          // Also check for the explicit aria-label/title on the pin icon.
          if (
            headerArea.querySelector(
              "[aria-label*='Angeheftet' i], [aria-label*='Pinned' i], [aria-label*='angepinnt' i]",
            )
          ) {
            isPinned = true;
          }
        }
        if (isPinned) {
          skippedPinned++;
          continue;
        }

        // Filter: keep wrapper only if it belongs to the effective owner.
        // Match either by author handle or by permalink path-owner. Wrappers
        // with no owner information at all are kept.
        if (effectiveOwner) {
          const ownerMatchByPermalink = permalinkOwner && permalinkOwner === effectiveOwner;
          const ownerMatchByHandle = handle && handle.toLowerCase() === effectiveOwner;
          const haveOwnerInfo = !!permalinkOwner || !!handle;
          if (haveOwnerInfo && !ownerMatchByPermalink && !ownerMatchByHandle) {
            skippedWrongHandle++;
            continue;
          }
          // If matched by permalink but no handle was detected, fill in.
          if (ownerMatchByPermalink && !handle) {
            handle = effectiveOwner;
            if (!displayName && detectedOwnerName) displayName = detectedOwnerName;
          }
        }

        // Body text from the message node we tagged onto this wrapper.
        // Use innerText (NOT textContent) so we get only the visually rendered
        // characters. FB injects hidden <span style="display:none"> decoys
        // with random characters between the real ones as an anti-scraping
        // measure — textContent reads them all and produces gibberish like
        // "oeSrnodstp 0 809a"; innerText respects CSS visibility and yields
        // the clean rendered text.
        const msgNode = messageByWrapper.get(wrap);
        let text = "";
        if (msgNode) text = (msgNode.innerText || "").trim();

        // Note: own-comments are fetched in a SECOND pass via permalink tabs
        // (fetchOwnCommentsFromPermalink). Inline expansion via clicking
        // comment_button on the feed view proved unreliable: FB doesn't
        // always load the full thread inline, and "Verfasser"-tagged
        // own-comments often only render on the post's permalink page.

        // Posted-at: prefer aria-label of timestamp link.
        let postedAt: string | null = null;
        const tsNode = wrap.querySelector(
          "a[href*='/posts/'] [aria-label], a[href*='/permalink'] [aria-label], a[href*='/videos/'] [aria-label], a[href*='/reel/'] [aria-label]",
        );
        if (tsNode) {
          const lab = tsNode.getAttribute("aria-label") || "";
          if (lab) postedAt = lab;
        }

        // Engagement.
        let likes: number | null = null;
        let comments: number | null = null;
        let shares: number | null = null;
        const reactionNode = wrap.querySelector(
          "[aria-label*='Gefällt mir: '], [aria-label*='reactions'], [aria-label*='Reaktion']",
        );
        if (reactionNode) likes = parseCount(reactionNode.getAttribute("aria-label"));
        // Fallback: number adjacent to like_button.
        if (likes == null) {
          const likeBtn = wrap.querySelector("[data-ad-rendering-role='like_button']");
          if (likeBtn) {
            const sib = likeBtn.parentElement?.textContent?.trim() || "";
            likes = parseCount(sib);
          }
        }
        const commentBtn = wrap.querySelector("[data-ad-rendering-role='comment_button']");
        if (commentBtn) {
          // The visible count usually sits as text inside the button or as an
          // aria-label like "3 Kommentare".
          const lab = commentBtn.getAttribute("aria-label");
          if (lab) comments = parseCount(lab);
          if (comments == null) {
            comments = parseCount((commentBtn.textContent || "").trim());
          }
        }
        const shareBtn = wrap.querySelector("[data-ad-rendering-role='share_button']");
        if (shareBtn) {
          const lab = shareBtn.getAttribute("aria-label");
          if (lab) shares = parseCount(lab);
          if (shares == null) {
            shares = parseCount((shareBtn.textContent || "").trim());
          }
        }

        // Media: images >200px inside the wrapper, excluding profile photos
        // (which usually live inside [data-ad-rendering-role='profile_name'])
        // and emoji/safe_image hosts.
        const wrapClone = wrap.cloneNode(true) as HTMLElement;
        for (const p of Array.from(
          wrapClone.querySelectorAll("[data-ad-rendering-role='profile_name']"),
        )) {
          p.remove();
        }
        const imgs = Array.from(wrapClone.querySelectorAll("img")) as HTMLImageElement[];
        const mediaUrls = imgs
          .filter((img) => (img.naturalWidth || img.width) > 200)
          .map((img) => img.src)
          .filter((src) => src.startsWith("http") && !/emoji|safe_image\.php/i.test(src));
        const videos = Array.from(wrapClone.querySelectorAll("video")) as HTMLVideoElement[];
        const videoUrls = videos
          .map((v) => v.src || v.currentSrc)
          .filter((u) => !!u && u.startsWith("http"));

        if (!text && mediaUrls.length === 0 && videoUrls.length === 0) {
          skippedEmpty++;
          continue;
        }

        // Dedupe by text + first-image + permalink signature.
        const sig = (
          text.slice(0, 120) +
          "|" +
          (mediaUrls[0] || "") +
          "|" +
          (permalink || "")
        ).trim();
        if (sig && seenSig.has(sig)) continue;
        if (sig) seenSig.add(sig);

        let postType = "text";
        if (videoUrls.length > 0) postType = "video";
        else if (mediaUrls.length > 1) postType = "carousel";
        else if (mediaUrls.length === 1) postType = "image";

        results.push({
          handle,
          displayName,
          text,
          mediaUrls,
          videoUrls,
          postType,
          postedAt,
          likes,
          comments,
          shares,
          permalink,
        });
      }

      // Per-wrapper diagnostic list so we can see WHAT was found, not just
      // counts. Helps when a profile uses a non-obvious handle (e.g. URL slug
      // ≠ author handle, vanity URLs, numeric profile IDs).
      const wrapperDiag = wrapperMeta.map((w) => ({
        handle: w.handle,
        displayName: w.displayName,
        permalinkOwner: w.permalinkOwner,
        permalink: w.permalink,
        debugHrefs: w.debugHrefs,
      }));

      return {
        items: results,
        diag: {
          messageNodes: messageNodes.length,
          wrappers: wrappers.length,
          skippedWrongHandle,
          skippedEmpty,
          skippedPinned,
          detectedOwner,
          detectedOwnerName,
          effectiveOwner,
          ownerCounts: ownerCounts
            .slice(0, 5)
            .map(([h, v]) => ({ handle: h, count: v.count, displayName: v.displayName })),
          wrapperDiag,
        },
      };
    }, { max: maxPosts, expectedHandle });

    onLog?.(
      `Scan: ${raw.diag.messageNodes} msg-anchors, ${raw.diag.wrappers} wrappers ` +
        `(pinned: ${raw.diag.skippedPinned}, wrong-handle: ${raw.diag.skippedWrongHandle}, ` +
        `empty: ${raw.diag.skippedEmpty}, kept: ${raw.items.length})`,
    );
    if (raw.diag.detectedOwner) {
      onLog?.(`Detected owner: @${raw.diag.detectedOwner} ("${raw.diag.detectedOwnerName}") — using this instead of URL slug`);
    } else if (raw.diag.ownerCounts.length > 0) {
      onLog?.(
        `No dominant owner detected. Top handles: ` +
          raw.diag.ownerCounts.map((o) => `@${o.handle}×${o.count}`).join(", "),
      );
    }

    // Dump per-wrapper details to the existing /tmp/fb-debug.html.
    // Best-effort diagnostics: a write failure must not abort the scan.
    try {
      const fs = await import("node:fs/promises");
      // Scraped values are embedded in HTML comments; neutralise "-->" so a
      // value cannot terminate the comment early.
      const commentSafe = (s: string): string => s.replace(/-->/g, "--&gt;");
      const lines = raw.diag.wrapperDiag.map((w, i) => {
        const head =
          `<!-- wrap[${i}] handle="${commentSafe(w.handle)}" name="${commentSafe(w.displayName)}" ` +
          `permalinkOwner="${commentSafe(w.permalinkOwner || "")}" permalink="${commentSafe(w.permalink || "")}" -->`;
        const hrefs = (w.debugHrefs || [])
          .map((h) => `<!-- href: ${commentSafe(h)} -->`)
          .join("\n");
        return head + (hrefs ? "\n" + hrefs : "");
      });
      const extra =
        `\n\n<!-- desktop-extractor wrapper diag (${raw.diag.wrappers} total) -->\n` +
        `<!-- effectiveOwner=${raw.diag.effectiveOwner} detectedOwner=${raw.diag.detectedOwner} -->\n` +
        lines.join("\n");
      await fs.appendFile("/tmp/fb-debug.html", extra);
    } catch {
      /* noop */
    }

    if (raw.items.length === 0) {
      if (raw.diag.messageNodes === 0) {
        errors.push(
          "No post-message containers found on the page. The profile may be empty, private, " +
            "or Facebook may be showing a non-feed view (notifications, settings, etc.). " +
            "Open the profile in the browser, scroll until posts are visible, then retry.",
        );
      } else if (raw.diag.skippedWrongHandle > 0 && expectedHandle) {
        errors.push(
          `No posts for @${expectedHandle} — ${raw.diag.skippedWrongHandle} candidates belonged to other profiles.`,
        );
      } else {
        errors.push("Found post wrappers but none had extractable text or media.");
      }
      return { posts, errors };
    }

    let idx = 0;
    for (const item of raw.items) {
      idx++;
      onLog?.(`${idx}/${raw.items.length}: ${item.handle ? `@${item.handle}` : "(no handle)"} — "${(item.text || "(no text)").slice(0, 60)}"`);

      // Fetch the post author's own comments via the permalink page. FB's
      // feed view often doesn't render the full comment thread inline, but
      // the dedicated permalink page does — and own-comments are explicitly
      // tagged with a "Verfasser" / "Author" badge there.
      if (item.permalink && raw.diag.effectiveOwner) {
        try {
          const ownComments = await fetchOwnCommentsFromPermalink(
            page,
            item.permalink,
            raw.diag.effectiveOwner,
            raw.diag.detectedOwnerName || item.displayName || "",
          );
          if (ownComments.length > 0) {
            item.text = (item.text + "\n\n" + ownComments.map((c) => `[Eigener Kommentar]\n${c}`).join("\n\n")).trim();
            onLog?.(`${idx}/${raw.items.length}: + ${ownComments.length} eigene Kommentare`);
          }
        } catch (e) {
          onLog?.(`${idx}/${raw.items.length}: comment fetch failed (${e instanceof Error ? e.message : String(e)})`);
        }
      }

      // Derived fields are computed AFTER own-comments were appended, so
      // hashtags/mentions/CTA found in those comments are included.
      const hashtags = Array.from(item.text.matchAll(/#(\w+)/g)).map((m) => m[1]);
      const mentions = Array.from(item.text.matchAll(/@(\w+)/g)).map((m) => m[1]);
      const cta = detectCta(item.text);
      const hook = extractHook(item.text);

      // Only the first image is described; a vision failure is logged and
      // leaves the description empty.
      let visionDescription = "";
      if (item.mediaUrls.length > 0) {
        onLog?.(`${idx}/${raw.items.length}: vision describe…`);
        try {
          visionDescription = await describeImageByUrl(item.mediaUrls[0]);
        } catch (e) {
          onLog?.(`${idx}/${raw.items.length}: vision failed (${e instanceof Error ? e.message : String(e)})`);
        }
      }

      const id = `fb-${Date.now()}-${randomUUID().slice(0, 8)}`;
      const post: LibraryPost = {
        id,
        platform: "facebook",
        source,
        url: item.permalink || page.url(),
        author: { handle: item.handle, displayName: item.displayName || undefined },
        text: item.text,
        hook,
        cta,
        hashtags,
        mentions,
        media: [
          ...item.mediaUrls.map((remoteUrl, i) => ({
            type: "image" as const,
            localPath: null,
            remoteUrl,
            description: i === 0 ? visionDescription : "",
          })),
          ...item.videoUrls.map((remoteUrl) => ({
            type: "video" as const,
            localPath: null,
            remoteUrl,
            description: "",
          })),
        ],
        engagement: {
          likes: item.likes,
          comments: item.comments,
          shares: item.shares,
          views: null,
          saves: null,
        },
        engagementRate: null,
        postType: item.postType as LibraryPost["postType"],
        postedAt: item.postedAt,
        tags: [],
        isGold: false,
        extractedAt: new Date().toISOString(),
        notes: "",
      };
      posts.push(post);
    }
  } catch (e) {
    errors.push(`Facebook desktop feed scan failed: ${e instanceof Error ? e.message : String(e)}`);
  }

  return { posts, errors };
}
|
|
1086
|
+
|
|
1087
|
+
/**
 * Open a post's permalink in a background tab, expand the comment thread,
 * and return the bodies of comments that carry FB's "Verfasser" / "Author"
 * badge (i.e. comments written by the post's author). The permalink page
 * renders the full thread, which the feed view often omits.
 *
 * @param basePage         Page whose browser context is used to open the tab.
 * @param permalink        Absolute URL of the post.
 * @param ownerHandle      Currently not consulted: own-comments are
 *                         identified by the badge alone (see below). Kept so
 *                         existing callers remain valid.
 * @param ownerDisplayName Currently not consulted, for the same reason.
 * @returns Up to 12 cleaned comment bodies, de-duplicated by their first 100
 *          characters. Navigation or extraction errors propagate to the
 *          caller; the tab is always closed.
 */
async function fetchOwnCommentsFromPermalink(
  basePage: Page,
  permalink: string,
  ownerHandle: string,
  ownerDisplayName: string,
): Promise<string[]> {
  const tab = await basePage.context().newPage();
  try {
    await tab.goto(permalink, { waitUntil: "domcontentloaded", timeout: 25_000 });
    await tab.waitForTimeout(2000);

    // Click "View previous comments" / "Weitere Kommentare anzeigen" /
    // "Antworten anzeigen" repeatedly to surface deeper threads. Stops as
    // soon as a pass finds nothing left to click.
    for (let pass = 0; pass < 6; pass++) {
      const clicked = await tab
        .evaluate(() => {
          const candidates = Array.from(
            document.querySelectorAll("div[role='button'], span[role='button'], span"),
          ) as HTMLElement[];
          const re = /^(weitere kommentare anzeigen|view more comments|alle kommentare anzeigen|view all comments|previous comments|vorherige kommentare|antworten anzeigen|view replies?|view all \d+ replies?|\d+ antworten|\d+ replies?|kommentar(e)? anzeigen)$/i;
          let n = 0;
          for (const el of candidates) {
            if (!el.isConnected) continue;
            const t = (el.textContent || "").trim();
            if (t.length === 0 || t.length > 60) continue;
            if (re.test(t)) {
              try {
                el.click();
                n++;
              } catch {
                /* noop */
              }
            }
          }
          return n;
        })
        .catch(() => 0);
      if (!clicked) break;
      await tab.waitForTimeout(900);
    }

    // Click "See more" inside any truncated comments. Same label set as the
    // feed extractor, including the "… mehr" variant.
    await tab
      .evaluate(() => {
        const labels = new Set([
          "see more",
          "mehr anzeigen",
          "weiterlesen",
          "...mehr",
          "… mehr",
          "show more",
        ]);
        const candidates = Array.from(
          document.querySelectorAll("div[role='button'], span[role='button'], span"),
        ) as HTMLElement[];
        for (const el of candidates) {
          if (!el.isConnected) continue;
          const t = (el.textContent || "").trim();
          if (t.length > 30) continue;
          if (labels.has(t.toLowerCase())) {
            try {
              el.click();
            } catch {
              /* noop */
            }
          }
        }
      })
      .catch(() => {});
    await tab.waitForTimeout(500);

    const comments = await tab.evaluate(() => {
      // Strategy: walk ALL [role='article'] elements that have a
      // "Verfasser" / "Author" badge. FB tags comments by the post author
      // with that badge regardless of who reshared the post — so this is
      // more reliable than matching against an upstream-detected owner
      // handle (which would be the resharer's handle, not the original
      // author's, on shared posts).
      const articles = Array.from(document.querySelectorAll("[role='article']")) as HTMLElement[];
      const results: string[] = [];
      const seen = new Set<string>();
      let postAuthorName = "";

      const isBadgeText = (el: Element): boolean => {
        const t = (el.textContent || "").trim();
        return t === "Verfasser" || t === "Author";
      };

      for (const c of articles) {
        if (results.length >= 12) break;

        const al = (c.getAttribute("aria-label") || "").trim();
        // Comment articles only — skip the post itself (no aria-label or
        // a non-comment aria-label like "Beitrag von X").
        if (!/^(Kommentar|Comment|Reply|Antwort)/i.test(al)) continue;

        // Badge elements that belong to THIS comment. Badges sitting inside
        // a nested reply article are excluded, otherwise a foreign comment
        // with an author reply underneath would count as an own-comment.
        const ownBadges = (Array.from(c.querySelectorAll("span, div")) as HTMLElement[]).filter(
          (el) => isBadgeText(el) && el.closest("[role='article']") === c,
        );
        if (ownBadges.length === 0) continue;

        // Capture the post-author display name from aria-label
        // (= "Kommentar von <Name> (...)" / "Comment by <Name>").
        if (!postAuthorName) {
          const m = al.match(/^(?:Kommentar von|Comment by|Antwort von|Reply by)\s+(.+?)(?:\s*\(.*\))?$/i);
          if (m) postAuthorName = m[1].trim();
        }

        // Body: read innerText from the LIVE element. On a detached clone
        // the element is not rendered, so innerText degrades to textContent:
        // no line breaks from layout and hidden text included — which would
        // defeat the line-based cleanup below. Instead, temporarily hide the
        // parts we don't want (nested reply articles and the badge), read
        // the rendered text, then restore the inline styles.
        const toHide: HTMLElement[] = [
          ...(Array.from(c.querySelectorAll("[role='article']")) as HTMLElement[]),
          ...ownBadges,
        ];
        const saved = toHide.map((el) => ({
          el,
          value: el.style.getPropertyValue("display"),
          priority: el.style.getPropertyPriority("display"),
        }));
        let body = "";
        try {
          for (const el of toHide) el.style.setProperty("display", "none", "important");
          body = (c.innerText || "").trim();
        } finally {
          // An empty value removes the inline property again.
          for (const s of saved) s.el.style.setProperty("display", s.value, s.priority);
        }

        body = body
          .split("\n")
          .map((l) => l.trimEnd())
          .filter((line, idx, arr) => {
            const tr = line.trim();
            if (!tr) return false;
            // The author name typically remains as one of the first lines;
            // drop it when it stands alone there.
            if (idx < 2 && postAuthorName && tr === postAuthorName) return false;
            // Drop trailing meta lines: timestamp, "Antworten", "Gefällt mir"…
            if (/^\d+\s*(Min|Std|Tag|Tagen|Wo|Mon|Jahr|min|h|d|w)\.?$/i.test(tr)) return false;
            if (/^vor\s+\d+\s+(Min|Stunden?|Tagen?|Wochen?|Monaten?|Jahren?)/i.test(tr)) return false;
            if (tr === "Antworten" || tr === "Reply" || tr === "Gefällt mir" || tr === "Like" || tr === "Teilen" || tr === "Share") return false;
            // Last few lines: "Bearbeitet" / "Edited" markers
            if (idx >= arr.length - 3 && (tr === "Bearbeitet" || tr === "Edited")) return false;
            return true;
          })
          .join("\n")
          .trim();
        // Very short bodies are skipped.
        if (body.length < 10) continue;
        const sig = body.slice(0, 100);
        if (seen.has(sig)) continue;
        seen.add(sig);
        results.push(body);
      }
      return results;
    });

    return comments;
  } finally {
    await tab.close().catch(() => { /* noop */ });
  }
}
|
|
1224
|
+
|
|
1225
|
+
/**
 * Expands truncated text in the first `div[role='article']` on the page.
 *
 * Inside the page context, the first button-like `div` or `span` whose
 * trimmed, lower-cased text is exactly one of the known "see more" labels
 * (English / German) is clicked; at most one element is clicked. Afterwards
 * the function always pauses 400 ms, whether or not anything was clicked.
 *
 * @param page - Browser page handle exposing `evaluate` and `waitForTimeout`.
 */
async function expandFacebookSeeMore(page: Page): Promise<void> {
  await page.evaluate(() => {
    // Everything used here must live inside this callback: it runs in the page.
    const root = document.querySelector("div[role='article']");
    if (root === null) return;

    const toggleLabels = ["see more", "mehr anzeigen", "weiterlesen", "...mehr"];
    const nodes = Array.from(
      root.querySelectorAll<HTMLElement>("div[role='button'], span"),
    );
    const toggle = nodes.find((node) =>
      toggleLabels.includes((node.textContent || "").trim().toLowerCase()),
    );
    if (toggle) toggle.click();
  });
  await page.waitForTimeout(400);
}
|
|
1241
|
+
|
|
1242
|
+
// ─── Helpers ────────────────────────────────────────────────────────────────
|
|
1243
|
+
|
|
1244
|
+
/**
 * Extracts a short "hook" from the start of a post.
 *
 * Only the first line of the trimmed text is considered. If that line
 * contains a sentence terminator (`.`, `!` or `?`) followed by whitespace,
 * the hook is the text up to and including the first such terminator;
 * otherwise it is the whole first line. The result is capped at 140 UTF-16
 * code units and trimmed.
 *
 * @param text - Raw post text.
 * @returns The hook, or an empty string when `text` is blank.
 */
function extractHook(text: string): string {
  const maxLength = 140;
  const trimmed = text.trim();
  const firstLineBreak = trimmed.indexOf("\n");
  const firstLine = firstLineBreak > 0 ? trimmed.slice(0, firstLineBreak) : trimmed;
  const firstSentenceMatch = firstLine.match(/^(.+?[.!?])\s/);
  const candidate = firstSentenceMatch ? firstSentenceMatch[1] : firstLine;

  let hook = candidate.slice(0, maxLength);
  if (candidate.length > maxLength) {
    // The cut may land between the two halves of a surrogate pair (e.g. an
    // emoji). Drop a dangling high surrogate so the hook stays valid text.
    const lastUnit = hook.charCodeAt(hook.length - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) hook = hook.slice(0, -1);
  }
  return hook.trim();
}
|
|
1253
|
+
|
|
1254
|
+
/**
 * Heuristic call-to-action finder.
 *
 * Looks at the last four non-empty lines of `text`, bottom-up, and returns
 * the first one that ends with a question mark, contains a "link in bio"
 * style phrase, or starts with one of a few imperative prefixes
 * (German / English). Matching is case-insensitive.
 *
 * @param text - Post text.
 * @returns The matching (trimmed) line, or `null` when none qualifies.
 */
function detectCta(text: string): string | null {
  const nonEmpty = text
    .split(/\n+/)
    .map((raw) => raw.trim())
    .filter(Boolean);

  // CTAs usually sit at the bottom, so scan the tail in reverse order.
  const tail = nonEmpty.slice(-4).reverse();
  for (const entry of tail) {
    const isQuestion = /\?$/.test(entry);
    const hasLinkPhrase = /link in bio|link in der bio|tap the link|mehr im link|swipe up/i.test(entry);
    const isImperative = /^(kommentiere|schreib|teile|kommentier|tag|follow|folg|speicher)/i.test(entry);
    if (isQuestion || hasLinkPhrase || isImperative) {
      return entry;
    }
  }
  return null;
}
|
|
1267
|
+
|
|
1268
|
+
/**
 * Media download helper.
 *
 * Per the original note this is kept for optional future use (vision is
 * currently URL-based) so that local copies are available for offline
 * training.
 *
 * Fetches `_url` and writes the response body to
 * `<MEDIA_ROOT>/<_postId>-<_index>.<ext>`, where `ext` is derived from the
 * response's `content-type` header via `guessExt`.
 *
 * @param _url - Address of the media file to fetch.
 * @param _postId - Identifier used as the file-name prefix.
 * @param _index - Position of the media item, used as the file-name suffix.
 * @returns The path of the written file, or `null` when the response is not
 *   OK or when fetching/writing throws.
 */
export async function downloadImage(
  _url: string,
  _postId: string,
  _index: number,
): Promise<string | null> {
  // Runs outside the try block, so a failure here propagates to the caller.
  ensureDirs();
  try {
    const response = await fetch(_url);
    if (response.ok) {
      const bytes = Buffer.from(await response.arrayBuffer());
      const extension = guessExt(response.headers.get("content-type"));
      const target = join(MEDIA_ROOT, `${_postId}-${_index}.${extension}`);
      writeFileSync(target, bytes);
      return target;
    }
    return null;
  } catch {
    // Best-effort download: any network or file-system error yields null.
    return null;
  }
}
|
|
1289
|
+
|
|
1290
|
+
/**
 * Maps a `Content-Type` header value to a file extension.
 *
 * The value is matched case-insensitively (MIME type names are not
 * case-sensitive) by substring, so parameters such as `; charset=...` do not
 * interfere. Entries are checked in order and the first hit wins.
 *
 * @param ct - Header value, or `null` when the header is absent.
 * @returns `"jpg"`, `"png"`, `"webp"`, `"gif"` or `"mp4"`; `"bin"` when the
 *   value is missing, empty, or unrecognised.
 */
function guessExt(ct: string | null): string {
  if (!ct) return "bin";
  const normalized = ct.toLowerCase();
  const knownTypes: ReadonlyArray<readonly [string, string]> = [
    ["jpeg", "jpg"],
    ["jpg", "jpg"],
    ["png", "png"],
    ["webp", "webp"],
    ["gif", "gif"],
    ["mp4", "mp4"],
  ];
  for (const [needle, ext] of knownTypes) {
    if (normalized.includes(needle)) return ext;
  }
  return "bin";
}
|