heyhank 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (156) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +83 -10
  3. package/bin/cli.ts +7 -7
  4. package/bin/ctl.ts +42 -42
  5. package/dist/assets/{AgentsPage-BPhirnCe.js → AgentsPage-B-AAmsMK.js} +3 -3
  6. package/dist/assets/AssistantPage-BV1Mfwdt.js +2 -0
  7. package/dist/assets/BusinessPage-tLpNEz19.js +1 -0
  8. package/dist/assets/{CronManager-DDbz-yiT.js → CronManager-B-K_n3Jg.js} +1 -1
  9. package/dist/assets/HelpPage-Bhf_j6Xr.js +1 -0
  10. package/dist/assets/{IntegrationsPage-CrOitCmJ.js → IntegrationsPage-DAMjs9tM.js} +1 -1
  11. package/dist/assets/JarvisHUD-C_TGXCCn.js +120 -0
  12. package/dist/assets/MediaPage-C48HTTrt.js +1 -0
  13. package/dist/assets/MemoryPage-JkC-qtgp.js +1 -0
  14. package/dist/assets/{PlatformDashboard-Do6F0O2p.js → PlatformDashboard-AUo7tNnE.js} +1 -1
  15. package/dist/assets/{Playground-Fc5cdc5p.js → Playground-AzNMsRBL.js} +1 -1
  16. package/dist/assets/{ProcessPanel-CslEiZkI.js → ProcessPanel-DpE_2sX3.js} +1 -1
  17. package/dist/assets/{PromptsPage-D2EhsdNO.js → PromptsPage-C2RQOs6p.js} +2 -2
  18. package/dist/assets/RunsPage-B9UOyO79.js +1 -0
  19. package/dist/assets/{SandboxManager-a1AVI5q2.js → SandboxManager-jHvYjwfh.js} +1 -1
  20. package/dist/assets/SettingsPage-BBJax6gt.js +51 -0
  21. package/dist/assets/SkillsMarketplace-IjmjfdjD.js +1 -0
  22. package/dist/assets/SocialMediaPage-DoPZHhr2.js +10 -0
  23. package/dist/assets/{TailscalePage-CHiFhZXF.js → TailscalePage-DDEY7ckO.js} +1 -1
  24. package/dist/assets/TelephonyPage-OPNBZYKt.js +9 -0
  25. package/dist/assets/{TerminalPage-Drwyrnfd.js → TerminalPage-BjMbHHW3.js} +1 -1
  26. package/dist/assets/{gemini-live-client-C7rqAW7G.js → gemini-live-client-C70FEtX2.js} +11 -8
  27. package/dist/assets/{index-CEqZnThB.js → index-BgYM4wXw.js} +94 -93
  28. package/dist/assets/index-BkjSoVgn.css +32 -0
  29. package/dist/assets/sw-register-C7NOHtIu.js +1 -0
  30. package/dist/assets/text-chat-client-BSbLJerZ.js +2 -0
  31. package/dist/index.html +2 -2
  32. package/dist/sw.js +1 -1
  33. package/package.json +6 -1
  34. package/server/agent-executor.ts +37 -2
  35. package/server/agent-store.ts +3 -3
  36. package/server/agent-types.ts +11 -0
  37. package/server/assistant-store.ts +232 -6
  38. package/server/auth-manager.ts +9 -0
  39. package/server/cache-headers.ts +1 -1
  40. package/server/calendar-service.ts +10 -0
  41. package/server/ceo/document-store.ts +129 -0
  42. package/server/ceo/finance-store.ts +343 -0
  43. package/server/ceo/kpi-store.ts +208 -0
  44. package/server/ceo/memory-import.ts +277 -0
  45. package/server/ceo/news-store.ts +208 -0
  46. package/server/ceo/template-store.ts +134 -0
  47. package/server/ceo/time-tracking-store.ts +227 -0
  48. package/server/claude-auth-monitor.ts +128 -0
  49. package/server/claude-code-worker.ts +86 -0
  50. package/server/claude-session-discovery.ts +74 -1
  51. package/server/cli-launcher.ts +32 -10
  52. package/server/codex-adapter.ts +2 -2
  53. package/server/codex-ws-proxy.cjs +1 -1
  54. package/server/container-manager.ts +4 -4
  55. package/server/content-intelligence/content-engine.ts +1112 -0
  56. package/server/content-intelligence/platform-knowledge.ts +870 -0
  57. package/server/cron-store.ts +3 -3
  58. package/server/embedding-service.ts +49 -0
  59. package/server/event-bus-types.ts +13 -0
  60. package/server/federation/node-store.ts +5 -4
  61. package/server/fs-utils.ts +28 -1
  62. package/server/hank-notifications-store.ts +91 -0
  63. package/server/hank-tool-executor.ts +1835 -0
  64. package/server/hank-tools.ts +2107 -0
  65. package/server/image-pull-manager.ts +2 -2
  66. package/server/index.ts +25 -2
  67. package/server/llm-providers-streaming.ts +541 -0
  68. package/server/llm-providers.ts +12 -0
  69. package/server/marketplace.ts +249 -0
  70. package/server/mcp-registry.ts +158 -0
  71. package/server/memory-service.ts +296 -0
  72. package/server/obsidian-sync.ts +184 -0
  73. package/server/provider-manager.ts +5 -2
  74. package/server/provider-registry.ts +12 -0
  75. package/server/reminder-scheduler.ts +37 -1
  76. package/server/routes/agent-routes.ts +2 -1
  77. package/server/routes/assistant-routes.ts +198 -5
  78. package/server/routes/ceo-finance-kpi-routes.ts +167 -0
  79. package/server/routes/ceo-news-time-routes.ts +137 -0
  80. package/server/routes/ceo-routes.ts +99 -0
  81. package/server/routes/content-routes.ts +116 -0
  82. package/server/routes/email-routes.ts +147 -0
  83. package/server/routes/env-routes.ts +3 -3
  84. package/server/routes/fs-routes.ts +12 -9
  85. package/server/routes/hank-chat-routes.ts +592 -0
  86. package/server/routes/llm-routes.ts +12 -0
  87. package/server/routes/marketplace-routes.ts +63 -0
  88. package/server/routes/media-routes.ts +1 -1
  89. package/server/routes/memory-routes.ts +127 -0
  90. package/server/routes/platform-routes.ts +14 -675
  91. package/server/routes/sandbox-routes.ts +1 -1
  92. package/server/routes/settings-routes.ts +51 -1
  93. package/server/routes/socialmedia-routes.ts +152 -2
  94. package/server/routes/system-routes.ts +2 -2
  95. package/server/routes/team-routes.ts +71 -0
  96. package/server/routes/telephony-routes.ts +98 -18
  97. package/server/routes.ts +36 -9
  98. package/server/session-creation-service.ts +2 -2
  99. package/server/session-orchestrator.ts +54 -2
  100. package/server/session-types.ts +2 -0
  101. package/server/settings-manager.ts +50 -2
  102. package/server/skill-discovery.ts +68 -0
  103. package/server/socialmedia/adapters/browser-adapter.ts +179 -0
  104. package/server/socialmedia/adapters/postiz-adapter.ts +291 -14
  105. package/server/socialmedia/manager.ts +234 -15
  106. package/server/socialmedia/store.ts +51 -1
  107. package/server/socialmedia/types.ts +35 -2
  108. package/server/socialview/browser-manager.ts +150 -0
  109. package/server/socialview/extractors.ts +1298 -0
  110. package/server/socialview/image-describe.ts +188 -0
  111. package/server/socialview/library.ts +119 -0
  112. package/server/socialview/poster.ts +276 -0
  113. package/server/socialview/routes.ts +371 -0
  114. package/server/socialview/style-analyzer.ts +187 -0
  115. package/server/socialview/style-profiles.ts +67 -0
  116. package/server/socialview/types.ts +166 -0
  117. package/server/socialview/vision.ts +127 -0
  118. package/server/socialview/vnc-manager.ts +110 -0
  119. package/server/style-injector.ts +135 -0
  120. package/server/team-service.ts +239 -0
  121. package/server/team-store.ts +75 -0
  122. package/server/team-types.ts +52 -0
  123. package/server/telephony/audio-bridge.ts +281 -35
  124. package/server/telephony/audio-recorder.ts +132 -0
  125. package/server/telephony/call-manager.ts +803 -104
  126. package/server/telephony/call-types.ts +67 -1
  127. package/server/telephony/esl-client.ts +319 -0
  128. package/server/telephony/freeswitch-sync.ts +155 -0
  129. package/server/telephony/phone-utils.ts +63 -0
  130. package/server/telephony/telephony-store.ts +9 -8
  131. package/server/url-validator.ts +82 -0
  132. package/server/vault-markdown.ts +317 -0
  133. package/server/vault-migration.ts +121 -0
  134. package/server/vault-store.ts +466 -0
  135. package/server/vault-watcher.ts +59 -0
  136. package/server/vector-store.ts +210 -0
  137. package/server/voice-pipeline/gemini-live-adapter.ts +97 -0
  138. package/server/voice-pipeline/greeting-cache.ts +200 -0
  139. package/server/voice-pipeline/manager.ts +249 -0
  140. package/server/voice-pipeline/pipeline.ts +335 -0
  141. package/server/voice-pipeline/providers/index.ts +47 -0
  142. package/server/voice-pipeline/providers/llm-internal.ts +527 -0
  143. package/server/voice-pipeline/providers/stt-google.ts +157 -0
  144. package/server/voice-pipeline/providers/tts-google.ts +126 -0
  145. package/server/voice-pipeline/types.ts +247 -0
  146. package/server/ws-bridge-types.ts +6 -1
  147. package/dist/assets/AssistantPage-DJ-cMQfb.js +0 -1
  148. package/dist/assets/HelpPage-DMfkzERp.js +0 -1
  149. package/dist/assets/MediaPage-CE5rdvkC.js +0 -1
  150. package/dist/assets/RunsPage-C5BZF5Rx.js +0 -1
  151. package/dist/assets/SettingsPage-DirhjQrJ.js +0 -51
  152. package/dist/assets/SocialMediaPage-DBuM28vD.js +0 -1
  153. package/dist/assets/TelephonyPage-x0VV0fOo.js +0 -1
  154. package/dist/assets/index-C8M_PUmX.css +0 -32
  155. package/dist/assets/sw-register-LSSpj6RU.js +0 -1
  156. package/server/socialmedia/adapters/ayrshare-adapter.ts +0 -169
@@ -0,0 +1,1298 @@
1
+ // ─── Post Extractors ─────────────────────────────────────────────────────────
2
+ // Per-platform DOM extraction via Playwright. Instagram and Facebook are
3
+ // implemented; any other platform gets an "Extractor not yet implemented"
4
+ // entry in `errors` from extractCurrentPage instead of an exception.
5
+
6
+ import type { Page } from "playwright";
7
+ import { randomUUID } from "node:crypto";
8
+ import { writeFileSync } from "node:fs";
9
+ import { join } from "node:path";
10
+ import { MEDIA_ROOT, ensureDirs } from "./library.js";
11
+ import { describeImageByUrl } from "./vision.js";
12
+ import type { LibraryPost, SocialPlatform } from "./types.js";
13
+
14
+ export interface ExtractOptions { // inputs shared by every extractor in this module
15
+ platform: SocialPlatform; // decides which platform branch extractCurrentPage takes
16
+ page: Page; // Playwright page; its current URL is read to pick the extractor
17
+ source: "own" | "role-model"; // copied unchanged onto each extracted post's `source` field
18
+ /** Called for side-effects (progress log to UI). */
19
+ onLog?: (msg: string) => void;
20
+ }
21
+
22
+ export interface ExtractResult { // outcome of one extraction run
23
+ posts: LibraryPost[]; // posts that were extracted (may be empty)
24
+ errors: string[]; // human-readable failure messages; can be non-empty even when posts were extracted
25
+ }
26
+
27
+ /**
28
+ * Dispatch the page that is currently open to the extractor matching `opts.platform` and the URL shape.
29
+ * Instagram and Facebook are handled; any other platform, or a URL matching no known shape, resolves to
30
+ * `{ posts: [], errors: [reason] }`. A query string or fragment does not affect profile detection.
31
+ */
32
+ export async function extractCurrentPage(opts: ExtractOptions): Promise<ExtractResult> {
33
+ const PROFILE_POST_LIMIT = 25; // cap handed to the profile / feed extractors and echoed in the log
34
+ const url = opts.page.url();
35
+ const bareUrl = url.split(/[?#]/)[0]; // end-anchored profile patterns must not see "?query" / "#fragment"
36
+ opts.onLog?.(`URL: ${url}`);
37
+
38
+ if (opts.platform === "instagram") {
39
+ if (/\/p\/[^/]+\/?/.test(url) || /\/reel\/[^/]+\/?/.test(url)) {
40
+ opts.onLog?.("Detected: Instagram single post");
41
+ return await extractInstagramSinglePost(opts);
42
+ }
43
+ // Profile pages look like https://www.instagram.com/<handle>/. Reserved sections are excluded as whole path segments, so a handle that merely starts with one (e.g. "explorer_x") still qualifies.
44
+ if (/instagram\.com\/[^/]+\/?$/.test(bareUrl) && !/instagram\.com\/(accounts|explore|direct)\/?$/.test(bareUrl)) {
45
+ opts.onLog?.(`Detected: Instagram profile (up to ${PROFILE_POST_LIMIT} posts)`);
46
+ return await extractInstagramProfile(opts, PROFILE_POST_LIMIT);
47
+ }
48
+ return { posts: [], errors: [`Instagram URL not recognized for extraction: ${url}`] };
49
+ }
50
+
51
+ if (opts.platform === "facebook") {
52
+ // Single-post permalinks: /<handle>/posts/<id>, /permalink.php, /share/p/<id>,
53
+ // /<handle>/videos/<id>, /reel/<id>, /story.php, /watch/?v=<id>
54
+ if (
55
+ /\/posts\//.test(url) ||
56
+ /\/permalink\.php/.test(url) ||
57
+ /\/share\/[prv]\//.test(url) ||
58
+ /\/videos\//.test(url) ||
59
+ /\/reel\//.test(url) ||
60
+ /\/story\.php/.test(url) ||
61
+ /\/watch\//.test(url)
62
+ ) {
63
+ opts.onLog?.("Detected: Facebook single post");
64
+ return await extractFacebookSinglePost(opts);
65
+ }
66
+ // Profile / page feed: facebook.com/<handle> or facebook.com/profile.php?id=...
67
+ // Desktop FB 2026 moved post wrappers OFF `role='article'` (which is now
68
+ // used for comments + side widgets). Posts live under elements carrying
69
+ // `[data-ad-rendering-role='story_message']` / `[data-ad-comet-preview='message']`
70
+ // for the body and `[data-ad-rendering-role='profile_name']` for the author
71
+ // header. mbasic.facebook.com redirects to www in 2026 so the mobile path
72
+ // is dead. We anchor on the message nodes and walk up to find each post
73
+ // wrapper.
74
+ if (/facebook\.com\/[^/?#]+\/?$/.test(bareUrl) || /facebook\.com\/profile\.php/.test(url) || /facebook\.com\/pages\//.test(url)) {
75
+ opts.onLog?.("Detected: Facebook profile feed — desktop message-anchor extractor");
76
+ return await extractFacebookFeedDesktop(opts, PROFILE_POST_LIMIT);
77
+ }
78
+ return { posts: [], errors: [`Facebook URL not recognized for extraction: ${url}`] };
79
+ }
80
+
81
+ return { posts: [], errors: [`Extractor not yet implemented for platform: ${opts.platform}`] };
82
+ }
83
+
84
+ /**
85
+ * Extract the single post currently open at /p/<shortcode>/ or /reel/<shortcode>/.
86
+ * Reads handle, caption and media URLs from the first <article>, derives hashtags, mentions, hook and CTA
87
+ * from the caption and requests a vision description for the first image only. Failures are returned in `errors`.
88
+ */
89
+ async function extractInstagramSinglePost(opts: ExtractOptions): Promise<ExtractResult> {
90
+ const { page, source, onLog } = opts;
91
+ try {
92
+ onLog?.("Waiting for post content to render…");
93
+ // Instagram's post pages wrap the article in <article role="presentation">. Best effort: a timeout is ignored.
94
+ await page.waitForSelector("article", { timeout: 10_000 }).catch(() => {});
95
+
96
+ const data = await page.evaluate(() => {
97
+ const article = document.querySelector("article");
98
+ if (!article) return null;
99
+
100
+ // Author handle: usually the first <a href="/xyz/"> inside header.
101
+ let handle = "";
102
+ const headerLink = article.querySelector("header a[href^='/']") as HTMLAnchorElement | null;
103
+ if (headerLink) {
104
+ handle = headerLink.getAttribute("href")?.replace(/^\//, "").replace(/\/$/, "") || "";
105
+ }
106
+
107
+ // Post text: the caption often lives in <h1> or a <span> in the second part of the article.
108
+ let text = "";
109
+ const h1 = article.querySelector("h1");
110
+ if (h1) text = h1.textContent?.trim() || "";
111
+ if (!text) {
112
+ // Fallback: longest <span> with more than 30 characters of text
113
+ const spans = Array.from(article.querySelectorAll("span"));
114
+ const rich = spans
115
+ .map((s) => s.textContent?.trim() || "")
116
+ .filter((t) => t.length > 30)
117
+ .sort((a, b) => b.length - a.length);
118
+ if (rich[0]) text = rich[0];
119
+ }
120
+
121
+ // Media: all <img> wider than 200px with an http(s) src; the width filter drops tiny profile pics
122
+ const imgs = Array.from(article.querySelectorAll("img")) as HTMLImageElement[];
123
+ const mediaUrls = imgs
124
+ .filter((img) => img.naturalWidth > 200 || img.width > 200)
125
+ .map((img) => img.src)
126
+ .filter((src) => src.startsWith("http"));
127
+
128
+ // Video sources (reels)
129
+ const videos = Array.from(article.querySelectorAll("video")) as HTMLVideoElement[];
130
+ const videoUrls = videos
131
+ .map((v) => v.src || v.currentSrc)
132
+ .filter((u) => !!u && u.startsWith("http"));
133
+
134
+ // Post type heuristic: more than one large image wins over the presence of a video
135
+ let postType = "image";
136
+ if (videoUrls.length > 0) postType = "reel";
137
+ if (mediaUrls.length > 1) postType = "carousel";
138
+
139
+ return { handle, text, mediaUrls, videoUrls, postType, url: window.location.href };
140
+ });
141
+
142
+ if (!data) {
143
+ return { posts: [], errors: ["Could not find post <article> on page"] };
144
+ }
145
+
146
+ // Hashtags: Unicode-aware so tags with non-ASCII letters ("#Glück") are kept whole.
147
+ const hashtags = Array.from(data.text.matchAll(/#([\p{L}\p{N}_]+)/gu)).map((m) => m[1]);
148
+ // Mentions: allow dots between word characters so "@john.doe" is not cut at the dot.
149
+ const mentions = Array.from(data.text.matchAll(/@(\w+(?:\.\w+)*)/g)).map((m) => m[1]);
150
+ const cta = detectCta(data.text);
151
+ const hook = extractHook(data.text);
152
+
153
+ // Vision description for the first image only. It is optional enrichment: if the call fails the post is still returned.
154
+ let visionDescription = "";
155
+ if (data.mediaUrls.length > 0) {
156
+ onLog?.("Requesting Claude Vision description…");
157
+ try {
158
+ visionDescription = await describeImageByUrl(data.mediaUrls[0]);
159
+ } catch (e) {
160
+ onLog?.(`Vision description failed: ${e instanceof Error ? e.message : String(e)}`);
161
+ }
162
+ }
163
+
164
+ const id = `ig-${Date.now()}-${randomUUID().slice(0, 8)}`;
165
+
166
+ const post: LibraryPost = {
167
+ id,
168
+ platform: "instagram",
169
+ source,
170
+ url: data.url,
171
+ author: { handle: data.handle },
172
+ text: data.text,
173
+ hook, cta,
174
+ hashtags, mentions,
175
+ media: [
176
+ ...data.mediaUrls.map((remoteUrl, i) => ({
177
+ type: "image" as const,
178
+ localPath: null,
179
+ remoteUrl,
180
+ description: i === 0 ? visionDescription : "",
181
+ })),
182
+ ...data.videoUrls.map((remoteUrl) => ({
183
+ type: "video" as const,
184
+ localPath: null,
185
+ remoteUrl,
186
+ description: "",
187
+ })),
188
+ ],
189
+ engagement: { likes: null, comments: null, shares: null, views: null, saves: null },
190
+ engagementRate: null,
191
+ postType: data.postType as LibraryPost["postType"],
192
+ postedAt: null,
193
+ tags: [],
194
+ isGold: false,
195
+ extractedAt: new Date().toISOString(),
196
+ notes: "",
197
+ };
198
+
199
+ return { posts: [post], errors: [] };
200
+ } catch (e) {
201
+ return {
202
+ posts: [],
203
+ errors: [`Extraction failed: ${e instanceof Error ? e.message : String(e)}`],
204
+ };
205
+ }
206
+ }
207
+
208
+ /**
209
+ * Collect up to `maxPosts` distinct post/reel links from the profile grid, then open each link in turn
210
+ * and run the single-post extractor on it. A failure on one post is recorded and the loop continues.
211
+ */
212
+ async function extractInstagramProfile(opts: ExtractOptions, maxPosts: number): Promise<ExtractResult> {
213
+ const { page, onLog } = opts;
214
+ const collected: LibraryPost[] = [];
215
+ const problems: string[] = [];
216
+
217
+ try {
218
+ await page.waitForSelector("main a[href*='/p/'], main a[href*='/reel/']", { timeout: 10_000 });
219
+
220
+ // The grid is lazy-loaded: scroll down eight times so more thumbnails render, then return to the top.
221
+ let scrollsLeft = 8;
222
+ while (scrollsLeft-- > 0) {
223
+ await page.evaluate(() => window.scrollBy(0, 1500));
224
+ await page.waitForTimeout(700);
225
+ }
226
+ await page.evaluate(() => window.scrollTo(0, 0));
227
+ await page.waitForTimeout(400);
228
+
229
+ const links = await page.evaluate((limit) => {
230
+ const unique: string[] = [];
231
+ for (const anchor of Array.from(document.querySelectorAll("main a"))) {
232
+ const target = anchor.getAttribute("href") || "";
233
+ const isPost = /\/p\/[^/]+\/?$/.test(target) || /\/reel\/[^/]+\/?$/.test(target);
234
+ if (isPost && !unique.includes(target)) unique.push(target);
235
+ }
236
+ return unique.slice(0, limit);
237
+ }, maxPosts);
238
+
239
+ onLog?.(`Found ${links.length} posts on profile — extracting…`);
240
+
241
+ // Each link is opened on the same page object, one after another.
242
+ for (const [position, href] of links.entries()) {
243
+ const progress = `${position + 1}/${links.length}`;
244
+ try {
245
+ const absoluteUrl = new URL(href, "https://www.instagram.com").toString();
246
+ onLog?.(`${progress}: ${absoluteUrl}`);
247
+ await page.goto(absoluteUrl, { waitUntil: "domcontentloaded" });
248
+ const { posts: found, errors: failed } = await extractInstagramSinglePost(opts);
249
+ collected.push(...found);
250
+ problems.push(...failed);
251
+ const summary = found.length > 0 ? `✓ saved "${(found[0].text || "").slice(0, 60)}"` : "✗ no post extracted";
252
+ onLog?.(`${progress}: ${summary}`);
253
+ } catch (e) {
254
+ const reason = e instanceof Error ? e.message : String(e);
255
+ problems.push(`Failed on ${href}: ${reason}`);
256
+ onLog?.(`${progress}: ✗ ${reason}`);
257
+ }
258
+ }
259
+ } catch (e) {
260
+ problems.push(`Profile scan failed: ${e instanceof Error ? e.message : String(e)}`);
261
+ }
262
+
263
+ return { posts: collected, errors: problems };
264
+ }
265
+
266
+ // ─── Facebook ───────────────────────────────────────────────────────────────
267
+ // Facebook's DOM is extremely dynamic: class names are obfuscated and rotate
268
+ // every few weeks. The strategy is to lean on structural roles (role="article"),
269
+ // stable aria-labels, and heuristic scoring of candidate text/image nodes rather
270
+ // than brittle class selectors. The extractor expands "See more" if present so
271
+ // the caption is captured in full.
272
+
273
+ /**
274
+ * Extract the single Facebook post shown on the current page.
275
+ * Chooses the first role="article" container holding a permalink-style link (else the first article, else
276
+ * <body>), then reads author, caption, media, timestamp label and engagement counts from it.
277
+ * Failures are returned as entries in `errors`; the promise does not reject.
278
+ */
279
+ async function extractFacebookSinglePost(opts: ExtractOptions): Promise<ExtractResult> {
280
+ const { page, source, onLog } = opts;
281
+ try {
282
+ onLog?.("Waiting for Facebook post to render…");
283
+ // FB lazy-loads stories. Wait for any article-role container (best effort: a timeout is ignored).
284
+ await page
285
+ .waitForSelector("div[role='article'], [data-pagelet*='FeedUnit']", {
286
+ state: "attached",
287
+ timeout: 10_000,
288
+ })
289
+ .catch(() => {});
290
+
291
+ // Expand "See more" / "Mehr anzeigen" so we capture full text.
292
+ await expandFacebookSeeMore(page).catch(() => {});
293
+
294
+ const data = await page.evaluate(() => {
295
+ // Prefer the first article (in document order) that contains a permalink-style link.
296
+ const articles = Array.from(
297
+ document.querySelectorAll("div[role='article']"),
298
+ ) as HTMLElement[];
299
+ const article = articles.find((a) => a.querySelector("a[href*='/posts/'], a[href*='/videos/'], a[href*='/permalink'], a[href*='/share/']"))
300
+ || articles[0]
301
+ || document.body;
302
+
303
+ // Author handle: first link inside article pointing to a profile
304
+ // (href starts with "/" and isn't a media or reaction link).
305
+ let handle = "";
306
+ let displayName = "";
307
+ const authorLinks = Array.from(article.querySelectorAll("a[href^='/']")) as HTMLAnchorElement[];
308
+ for (const a of authorLinks) {
309
+ const href = a.getAttribute("href") || "";
310
+ if (/^\/(photo|video|reel|share|hashtag|stories|groups|events|marketplace|watch)/.test(href))
311
+ continue;
312
+ if (/\/(posts|permalink|comments)/.test(href)) continue;
313
+ // Normalize: /<handle>/ or /<handle>?
314
+ const m = href.match(/^\/([^/?#]+)/);
315
+ if (!m) continue;
316
+ const text = (a.textContent || "").trim();
317
+ if (!text || text.length > 80) continue;
318
+ handle = m[1];
319
+ displayName = text;
320
+ break;
321
+ }
322
+
323
+ // Post text: FB wraps the caption in a container marked via data-ad-comet-preview
324
+ // or data-ad-preview="message". Fallback: longest text block inside article that
325
+ // isn't an author name or reaction count.
326
+ let text = "";
327
+ const msgNode = article.querySelector(
328
+ "[data-ad-comet-preview='message'], [data-ad-preview='message'], [data-testid='post_message']",
329
+ );
330
+ if (msgNode) text = (msgNode.textContent || "").trim();
331
+ if (!text) {
332
+ // Fallback: look at divs with dir="auto" that are longer than 40 chars and not purely numeric.
333
+ const candidates = Array.from(article.querySelectorAll("div[dir='auto']"))
334
+ .map((d) => (d.textContent || "").trim())
335
+ .filter((t) => t.length > 40 && !/^[\d.,KM\s]+$/.test(t));
336
+ // Sort by length desc and pick the longest.
337
+ candidates.sort((a, b) => b.length - a.length);
338
+ if (candidates[0]) text = candidates[0];
339
+ }
340
+
341
+ // Media: images larger than 200px, exclude the tiny author avatar.
342
+ const imgs = Array.from(article.querySelectorAll("img")) as HTMLImageElement[];
343
+ const mediaUrls = imgs
344
+ .filter((img) => (img.naturalWidth || img.width) > 200)
345
+ .map((img) => img.src)
346
+ .filter((src) => src.startsWith("http"));
347
+
348
+ // Videos — Facebook uses <video> elements for reels and video posts.
349
+ const videos = Array.from(article.querySelectorAll("video")) as HTMLVideoElement[];
350
+ const videoUrls = videos
351
+ .map((v) => v.src || v.currentSrc)
352
+ .filter((u) => !!u && u.startsWith("http"));
353
+
354
+ // Timestamp: the permalink anchor often has an aria-label with the date (stored as the raw label text).
355
+ let postedAt: string | null = null;
356
+ const timeLink = article.querySelector(
357
+ "a[href*='/posts/'] span[aria-label], a[href*='/permalink'] span[aria-label], a[href*='/videos/'] span[aria-label]",
358
+ );
359
+ if (timeLink) {
360
+ const label = timeLink.getAttribute("aria-label") || "";
361
+ if (label) postedAt = label;
362
+ }
363
+
364
+ // Engagement: FB hides exact numbers behind aria-labels.
365
+ // "X reactions" / "Y comments" / "Z shares".
366
+ const parseCount = (s: string | null): number | null => {
367
+ if (!s) return null;
368
+ // The number must start with a digit, and a unit only counts when it is a whole word — otherwise the "K" of
369
+ // "12 Kommentare" or the "M" of "3 Mal geteilt" would be read as thousand / million.
370
+ const m = s.match(/(\d[\d.,]*)(?:\s*(k|m|tsd|mio|million|thousand)(?!\p{L}))?/iu);
371
+ if (!m) return null;
372
+ const unit = (m[2] || "").toLowerCase();
373
+ // With a unit the separator is a decimal mark ("1.2K", "1,2 Tsd."); without one it groups thousands ("1,234", "1.234").
374
+ const n = parseFloat(unit ? m[1].replace(",", ".") : m[1].replace(/[.,]/g, ""));
375
+ if (!Number.isFinite(n)) return null;
376
+ if (unit === "k" || unit === "tsd" || unit === "thousand") return Math.round(n * 1_000);
377
+ return Math.round(unit ? n * 1_000_000 : n);
378
+ };
379
+ let likes: number | null = null;
380
+ let comments: number | null = null;
381
+ let shares: number | null = null;
382
+ const reactionNode = article.querySelector(
383
+ "[aria-label*='reaction'], [aria-label*='Reaktion'], [aria-label*='Gefällt']",
384
+ );
385
+ if (reactionNode) likes = parseCount(reactionNode.getAttribute("aria-label"));
386
+ const commentNode = article.querySelector(
387
+ "[aria-label*='comment'], [aria-label*='Kommentar']",
388
+ );
389
+ if (commentNode) comments = parseCount(commentNode.getAttribute("aria-label"));
390
+ const shareNode = article.querySelector(
391
+ "[aria-label*='share'], [aria-label*='Teilen'], [aria-label*='geteilt']",
392
+ );
393
+ if (shareNode) shares = parseCount(shareNode.getAttribute("aria-label"));
394
+
395
+ // Post type
396
+ let postType = "text";
397
+ if (videoUrls.length > 0) {
398
+ postType = /\/reel\//.test(window.location.href) ? "reel" : "video";
399
+ } else if (mediaUrls.length > 1) {
400
+ postType = "carousel";
401
+ } else if (mediaUrls.length === 1) {
402
+ postType = "image";
403
+ }
404
+
405
+ return { handle, displayName, text, mediaUrls, videoUrls, postType, postedAt, likes, comments, shares, url: window.location.href };
406
+ });
407
+
408
+ if (!data.text && data.mediaUrls.length === 0 && data.videoUrls.length === 0) {
409
+ return {
410
+ posts: [],
411
+ errors: ["Facebook post appears empty (no text/media found) — DOM may have changed"],
412
+ };
413
+ }
414
+
415
+ // Hashtags: Unicode-aware so tags with non-ASCII letters ("#Glück") are kept whole.
416
+ const hashtags = Array.from(data.text.matchAll(/#([\p{L}\p{N}_]+)/gu)).map((m) => m[1]);
417
+ // Mentions: allow dots between word characters so "@john.doe" is not cut at the dot.
418
+ const mentions = Array.from(data.text.matchAll(/@(\w+(?:\.\w+)*)/g)).map((m) => m[1]);
419
+ const cta = detectCta(data.text);
420
+ const hook = extractHook(data.text);
421
+
422
+ // Vision description for the first image only. It is optional enrichment: if the call fails the post is still returned.
423
+ let visionDescription = "";
424
+ if (data.mediaUrls.length > 0) {
425
+ onLog?.("Requesting Claude Vision description…");
426
+ try {
427
+ visionDescription = await describeImageByUrl(data.mediaUrls[0]);
428
+ } catch (e) {
429
+ onLog?.(`Vision description failed: ${e instanceof Error ? e.message : String(e)}`);
430
+ }
431
+ }
432
+
433
+ const id = `fb-${Date.now()}-${randomUUID().slice(0, 8)}`;
434
+
435
+ const post: LibraryPost = {
436
+ id,
437
+ platform: "facebook",
438
+ source,
439
+ url: data.url,
440
+ author: {
441
+ handle: data.handle,
442
+ displayName: data.displayName || undefined,
443
+ },
444
+ text: data.text,
445
+ hook,
446
+ cta,
447
+ hashtags,
448
+ mentions,
449
+ media: [
450
+ ...data.mediaUrls.map((remoteUrl, i) => ({
451
+ type: "image" as const,
452
+ localPath: null,
453
+ remoteUrl,
454
+ description: i === 0 ? visionDescription : "",
455
+ })),
456
+ ...data.videoUrls.map((remoteUrl) => ({
457
+ type: "video" as const,
458
+ localPath: null,
459
+ remoteUrl,
460
+ description: "",
461
+ })),
462
+ ],
463
+ engagement: {
464
+ likes: data.likes,
465
+ comments: data.comments,
466
+ shares: data.shares,
467
+ views: null,
468
+ saves: null,
469
+ },
470
+ engagementRate: null,
471
+ postType: data.postType as LibraryPost["postType"],
472
+ postedAt: data.postedAt,
473
+ tags: [],
474
+ isGold: false,
475
+ extractedAt: new Date().toISOString(),
476
+ notes: "",
477
+ };
478
+
479
+ return { posts: [post], errors: [] };
480
+ } catch (e) {
481
+ return {
482
+ posts: [],
483
+ errors: [`Facebook extraction failed: ${e instanceof Error ? e.message : String(e)}`],
484
+ };
485
+ }
486
+ }
487
+
488
+ // ─── Facebook profile feed — desktop (2026 layout) ─────────────────────────
489
+ // Posts on desktop FB 2026 are NOT wrapped in `role='article'` at top level
490
+ // (those are comments + side widgets). The reliable anchors are:
491
+ // - [data-ad-comet-preview='message'] — post body container
492
+ // - [data-ad-preview='message'] — same body, dual-attributed
493
+ // - [data-ad-rendering-role='story_message'] — message wrapper
494
+ // - [data-ad-rendering-role='profile_name'] — author header
495
+ // - [data-ad-rendering-role='like_button' | 'comment_button' | 'share_button']
496
+ // Strategy: collect every message anchor in the DOM, walk up to the closest
497
+ // ancestor that ALSO contains a profile_name AND a like_button — that's the
498
+ // post wrapper. Dedupe wrappers (we drop wrappers that contain other wrappers
499
+ // to keep only the inner-most one per post).
500
+ async function extractFacebookFeedDesktop(
501
+ opts: ExtractOptions,
502
+ maxPosts: number,
503
+ ): Promise<ExtractResult> {
504
+ const { page, source, onLog } = opts;
505
+ const posts: LibraryPost[] = [];
506
+ const errors: string[] = [];
507
+
508
+ try {
509
+ // Wait for at least one message marker to be in the DOM. These appear
510
+ // once FB has hydrated the first post tile.
511
+ await page
512
+ .waitForSelector(
513
+ "[data-ad-comet-preview='message'], [data-ad-preview='message'], [data-ad-rendering-role='story_message']",
514
+ { state: "attached", timeout: 12_000 },
515
+ )
516
+ .catch(() => {});
517
+
518
+ // Scroll to populate more posts (FB virtualises the feed).
519
+ for (let i = 0; i < 18; i++) {
520
+ await page.evaluate(() => window.scrollBy(0, 1500));
521
+ await page.waitForTimeout(900);
522
+ }
523
+ await page.evaluate(() => window.scrollTo(0, 0));
524
+ await page.waitForTimeout(400);
525
+
526
+ // Click any "See more" / "Mehr anzeigen" toggles page-wide so captions
527
+ // aren't truncated. The button is sometimes a sibling of the message
528
+ // container, not a descendant — so we search the whole document. Repeat
529
+ // a few times because expanded text may itself contain another toggle
530
+ // (long posts get folded twice).
531
+ for (let pass = 0; pass < 3; pass++) {
532
+ const clicked = await page.evaluate(() => {
533
+ const isToggle = (s: string) => {
534
+ const t = s.trim().toLowerCase();
535
+ return (
536
+ t === "see more" ||
537
+ t === "mehr anzeigen" ||
538
+ t === "weiterlesen" ||
539
+ t === "...mehr" ||
540
+ t === "… mehr" ||
541
+ t === "show more"
542
+ );
543
+ };
544
+ const candidates = Array.from(
545
+ document.querySelectorAll("div[role='button'], span[role='button'], span"),
546
+ ) as HTMLElement[];
547
+ let count = 0;
548
+ for (const el of candidates) {
549
+ if (!el.isConnected) continue;
550
+ // The toggle is usually a leaf — skip elements whose textContent
551
+ // contains the trigger as part of a longer string (e.g. message
552
+ // body that ends with "… Mehr anzeigen"). Compare to direct text.
553
+ const direct = (el.textContent || "").trim();
554
+ if (direct.length > 30) continue;
555
+ if (!isToggle(direct)) continue;
556
+ try { el.click(); count++; } catch { /* noop */ }
557
+ }
558
+ return count;
559
+ }).catch(() => 0);
560
+ if (!clicked) break;
561
+ await page.waitForTimeout(700);
562
+ }
563
+
564
+ const currentUrl = page.url();
565
+ const handleMatch = currentUrl.match(/facebook\.com\/([^/?#]+)/);
566
+ const expectedHandle =
567
+ handleMatch &&
568
+ !/^(profile\.php|pages|watch|story\.php|groups|marketplace|events)$/.test(handleMatch[1])
569
+ ? handleMatch[1].toLowerCase()
570
+ : null;
571
+ if (expectedHandle) onLog?.(`Expected handle: ${expectedHandle}`);
572
+
573
+ const raw = await page.evaluate(({ max, expectedHandle }) => {
574
+ const parseCount = (s: string | null): number | null => {
575
+ if (!s) return null;
576
+ const mm = s.match(/([\d.,]+)\s*(k|m|tsd|mio|million|thousand)?/i);
577
+ if (!mm) return null;
578
+ let n = parseFloat(mm[1].replace(/[.,]/g, (c) => (c === "," ? "." : "")));
579
+ if (!Number.isFinite(n)) return null;
580
+ const unit = (mm[2] || "").toLowerCase();
581
+ if (unit.startsWith("k") || unit.startsWith("tsd")) n *= 1_000;
582
+ if (unit.startsWith("m") || unit.startsWith("mio") || unit === "million") n *= 1_000_000;
583
+ return Math.round(n);
584
+ };
585
+
586
+ const extractHandleFromHref = (href: string): string | null => {
587
+ if (/comment_id=/.test(href)) return null;
588
+ const profIdMatch = href.match(/profile\.php\?id=(\d+)/);
589
+ if (profIdMatch) return profIdMatch[1];
590
+ // Strip protocol+host first so we always match against the path.
591
+ let path = href;
592
+ const httpMatch = href.match(/^https?:\/\/[^/]+(\/.*)$/);
593
+ if (httpMatch) path = httpMatch[1];
594
+ const slugMatch = path.match(/^\/([^/?#]+)/);
595
+ if (!slugMatch) return null;
596
+ const slug = slugMatch[1];
597
+ if (
598
+ /^(photo|video|reel|share|hashtag|stories|groups|events|marketplace|watch|posts|permalink|comments|browse|story\.php|home\.php|notifications|profile\.php|p|reels)$/i.test(
599
+ slug,
600
+ )
601
+ ) {
602
+ return null;
603
+ }
604
+ return slug;
605
+ };
606
+
607
+ // 1) Collect all message anchors in document order.
608
+ const messageNodes = Array.from(
609
+ document.querySelectorAll(
610
+ "[data-ad-comet-preview='message'], [data-ad-preview='message'], [data-ad-rendering-role='story_message']",
611
+ ),
612
+ ) as HTMLElement[];
613
+
614
+ // 2) For each message, walk up to find the closest ancestor that also
615
+ // contains a profile_name AND a like_button. That's the post wrapper.
616
+ const wrapperSet = new Set<HTMLElement>();
617
+ const messageByWrapper = new Map<HTMLElement, HTMLElement>();
618
+ // A wrapper is the smallest ancestor of a message-anchor that
619
+ // contains profile_name + like_button + a post-permalink anchor
620
+ // (timestamp link). The permalink requirement is what stops us from
621
+ // picking a too-narrow wrapper that excludes the header.
622
+ const hasPermalinkAnchor = (el: HTMLElement): boolean => {
623
+ for (const a of Array.from(el.querySelectorAll("a[href]")) as HTMLAnchorElement[]) {
624
+ const h = a.getAttribute("href") || "";
625
+ if (/comment_id=/.test(h)) continue;
626
+ if (/\/(posts|videos|reel|permalink|share\/p|share\/r|share\/v)\//.test(h)) return true;
627
+ if (/\/(photo|photos)(\/|\?fbid=)/.test(h)) return true;
628
+ }
629
+ return false;
630
+ };
631
+ for (const msg of messageNodes) {
632
+ let cur: HTMLElement | null = msg;
633
+ for (let depth = 0; depth < 25 && cur; depth++) {
634
+ cur = cur.parentElement;
635
+ if (!cur) break;
636
+ const hasProfile = !!cur.querySelector("[data-ad-rendering-role='profile_name']");
637
+ const hasLike = !!cur.querySelector("[data-ad-rendering-role='like_button']");
638
+ if (hasProfile && hasLike && hasPermalinkAnchor(cur)) {
639
+ // Prefer inner-most wrapper: drop any already-collected wrapper
640
+ // that contains this one, and skip if a smaller wrapper already
641
+ // exists inside this one.
642
+ let isContainedByExisting = false;
643
+ for (const w of Array.from(wrapperSet)) {
644
+ if (w !== cur && w.contains(cur)) {
645
+ wrapperSet.delete(w);
646
+ messageByWrapper.delete(w);
647
+ } else if (w !== cur && cur.contains(w)) {
648
+ isContainedByExisting = true;
649
+ }
650
+ }
651
+ if (!isContainedByExisting && !wrapperSet.has(cur)) {
652
+ wrapperSet.add(cur);
653
+ messageByWrapper.set(cur, msg);
654
+ }
655
+ break;
656
+ }
657
+ }
658
+ }
659
+ const wrappers = Array.from(wrapperSet);
660
+
661
+ // 2.5) Detect the actual profile owner. FB does NOT necessarily use the
662
+ // URL slug (vanity) as the handle in author hrefs — a profile
663
+ // reachable via /aitrendz.xyz1 might be linked internally as
664
+ // /profile.php?id=NNN, /someother.slug, or display "René Remsik".
665
+ // So: collect the (handle, displayName) of every wrapper's
666
+ // profile_name link, count frequencies, and treat the dominant
667
+ // one as the page owner.
668
+ const wrapperMeta = wrappers.map((wrap) => {
669
+ const profile = wrap.querySelector("[data-ad-rendering-role='profile_name']") as HTMLElement | null;
670
+ let h = "";
671
+ let dn = "";
672
+ if (profile) {
673
+ const links = Array.from(profile.querySelectorAll("a[href]")) as HTMLAnchorElement[];
674
+ for (const a of links) {
675
+ const slug = extractHandleFromHref(a.getAttribute("href") || "");
676
+ if (!slug) continue;
677
+ const t = (a.textContent || "").trim();
678
+ if (!t || t.length > 80) continue;
679
+ h = slug;
680
+ dn = t;
681
+ break;
682
+ }
683
+ }
684
+ // Find permalink owner. Search ONLY within the wrapper itself —
685
+ // walking ancestors picks up links from sibling posts (especially
686
+ // the shared /stories/ links FB uses for the feed-virtualization
687
+ // container) which would assign the same wrong permalink to multiple
688
+ // posts.
689
+ let permalinkOwner: string | null = null;
690
+ let permalink: string | null = null;
691
+ const allHrefs: string[] = [];
692
+ for (const a of Array.from(wrap.querySelectorAll("a[href]")) as HTMLAnchorElement[]) {
693
+ const href = a.getAttribute("href") || "";
694
+ if (allHrefs.length < 12) allHrefs.push(href.slice(0, 200));
695
+ if (/comment_id=/.test(href)) continue;
696
+ const ownerMatch =
697
+ href.match(/^\/([^/?#]+)\/(?:posts|videos|reel)\//) ||
698
+ href.match(/facebook\.com\/([^/?#]+)\/(?:posts|videos|reel)\//);
699
+ if (ownerMatch) {
700
+ permalinkOwner = ownerMatch[1].toLowerCase();
701
+ permalink = href.startsWith("http") ? href : `https://www.facebook.com${href}`;
702
+ break;
703
+ }
704
+ if (/\/(permalink|share\/p|share\/r|share\/v)\//.test(href) && !permalink) {
705
+ permalink = href.startsWith("http") ? href : `https://www.facebook.com${href}`;
706
+ }
707
+ if (/\/(photo|photos)(\/|\?fbid=)/.test(href) && !permalink) {
708
+ permalink = href.startsWith("http") ? href : `https://www.facebook.com${href}`;
709
+ }
710
+ }
711
+ return { wrap, handle: h, displayName: dn, permalinkOwner, permalink, debugHrefs: allHrefs };
712
+ });
713
+
714
+ // Tally handle frequency. Most-frequent wins as page owner — but only
715
+ // if it dominates (≥ 30% of wrappers AND ≥ 2 occurrences). Otherwise
716
+ // we treat the page as a heterogeneous feed (recommendations etc.) and
717
+ // fall back to the URL-slug filter.
718
+ const handleCounts = new Map<string, { count: number; displayName: string }>();
719
+ for (const w of wrapperMeta) {
720
+ if (!w.handle) continue;
721
+ const key = w.handle.toLowerCase();
722
+ const ex = handleCounts.get(key);
723
+ if (ex) ex.count++;
724
+ else handleCounts.set(key, { count: 1, displayName: w.displayName });
725
+ }
726
+ const ownerCounts = Array.from(handleCounts.entries()).sort((a, b) => b[1].count - a[1].count);
727
+ const dominant = ownerCounts[0];
728
+ let detectedOwner: string | null = null;
729
+ let detectedOwnerName = "";
730
+ if (dominant && dominant[1].count >= 2 && dominant[1].count / wrappers.length >= 0.3) {
731
+ detectedOwner = dominant[0];
732
+ detectedOwnerName = dominant[1].displayName;
733
+ }
734
+
735
+ // The "effective owner" used for filtering: detected owner if we have
736
+ // one, else the URL-slug expectedHandle.
737
+ const effectiveOwner = detectedOwner || (expectedHandle ? expectedHandle.toLowerCase() : null);
738
+
739
+ const results: Array<{
740
+ handle: string;
741
+ displayName: string;
742
+ text: string;
743
+ mediaUrls: string[];
744
+ videoUrls: string[];
745
+ postType: string;
746
+ postedAt: string | null;
747
+ likes: number | null;
748
+ comments: number | null;
749
+ shares: number | null;
750
+ permalink: string | null;
751
+ }> = [];
752
+ let skippedWrongHandle = 0;
753
+ let skippedEmpty = 0;
754
+ const seenSig = new Set<string>();
755
+
756
+ let skippedPinned = 0;
757
+ for (const meta of wrapperMeta) {
758
+ if (results.length >= max) break;
759
+ const wrap = meta.wrap;
760
+ let handle = meta.handle;
761
+ let displayName = meta.displayName;
762
+ const permalink = meta.permalink;
763
+ const permalinkOwner = meta.permalinkOwner;
764
+
765
+ // Skip pinned posts. FB shows them at the top of profile feeds and
766
+ // they can be months/years old. Look for the pin marker near the
767
+ // header — usually a sibling of profile_name with text "Angeheftet"
768
+ // / "Pinned post" / "Featured", or an aria-label hinting at it.
769
+ const profileEl = wrap.querySelector("[data-ad-rendering-role='profile_name']") as HTMLElement | null;
770
+ let isPinned = false;
771
+ if (profileEl) {
772
+ const headerArea = profileEl.parentElement || profileEl;
773
+ const headerText = (headerArea.innerText || "").toLowerCase();
774
+ if (
775
+ /^(angeheftet|pinned post|pinned|angeheftete? beitr|featured)$/m.test(headerText) ||
776
+ /\b(angeheftet|pinned post|pinned|angepinnt|featured post)\b/.test(headerText.split("\n")[0] || "")
777
+ ) {
778
+ isPinned = true;
779
+ }
780
+ // Also check for the explicit aria-label/title on the pin icon.
781
+ if (
782
+ headerArea.querySelector(
783
+ "[aria-label*='Angeheftet' i], [aria-label*='Pinned' i], [aria-label*='angepinnt' i]",
784
+ )
785
+ ) {
786
+ isPinned = true;
787
+ }
788
+ }
789
+ if (isPinned) {
790
+ skippedPinned++;
791
+ continue;
792
+ }
793
+
794
+ // Filter: keep wrapper only if it belongs to the effective owner.
795
+ // Match either by author handle or by permalink path-owner.
796
+ if (effectiveOwner) {
797
+ const ownerMatchByPermalink = permalinkOwner && permalinkOwner === effectiveOwner;
798
+ const ownerMatchByHandle = handle && handle.toLowerCase() === effectiveOwner;
799
+ const haveOwnerInfo = !!permalinkOwner || !!handle;
800
+ if (haveOwnerInfo && !ownerMatchByPermalink && !ownerMatchByHandle) {
801
+ skippedWrongHandle++;
802
+ continue;
803
+ }
804
+ // If matched by permalink but no handle was detected, fill in.
805
+ if (ownerMatchByPermalink && !handle) {
806
+ handle = effectiveOwner;
807
+ if (!displayName && detectedOwnerName) displayName = detectedOwnerName;
808
+ }
809
+ }
810
+
811
+ // Body text from the message node we tagged onto this wrapper.
812
+ // Use innerText (NOT textContent) so we get only the visually rendered
813
+ // characters. FB injects hidden <span style="display:none"> decoys
814
+ // with random characters between the real ones as an anti-scraping
815
+ // measure — textContent reads them all and produces gibberish like
816
+ // "oeSrnodstp 0 809a"; innerText respects CSS visibility and yields
817
+ // the clean rendered text.
818
+ const msgNode = messageByWrapper.get(wrap);
819
+ let text = "";
820
+ if (msgNode) text = (msgNode.innerText || "").trim();
821
+
822
+ // Note: own-comments are fetched in a SECOND pass via permalink tabs
823
+ // — see fetchOwnCommentsFromPermalink below. Inline expansion via
824
+ // clicking comment_button on the feed view proved unreliable: FB
825
+ // doesn't always load the full thread inline, and "Verfasser"-tagged
826
+ // own-comments often only render on the post's permalink page.
827
+
828
+ // Posted-at: prefer aria-label of timestamp link.
829
+ let postedAt: string | null = null;
830
+ const tsNode = wrap.querySelector(
831
+ "a[href*='/posts/'] [aria-label], a[href*='/permalink'] [aria-label], a[href*='/videos/'] [aria-label], a[href*='/reel/'] [aria-label]",
832
+ );
833
+ if (tsNode) {
834
+ const lab = tsNode.getAttribute("aria-label") || "";
835
+ if (lab) postedAt = lab;
836
+ }
837
+
838
+ // Engagement.
839
+ let likes: number | null = null;
840
+ let comments: number | null = null;
841
+ let shares: number | null = null;
842
+ const reactionNode = wrap.querySelector(
843
+ "[aria-label*='Gefällt mir: '], [aria-label*='reactions'], [aria-label*='Reaktion']",
844
+ );
845
+ if (reactionNode) likes = parseCount(reactionNode.getAttribute("aria-label"));
846
+ // Fallback: number adjacent to like_button.
847
+ if (likes == null) {
848
+ const likeBtn = wrap.querySelector("[data-ad-rendering-role='like_button']");
849
+ if (likeBtn) {
850
+ const sib = likeBtn.parentElement?.textContent?.trim() || "";
851
+ likes = parseCount(sib);
852
+ }
853
+ }
854
+ const commentBtn = wrap.querySelector("[data-ad-rendering-role='comment_button']");
855
+ if (commentBtn) {
856
+ // The visible count usually sits as text inside the button or as an
857
+ // aria-label like "3 Kommentare".
858
+ const lab = commentBtn.getAttribute("aria-label");
859
+ if (lab) comments = parseCount(lab);
860
+ if (comments == null) {
861
+ const t = (commentBtn.textContent || "").trim();
862
+ comments = parseCount(t);
863
+ }
864
+ }
865
+ const shareBtn = wrap.querySelector("[data-ad-rendering-role='share_button']");
866
+ if (shareBtn) {
867
+ const lab = shareBtn.getAttribute("aria-label");
868
+ if (lab) shares = parseCount(lab);
869
+ if (shares == null) {
870
+ const t = (shareBtn.textContent || "").trim();
871
+ shares = parseCount(t);
872
+ }
873
+ }
874
+
875
+ // Media: images >200px inside the wrapper, excluding profile photos
876
+ // (which usually live inside [data-ad-rendering-role='profile_name'])
877
+ // and emoji/safe_image hosts.
878
+ const wrapClone = wrap.cloneNode(true) as HTMLElement;
879
+ for (const p of Array.from(wrapClone.querySelectorAll("[data-ad-rendering-role='profile_name']"))) {
880
+ p.remove();
881
+ }
882
+ const imgs = Array.from(wrapClone.querySelectorAll("img")) as HTMLImageElement[];
883
+ const mediaUrls = imgs
884
+ .filter((img) => (img.naturalWidth || img.width) > 200)
885
+ .map((img) => img.src)
886
+ .filter((src) => src.startsWith("http") && !/emoji|safe_image\.php/i.test(src));
887
+ const videos = Array.from(wrapClone.querySelectorAll("video")) as HTMLVideoElement[];
888
+ const videoUrls = videos
889
+ .map((v) => v.src || v.currentSrc)
890
+ .filter((u) => !!u && u.startsWith("http"));
891
+
892
+ if (!text && mediaUrls.length === 0 && videoUrls.length === 0) {
893
+ skippedEmpty++;
894
+ continue;
895
+ }
896
+
897
+ // Dedupe by text+first-image signature.
898
+ const sig = (text.slice(0, 120) + "|" + (mediaUrls[0] || "") + "|" + (permalink || "")).trim();
899
+ if (sig && seenSig.has(sig)) continue;
900
+ if (sig) seenSig.add(sig);
901
+
902
+ let postType = "text";
903
+ if (videoUrls.length > 0) postType = "video";
904
+ else if (mediaUrls.length > 1) postType = "carousel";
905
+ else if (mediaUrls.length === 1) postType = "image";
906
+
907
+ results.push({
908
+ handle,
909
+ displayName,
910
+ text,
911
+ mediaUrls,
912
+ videoUrls,
913
+ postType,
914
+ postedAt,
915
+ likes,
916
+ comments,
917
+ shares,
918
+ permalink,
919
+ });
920
+ }
921
+
922
+ // Per-wrapper diagnostic list so we can see WHAT was found, not just
923
+ // counts. Helps when a profile uses a non-obvious handle (e.g. URL slug
924
+ // ≠ author handle, vanity URLs, numeric profile IDs).
925
+ const wrapperDiag = wrapperMeta.map((w) => ({
926
+ handle: w.handle,
927
+ displayName: w.displayName,
928
+ permalinkOwner: w.permalinkOwner,
929
+ permalink: w.permalink,
930
+ debugHrefs: w.debugHrefs as string[],
931
+ }));
932
+
933
+ return {
934
+ items: results,
935
+ diag: {
936
+ messageNodes: messageNodes.length,
937
+ wrappers: wrappers.length,
938
+ skippedWrongHandle,
939
+ skippedEmpty,
940
+ skippedPinned,
941
+ detectedOwner,
942
+ detectedOwnerName,
943
+ effectiveOwner,
944
+ ownerCounts: ownerCounts.slice(0, 5).map(([h, v]) => ({ handle: h, count: v.count, displayName: v.displayName })),
945
+ wrapperDiag,
946
+ },
947
+ };
948
+ }, { max: maxPosts, expectedHandle });
949
+
950
+ onLog?.(
951
+ `Scan: ${raw.diag.messageNodes} msg-anchors, ${raw.diag.wrappers} wrappers ` +
952
+ `(pinned: ${raw.diag.skippedPinned}, wrong-handle: ${raw.diag.skippedWrongHandle}, ` +
953
+ `empty: ${raw.diag.skippedEmpty}, kept: ${raw.items.length})`,
954
+ );
955
+ if (raw.diag.detectedOwner) {
956
+ onLog?.(`Detected owner: @${raw.diag.detectedOwner} ("${raw.diag.detectedOwnerName}") — using this instead of URL slug`);
957
+ } else if (raw.diag.ownerCounts.length > 0) {
958
+ onLog?.(
959
+ `No dominant owner detected. Top handles: ` +
960
+ raw.diag.ownerCounts.map((o) => `@${o.handle}×${o.count}`).join(", "),
961
+ );
962
+ }
963
+ // Dump per-wrapper details to the existing /tmp/fb-debug.html.
964
+ try {
965
+ const fs = await import("node:fs/promises");
966
+ const lines = raw.diag.wrapperDiag.map((w, i) => {
967
+ const head = `<!-- wrap[${i}] handle="${w.handle}" name="${w.displayName}" permalinkOwner="${w.permalinkOwner || ""}" permalink="${w.permalink || ""}" -->`;
968
+ const hrefs = (w.debugHrefs || []).map((h) => `<!-- href: ${h.replace(/-->/g, "--&gt;")} -->`).join("\n");
969
+ return head + (hrefs ? "\n" + hrefs : "");
970
+ });
971
+ const extra = `\n\n<!-- desktop-extractor wrapper diag (${raw.diag.wrappers} total) -->\n` +
972
+ `<!-- effectiveOwner=${raw.diag.effectiveOwner} detectedOwner=${raw.diag.detectedOwner} -->\n` +
973
+ lines.join("\n");
974
+ await fs.appendFile("/tmp/fb-debug.html", extra);
975
+ } catch {
976
+ /* noop */
977
+ }
978
+
979
+ if (raw.items.length === 0) {
980
+ if (raw.diag.messageNodes === 0) {
981
+ errors.push(
982
+ "No post-message containers found on the page. The profile may be empty, private, " +
983
+ "or Facebook may be showing a non-feed view (notifications, settings, etc.). " +
984
+ "Open the profile in the browser, scroll until posts are visible, then retry.",
985
+ );
986
+ } else if (raw.diag.skippedWrongHandle > 0 && expectedHandle) {
987
+ errors.push(
988
+ `No posts for @${expectedHandle} — ${raw.diag.skippedWrongHandle} candidates belonged to other profiles.`,
989
+ );
990
+ } else {
991
+ errors.push("Found post wrappers but none had extractable text or media.");
992
+ }
993
+ return { posts, errors };
994
+ }
995
+
996
+ let idx = 0;
997
+ for (const item of raw.items) {
998
+ idx++;
999
+ onLog?.(`${idx}/${raw.items.length}: ${item.handle ? `@${item.handle}` : "(no handle)"} — "${(item.text || "(no text)").slice(0, 60)}"`);
1000
+
1001
+ // Fetch the post author's own comments via the permalink page. FB's
1002
+ // feed view often doesn't render the full comment thread inline, but
1003
+ // the dedicated permalink page does — and own-comments are explicitly
1004
+ // tagged with a "Verfasser" / "Author" badge there.
1005
+ if (item.permalink && raw.diag.effectiveOwner) {
1006
+ try {
1007
+ const ownComments = await fetchOwnCommentsFromPermalink(
1008
+ page,
1009
+ item.permalink,
1010
+ raw.diag.effectiveOwner,
1011
+ raw.diag.detectedOwnerName || item.displayName || "",
1012
+ );
1013
+ if (ownComments.length > 0) {
1014
+ item.text = (item.text + "\n\n" + ownComments.map((c) => `[Eigener Kommentar]\n${c}`).join("\n\n")).trim();
1015
+ onLog?.(`${idx}/${raw.items.length}: + ${ownComments.length} eigene Kommentare`);
1016
+ }
1017
+ } catch (e) {
1018
+ onLog?.(`${idx}/${raw.items.length}: comment fetch failed (${e instanceof Error ? e.message : String(e)})`);
1019
+ }
1020
+ }
1021
+
1022
+ const hashtags = Array.from(item.text.matchAll(/#(\w+)/g)).map((m) => m[1]);
1023
+ const mentions = Array.from(item.text.matchAll(/@(\w+)/g)).map((m) => m[1]);
1024
+ const cta = detectCta(item.text);
1025
+ const hook = extractHook(item.text);
1026
+
1027
+ let visionDescription = "";
1028
+ if (item.mediaUrls.length > 0) {
1029
+ onLog?.(`${idx}/${raw.items.length}: vision describe…`);
1030
+ try {
1031
+ visionDescription = await describeImageByUrl(item.mediaUrls[0]);
1032
+ } catch (e) {
1033
+ onLog?.(`${idx}/${raw.items.length}: vision failed (${e instanceof Error ? e.message : String(e)})`);
1034
+ }
1035
+ }
1036
+
1037
+ const id = `fb-${Date.now()}-${randomUUID().slice(0, 8)}`;
1038
+ const post: LibraryPost = {
1039
+ id,
1040
+ platform: "facebook",
1041
+ source,
1042
+ url: item.permalink || page.url(),
1043
+ author: { handle: item.handle, displayName: item.displayName || undefined },
1044
+ text: item.text,
1045
+ hook,
1046
+ cta,
1047
+ hashtags,
1048
+ mentions,
1049
+ media: [
1050
+ ...item.mediaUrls.map((remoteUrl, i) => ({
1051
+ type: "image" as const,
1052
+ localPath: null,
1053
+ remoteUrl,
1054
+ description: i === 0 ? visionDescription : "",
1055
+ })),
1056
+ ...item.videoUrls.map((remoteUrl) => ({
1057
+ type: "video" as const,
1058
+ localPath: null,
1059
+ remoteUrl,
1060
+ description: "",
1061
+ })),
1062
+ ],
1063
+ engagement: {
1064
+ likes: item.likes,
1065
+ comments: item.comments,
1066
+ shares: item.shares,
1067
+ views: null,
1068
+ saves: null,
1069
+ },
1070
+ engagementRate: null,
1071
+ postType: item.postType as LibraryPost["postType"],
1072
+ postedAt: item.postedAt,
1073
+ tags: [],
1074
+ isGold: false,
1075
+ extractedAt: new Date().toISOString(),
1076
+ notes: "",
1077
+ };
1078
+ posts.push(post);
1079
+ }
1080
+ } catch (e) {
1081
+ errors.push(`Facebook desktop feed scan failed: ${e instanceof Error ? e.message : String(e)}`);
1082
+ }
1083
+
1084
+ return { posts, errors };
1085
+ }
1086
+
1087
/**
 * Open a post's permalink in a background tab, expand the comment thread, and
 * return the bodies of comments that carry FB's "Verfasser" / "Author" badge.
 * The permalink page renders the full comment thread (including that badge on
 * the post author's own comments), which the feed view often omits.
 *
 * @param basePage - Page whose browser context is used to open the background tab.
 * @param permalink - Absolute URL of the post's permalink page.
 * @param ownerHandle - Currently unused: comments are identified by the badge,
 *   not by handle. Kept so existing call sites stay valid.
 * @param ownerDisplayName - Currently unused, kept for the same reason.
 * @returns Up to 12 de-duplicated comment bodies (each at least 10 characters).
 * @throws Propagates navigation / evaluation errors from the tab; the tab is
 *   always closed before the error surfaces.
 */
async function fetchOwnCommentsFromPermalink(
  basePage: Page,
  permalink: string,
  ownerHandle: string,
  ownerDisplayName: string,
): Promise<string[]> {
  const tab = await basePage.context().newPage();
  try {
    await tab.goto(permalink, { waitUntil: "domcontentloaded", timeout: 25_000 });
    await tab.waitForTimeout(2000);

    // Click "View previous comments" / "Weitere Kommentare anzeigen" /
    // "Antworten anzeigen" repeatedly to surface deeper threads. Stop as soon
    // as a pass finds nothing left to click (or the evaluate itself fails).
    for (let pass = 0; pass < 6; pass++) {
      const clicked = await tab.evaluate(() => {
        const candidates = Array.from(
          document.querySelectorAll("div[role='button'], span[role='button'], span"),
        ) as HTMLElement[];
        const re = /^(weitere kommentare anzeigen|view more comments|alle kommentare anzeigen|view all comments|previous comments|vorherige kommentare|antworten anzeigen|view replies?|view all \d+ replies?|\d+ antworten|\d+ replies?|kommentar(e)? anzeigen)$/i;
        let n = 0;
        for (const el of candidates) {
          if (!el.isConnected) continue;
          const t = (el.textContent || "").trim();
          if (t.length === 0 || t.length > 60) continue;
          if (re.test(t)) {
            try { el.click(); n++; } catch { /* noop */ }
          }
        }
        return n;
      }).catch(() => 0);
      if (!clicked) break;
      await tab.waitForTimeout(900);
    }

    // Click "See more" inside any truncated comments (best effort).
    await tab.evaluate(() => {
      const candidates = Array.from(
        document.querySelectorAll("div[role='button'], span[role='button'], span"),
      ) as HTMLElement[];
      for (const el of candidates) {
        if (!el.isConnected) continue;
        const t = (el.textContent || "").trim();
        if (t.length > 30) continue;
        const lc = t.toLowerCase();
        if (lc === "see more" || lc === "mehr anzeigen" || lc === "weiterlesen" || lc === "...mehr" || lc === "show more") {
          try { el.click(); } catch { /* noop */ }
        }
      }
    }).catch(() => {});
    await tab.waitForTimeout(500);

    const comments = await tab.evaluate(() => {
      // Strategy: walk ALL [role='article'] elements that have a
      // "Verfasser" / "Author" badge inside. FB tags comments by the post
      // author with that badge regardless of who reshared the post — so this
      // is more reliable than matching against an upstream-detected owner
      // handle (which would be the resharer's handle, not the original
      // author's, on shared posts).
      const articles = Array.from(document.querySelectorAll("[role='article']")) as HTMLElement[];
      const results: string[] = [];
      const seen = new Set<string>();
      let postAuthorName = "";

      for (const c of articles) {
        if (results.length >= 12) break;

        const al = (c.getAttribute("aria-label") || "").trim();
        // Comment articles only — skip the post itself (no aria-label or
        // a non-comment aria-label like "Beitrag von X").
        if (!/^(Kommentar|Comment|Reply|Antwort)/i.test(al)) continue;

        // "Verfasser" / "Author" badge inside? Only badges that belong to
        // THIS article count: a badge sitting in a nested reply article must
        // not mark the parent comment as authored by the post owner (the
        // nested replies are stripped from the body below, so without this
        // guard a foreign comment would be returned as an own-comment).
        const hasBadge = Array.from(c.querySelectorAll("span, div")).some((el) => {
          if (el.closest("[role='article']") !== c) return false;
          const t = (el.textContent || "").trim();
          return t === "Verfasser" || t === "Author";
        });
        if (!hasBadge) continue;

        // Capture the post-author display name from aria-label
        // (= "Kommentar von <Name> (...)" / "Comment by <Name>").
        if (!postAuthorName) {
          const m = al.match(/^(?:Kommentar von|Comment by|Antwort von|Reply by)\s+(.+?)(?:\s*\(.*\))?$/i);
          if (m) postAuthorName = m[1].trim();
        }

        // Body: clone, strip nested role='article' (replies), strip the
        // "Verfasser" badge; author name, timestamps and action-row labels
        // are filtered line by line afterwards.
        const cClone = c.cloneNode(true) as HTMLElement;
        for (const inner of Array.from(cClone.querySelectorAll("[role='article']"))) {
          if (inner !== cClone) inner.remove();
        }
        for (const el of Array.from(cClone.querySelectorAll("span, div"))) {
          const t = (el.textContent || "").trim();
          if (t === "Verfasser" || t === "Author") el.remove();
        }

        let body = (cClone.innerText || "").trim();
        body = body
          .split("\n")
          .map((l) => l.trimEnd())
          .filter((line, idx, arr) => {
            const tr = line.trim();
            if (!tr) return false;
            // First two lines are typically "Verfasser" + "<Author Name>" —
            // we already removed "Verfasser", but the author name remains.
            // Drop it when seen as a standalone first line.
            if (idx < 2 && postAuthorName && tr === postAuthorName) return false;
            // Drop trailing meta lines: timestamp, "Antworten", "Gefällt mir"…
            if (/^\d+\s*(Min|Std|Tag|Tagen|Wo|Mon|Jahr|min|h|d|w)\.?$/i.test(tr)) return false;
            if (/^vor\s+\d+\s+(Min|Stunden?|Tagen?|Wochen?|Monaten?|Jahren?)/i.test(tr)) return false;
            if (tr === "Antworten" || tr === "Reply" || tr === "Gefällt mir" || tr === "Like" || tr === "Teilen" || tr === "Share") return false;
            // Last few lines: "Bearbeitet" / "Edited" markers
            if (idx >= arr.length - 3 && (tr === "Bearbeitet" || tr === "Edited")) return false;
            return true;
          })
          .join("\n")
          .trim();
        // Too short to be a meaningful comment body.
        if (body.length < 10) continue;
        // De-duplicate on the first 100 characters of the cleaned body.
        const sig = body.slice(0, 100);
        if (seen.has(sig)) continue;
        seen.add(sig);
        results.push(body);
      }
      return results;
    });

    return comments;
  } finally {
    // Always release the background tab, even when navigation/evaluation threw.
    await tab.close().catch(() => { /* noop */ });
  }
}
1224
+
1225
/**
 * Click the first "See more" / "Mehr anzeigen" style toggle inside the first
 * `div[role='article']` on the page to expand truncated text, then wait
 * briefly so the expanded content can render.
 *
 * The accepted labels mirror the "See more" pass used when expanding comments
 * on permalink pages (including "show more"), so both code paths recognise
 * the same toggles.
 *
 * @param page - Page whose first article should be expanded.
 */
async function expandFacebookSeeMore(page: Page): Promise<void> {
  await page.evaluate(() => {
    const article = document.querySelector("div[role='article']");
    if (!article) return;
    // Must be declared inside the callback: it runs in the browser context.
    const toggleLabels = new Set(["see more", "mehr anzeigen", "weiterlesen", "...mehr", "show more"]);
    const candidates = Array.from(article.querySelectorAll("div[role='button'], span")) as HTMLElement[];
    for (const el of candidates) {
      const label = (el.textContent || "").trim().toLowerCase();
      if (toggleLabels.has(label)) {
        // One click is enough; stop so a nested element with the same text
        // is not clicked as well.
        el.click();
        return;
      }
    }
  });
  await page.waitForTimeout(400);
}
1241
+
1242
+ // ─── Helpers ────────────────────────────────────────────────────────────────
1243
+
1244
/**
 * Extract the "hook" of a post: the first sentence of its first line, or the
 * whole first line when no sentence terminator followed by whitespace is
 * found, capped at 140 UTF-16 code units.
 *
 * @param text - Full post text.
 * @returns The trimmed hook; an empty string for empty/whitespace-only input.
 */
function extractHook(text: string): string {
  const maxHookLength = 140;
  const trimmed = text.trim();
  // Only the first line is considered.
  const firstLineBreak = trimmed.indexOf("\n");
  const firstLine = firstLineBreak > 0 ? trimmed.slice(0, firstLineBreak) : trimmed;
  // A sentence ends at the first ".", "!" or "?" that is followed by whitespace.
  const firstSentenceMatch = firstLine.match(/^(.+?[.!?])\s/);
  const candidate = firstSentenceMatch ? firstSentenceMatch[1] : firstLine;

  let hook = candidate.slice(0, maxHookLength);
  if (candidate.length > maxHookLength) {
    // Do not cut an astral character (e.g. an emoji) in half: if truncation
    // left a dangling high surrogate at the end, drop it.
    const lastUnit = hook.charCodeAt(hook.length - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) hook = hook.slice(0, -1);
  }
  return hook.trim();
}
1253
+
1254
/**
 * Very simple CTA detection: questions, imperatives, "link in bio" phrases.
 *
 * Scans at most the last four non-empty lines, bottom-up, and returns the
 * first line that looks like a call to action.
 *
 * @param text - Full post text.
 * @returns The matching (trimmed) line, or `null` when none matches.
 */
function detectCta(text: string): string | null {
  const lines = text.split(/\n+/).map((l) => l.trim()).filter(Boolean);

  // A question may be followed by extra "!"/"?" or trailing emoji
  // ("Was denkst du? 👇"), which is very common in social posts.
  const questionRe = /\?[!?\s\p{Extended_Pictographic}\p{Emoji_Modifier}\u200d\ufe0f]*$/u;
  const linkPhraseRe = /link in bio|link in der bio|tap the link|mehr im link|swipe up/i;
  const imperativeRe = /^(kommentiere|schreib|teile|kommentier|tag|follow|folg|speicher)/i;

  // Check from the end — CTAs are usually at the bottom.
  const firstCandidate = Math.max(0, lines.length - 4);
  for (let i = lines.length - 1; i >= firstCandidate; i--) {
    const line = lines[i];
    if (!line) continue;
    if (questionRe.test(line)) return line; // question
    if (linkPhraseRe.test(line)) return line;
    if (imperativeRe.test(line)) return line;
  }
  return null;
}
1267
+
1268
/**
 * Media download helper — optional future use (vision currently works on
 * remote URLs). Fetches `_url` and stores the response body under
 * `MEDIA_ROOT` as `<_postId>-<_index>.<ext>`, where the extension is derived
 * from the response's content-type header.
 *
 * @param _url - Remote media URL to fetch.
 * @param _postId - Post id used as the file-name prefix.
 * @param _index - Position of the media item within the post.
 * @returns The path of the written file, or `null` when the request was not
 *   OK or fetching/writing threw.
 */
export async function downloadImage(
  _url: string,
  _postId: string,
  _index: number,
): Promise<string | null> {
  ensureDirs();
  try {
    const response = await fetch(_url);
    if (response.ok) {
      const bytes = Buffer.from(await response.arrayBuffer());
      const extension = guessExt(response.headers.get("content-type"));
      const target = join(MEDIA_ROOT, `${_postId}-${_index}.${extension}`);
      writeFileSync(target, bytes);
      return target;
    }
    return null;
  } catch {
    // Best effort: any network or file-system failure yields null.
    return null;
  }
}
1289
+
1290
/**
 * Map a Content-Type header value to a file extension.
 *
 * Matching is by substring and case-insensitive (media types are
 * case-insensitive, so "IMAGE/PNG" is as valid as "image/png"); parameters
 * such as "; charset=…" are therefore ignored.
 *
 * @param ct - Raw Content-Type header value, or `null` when absent.
 * @returns "jpg", "png", "webp", "gif" or "mp4"; "bin" for anything else.
 */
function guessExt(ct: string | null): string {
  if (!ct) return "bin";
  const normalized = ct.toLowerCase();
  // Checked in order; the first needle found wins.
  const table: ReadonlyArray<readonly [needle: string, ext: string]> = [
    ["jpeg", "jpg"],
    ["jpg", "jpg"],
    ["png", "png"],
    ["webp", "webp"],
    ["gif", "gif"],
    ["mp4", "mp4"],
  ];
  for (const [needle, ext] of table) {
    if (normalized.includes(needle)) return ext;
  }
  return "bin";
}