@pencil-agent/nano-pencil 2.0.0-beta.8 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +267 -267
- package/dist/build-meta.json +3 -3
- package/dist/core/export-html/AGENT.md +11 -11
- package/dist/core/export-html/template.css +971 -971
- package/dist/core/export-html/template.html +54 -54
- package/dist/core/extensions-host/index.d.ts +1 -1
- package/dist/core/extensions-host/loader.js +1 -1
- package/dist/core/extensions-host/runner.d.ts +1 -0
- package/dist/core/extensions-host/runner.js +2 -2
- package/dist/core/extensions-host/types.d.ts +17 -22
- package/dist/core/lib/ai/src/types.d.ts +12 -2
- package/dist/core/persona/persona-manager.js +5 -2
- package/dist/core/runtime/agent-session.js +3 -3
- package/dist/core/runtime/extension-core-bindings.d.ts +1 -0
- package/dist/core/runtime/extension-core-bindings.js +2 -2
- package/dist/extensions/builtin/AGENT.md +115 -115
- package/dist/extensions/builtin/browser/AGENT.md +17 -17
- package/dist/extensions/builtin/browser/agent-workspace/agent_helpers.py +12 -12
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/amazon/product-search.md +198 -198
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/archive-org/scraping.md +341 -341
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/arxiv/scraping.md +311 -311
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/arxiv-bulk/scraping.md +333 -333
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/atlas/overview.md +70 -70
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/booking-com/scraping.md +578 -578
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/capterra/scraping.md +440 -440
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/centilebrain/generate-estimates.md +110 -110
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/coingecko/scraping.md +325 -325
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/coinmarketcap/scraping.md +463 -463
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/coursera/scraping.md +360 -360
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/craigslist/scraping.md +390 -390
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/crossref/scraping.md +568 -568
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/dev-to/scraping.md +323 -323
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/duckduckgo/scraping.md +349 -349
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/ebay/scraping.md +435 -435
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/etsy/scraping.md +506 -506
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/eventbrite/scraping.md +363 -363
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/expedia/automation.md +168 -168
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/facebook/groups.md +236 -236
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/facebook/pages.md +295 -295
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/framer/editor.md +108 -108
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/fred/scraping.md +493 -493
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/g2/scraping.md +580 -580
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/genius/scraping.md +511 -511
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/github/repo-actions.md +65 -65
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/github/scraping.md +184 -184
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/glassdoor/scraping.md +543 -543
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/gmail/compose.md +122 -122
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/goodreads/scraping.md +461 -461
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/gutenberg/scraping.md +383 -383
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/hackernews/scraping.md +243 -243
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/howlongtobeat/scraping.md +473 -473
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/imdb/scraping.md +271 -271
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/itch-io/scraping.md +436 -436
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/job-boards/indeed-glassdoor.md +1021 -1021
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/letterboxd/scraping.md +349 -349
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/linkedin/invitation-manager.md +109 -109
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/loom/folder-enumeration.md +170 -170
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/macrotrends/scraping.md +537 -537
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/medium/article-hydration.md +120 -120
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/medium/scraping.md +414 -414
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/metacritic/scraping.md +477 -477
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/musicbrainz/scraping.md +478 -478
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/nasa/scraping.md +339 -339
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/news-aggregation/multi-source.md +205 -205
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/open-library/scraping.md +472 -472
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/openalex/scraping.md +470 -470
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/openstreetmap/scraping.md +490 -490
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/package-registries/npm-pypi.md +478 -478
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/polymarket/scraping.md +234 -234
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/producthunt/scraping.md +307 -307
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/pubmed/scraping.md +421 -421
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/quora/scraping.md +364 -364
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/rawg/scraping.md +352 -352
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/reddit/scraping.md +124 -124
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/rest-countries/scraping.md +233 -233
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/sec-edgar/scraping.md +361 -361
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/README.md +36 -36
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/embedded-apps.md +72 -72
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/knowledge-base.md +109 -109
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/polaris-inputs.md +137 -137
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/soundcloud/scraping.md +362 -362
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/spotify/scraping.md +339 -339
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/stackoverflow/scraping.md +435 -435
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/steam/scraping.md +575 -575
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/substack/scraping.md +338 -338
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/thetechgeeks/pricing.md +52 -52
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/tiktok/upload.md +107 -107
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/tradingview/scraping.md +309 -309
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/trello/boards-and-lists.md +88 -88
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/trustpilot/scraping.md +375 -375
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/walmart/scraping.md +444 -444
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/wayback-machine/scraping.md +306 -306
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/weather/scraping.md +398 -398
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/wellfound/scraping.md +596 -596
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/world-bank/scraping.md +356 -356
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/xiaohongshu/scraping.md +84 -84
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/youtube/scraping.md +418 -418
- package/dist/extensions/builtin/browser/agent-workspace/domain-skills/zillow/scraping.md +433 -433
- package/dist/extensions/builtin/browser/browser.md +73 -73
- package/dist/extensions/builtin/browser/install.md +142 -142
- package/dist/extensions/builtin/browser/interaction-skills/connection.md +48 -48
- package/dist/extensions/builtin/browser/interaction-skills/cookies.md +3 -3
- package/dist/extensions/builtin/browser/interaction-skills/cross-origin-iframes.md +3 -3
- package/dist/extensions/builtin/browser/interaction-skills/dialogs.md +64 -64
- package/dist/extensions/builtin/browser/interaction-skills/downloads.md +3 -3
- package/dist/extensions/builtin/browser/interaction-skills/drag-and-drop.md +3 -3
- package/dist/extensions/builtin/browser/interaction-skills/dropdowns.md +3 -3
- package/dist/extensions/builtin/browser/interaction-skills/iframes.md +3 -3
- package/dist/extensions/builtin/browser/interaction-skills/network-requests.md +3 -3
- package/dist/extensions/builtin/browser/interaction-skills/print-as-pdf.md +3 -3
- package/dist/extensions/builtin/browser/interaction-skills/profile-sync.md +90 -90
- package/dist/extensions/builtin/browser/interaction-skills/screenshots.md +17 -17
- package/dist/extensions/builtin/browser/interaction-skills/scrolling.md +3 -3
- package/dist/extensions/builtin/browser/interaction-skills/shadow-dom.md +3 -3
- package/dist/extensions/builtin/browser/interaction-skills/tabs.md +69 -69
- package/dist/extensions/builtin/browser/interaction-skills/uploads.md +1 -1
- package/dist/extensions/builtin/browser/interaction-skills/viewport.md +3 -3
- package/dist/extensions/builtin/browser/src/browser_harness/AGENT.md +15 -15
- package/dist/extensions/builtin/browser/src/browser_harness/__init__.py +8 -8
- package/dist/extensions/builtin/browser/src/browser_harness/_ipc.py +90 -90
- package/dist/extensions/builtin/browser/src/browser_harness/admin.py +722 -722
- package/dist/extensions/builtin/browser/src/browser_harness/daemon.py +328 -328
- package/dist/extensions/builtin/browser/src/browser_harness/helpers.py +396 -396
- package/dist/extensions/builtin/browser/src/browser_harness/run.py +103 -103
- package/dist/extensions/builtin/discipline/skills/brainstorming/SKILL.md +33 -33
- package/dist/extensions/builtin/discipline/skills/executing-plans/SKILL.md +25 -25
- package/dist/extensions/builtin/discipline/skills/finishing-development-branch/SKILL.md +25 -25
- package/dist/extensions/builtin/discipline/skills/receiving-code-review/SKILL.md +22 -22
- package/dist/extensions/builtin/discipline/skills/requesting-code-review/SKILL.md +31 -31
- package/dist/extensions/builtin/discipline/skills/systematic-debugging/SKILL.md +28 -28
- package/dist/extensions/builtin/discipline/skills/test-driven-development/SKILL.md +32 -32
- package/dist/extensions/builtin/discipline/skills/using-git-worktrees/SKILL.md +25 -25
- package/dist/extensions/builtin/discipline/skills/verification-before-completion/SKILL.md +27 -27
- package/dist/extensions/builtin/discipline/skills/writing-plans/SKILL.md +26 -26
- package/dist/extensions/builtin/goal/README.md +67 -67
- package/dist/extensions/builtin/goal/goal-controller.d.ts +39 -10
- package/dist/extensions/builtin/goal/goal-controller.js +1 -1
- package/dist/extensions/builtin/goal/goal-format.js +1 -1
- package/dist/extensions/builtin/goal/goal-prompts.d.ts +2 -0
- package/dist/extensions/builtin/goal/goal-prompts.js +5 -4
- package/dist/extensions/builtin/goal/goal-store.js +1 -1
- package/dist/extensions/builtin/goal/index.d.ts +1 -1
- package/dist/extensions/builtin/goal/index.js +10 -7
- package/dist/extensions/builtin/grub/README.md +112 -112
- package/dist/extensions/builtin/link-world/agent-workspace/README.md +16 -16
- package/dist/extensions/builtin/link-world/index.js +6 -6
- package/dist/extensions/builtin/link-world/internet-search/internet-search.md +65 -65
- package/dist/extensions/builtin/link-world/link-world-agent.md +82 -82
- package/dist/extensions/builtin/link-world/linkworld.md +313 -313
- package/dist/extensions/builtin/link-world/{network-routing.md → network-routing/network-routing.md} +67 -67
- package/dist/extensions/builtin/loop/README.md +92 -92
- package/dist/extensions/builtin/mcp/figma-design.md +68 -68
- package/dist/extensions/builtin/mcp/mcp-management.md +85 -85
- package/dist/extensions/builtin/plan/index.js +1 -1
- package/dist/extensions/builtin/recap/AGENT.md +15 -15
- package/dist/extensions/builtin/sal/README.md +72 -72
- package/dist/extensions/builtin/security-audit/README.md +289 -289
- package/dist/extensions/builtin/task/task-store.d.ts +4 -0
- package/dist/extensions/builtin/task/task-store.js +1 -1
- package/dist/extensions/builtin/team/AGENT.md +112 -112
- package/dist/extensions/builtin/team/TESTING.md +299 -299
- package/dist/extensions/builtin/token-save/README.md +56 -56
- package/dist/extensions/optional/AGENT.md +10 -10
- package/dist/index.d.ts +5 -30
- package/dist/index.js +1 -1
- package/dist/models.d.ts +7 -0
- package/dist/models.js +1 -0
- package/dist/modes/interactive/components/footer.js +1 -1
- package/dist/modes/interactive/components/task-status-panel.d.ts +36 -0
- package/dist/modes/interactive/components/task-status-panel.js +1 -0
- package/dist/modes/interactive/controllers/stream-render-controller.d.ts +7 -0
- package/dist/modes/interactive/controllers/stream-render-controller.js +2 -2
- package/dist/modes/interactive/interactive-mode.js +40 -40
- package/dist/modes/interactive/state/interactive-state.d.ts +2 -0
- package/dist/modes/interactive/state/interactive-state.js +1 -1
- package/dist/modes/interactive/theme/dark.json +85 -85
- package/dist/modes/interactive/theme/light.json +84 -84
- package/dist/modes/interactive/theme/theme-schema.json +335 -335
- package/dist/modes/interactive/theme/warm.json +81 -81
- package/dist/node_modules/@pencil-agent/ai/dist/cli.js +0 -0
- package/dist/node_modules/@pencil-agent/ai/dist/models.generated.js +1 -1
- package/dist/node_modules/@pencil-agent/ai/dist/providers/anthropic.js +2 -2
- package/dist/node_modules/@pencil-agent/ai/dist/providers/openai-completions.js +5 -5
- package/dist/node_modules/@pencil-agent/ai/dist/providers/openai-responses.js +1 -1
- package/dist/node_modules/@pencil-agent/ai/dist/stream.js +1 -1
- package/dist/packages/protocol/src/commands.d.ts +33 -0
- package/dist/packages/protocol/src/flags.d.ts +20 -0
- package/dist/packages/protocol/src/hooks.d.ts +17 -0
- package/dist/packages/protocol/src/hooks.js +0 -0
- package/dist/packages/{extension-sdk → protocol}/src/index.d.ts +7 -4
- package/dist/packages/protocol/src/index.js +1 -0
- package/dist/packages/{extension-sdk → protocol}/src/lifecycle.d.ts +15 -27
- package/dist/packages/protocol/src/lifecycle.js +0 -0
- package/dist/packages/{extension-sdk → protocol}/src/tools.d.ts +1 -1
- package/dist/packages/protocol/src/tools.js +0 -0
- package/dist/public-config.d.ts +12 -0
- package/dist/public-config.js +1 -0
- package/dist/runtime.d.ts +9 -0
- package/dist/runtime.js +1 -0
- package/dist/session-compaction.d.ts +7 -0
- package/dist/session-compaction.js +1 -0
- package/dist/session.d.ts +7 -0
- package/dist/session.js +1 -0
- package/dist/skills.d.ts +7 -0
- package/dist/skills.js +1 -0
- package/dist/tools.d.ts +7 -0
- package/dist/tools.js +1 -0
- package/docs/ACP/345/215/217/350/256/256/351/233/206/346/210/220/345/274/200/345/217/221/346/226/207/346/241/243.md +851 -0
- package/docs/SDK-TESTING.md +364 -0
- package/docs/codex-goal-command-impl.md +1055 -1055
- package/docs/codex-goal-vs-grub.md +500 -500
- package/docs/custom-provider.md +27 -27
- package/docs/extensions.md +27 -27
- package/docs/keybindings.md +27 -27
- package/docs/loop 重构完成总结.md +250 -250
- package/docs/loop 重构完成报告.md +122 -122
- package/docs/loop 重构方案.md +1222 -1222
- package/docs/loop 重构方案实现报告.md +158 -158
- package/docs/loop 重构方案对比分析.md +128 -128
- package/docs/loop 重构计划.md +320 -320
- package/docs/loop-usage-examples.md +214 -214
- package/docs/mem-core/346/212/200/346/234/257/346/226/207/346/241/243.md +593 -0
- package/docs/models.md +27 -27
- package/docs/packages.md +27 -27
- package/docs/pi-design-philosophy.md +457 -457
- package/docs/planmode.md +1987 -1987
- package/docs/prompt-templates.md +27 -27
- package/docs/providers.md +27 -27
- package/docs/sdk.md +27 -27
- package/docs/skills.md +27 -27
- package/docs/startup-performance-optimization.md +301 -0
- package/docs/themes.md +27 -27
- package/docs/tui.md +27 -27
- package/docs/认知地图.md +47 -0
- package/package.json +190 -162
- package/dist/packages/extension-sdk/src/index.js +0 -1
- package/docs/cc-agent-design.md +0 -1297
- package/docs/cc-tui-design.md +0 -1333
- package/docs//345/257/271/346/240/207Claude-Code.md +0 -1775
- /package/dist/packages/{extension-sdk/src/lifecycle.js → protocol/src/commands.js} +0 -0
- /package/dist/packages/{extension-sdk/src/tools.js → protocol/src/flags.js} +0 -0
|
@@ -1,236 +1,236 @@
|
|
|
1
|
-
# Facebook Groups — mining feeds for posts + external URLs
|
|
2
|
-
|
|
3
|
-
Field-tested against a logged-in Jay account on 2026-04-18.
|
|
4
|
-
**Requires:** Browser Harness driving a real Chrome that is (a) signed into
|
|
5
|
-
Facebook and (b) already a member of the target group. Non-member or logged-out
|
|
6
|
-
views serve a stripped landing page with no post content.
|
|
7
|
-
|
|
8
|
-
## What this skill is for
|
|
9
|
-
|
|
10
|
-
1. Pull the N most recent posts from a named FB group
|
|
11
|
-
2. Harvest every external URL that members have shared
|
|
12
|
-
3. Hand that URL list to Firecrawl (or `http_get`) for structured scraping at scale
|
|
13
|
-
4. Cache post text + author + timestamp for downstream keyword matching
|
|
14
|
-
|
|
15
|
-
It is NOT for: replying in groups, DMing members, or any write action.
|
|
16
|
-
|
|
17
|
-
## URL patterns
|
|
18
|
-
|
|
19
|
-
| What | URL |
|
|
20
|
-
|------|-----|
|
|
21
|
-
| Group main feed | `https://www.facebook.com/groups/{id_or_slug}` |
|
|
22
|
-
| Group "Discussion" tab (canonical feed) | `https://www.facebook.com/groups/{id_or_slug}/?sorting_setting=CHRONOLOGICAL` |
|
|
23
|
-
| Single post (permalink) | `https://www.facebook.com/groups/{id_or_slug}/posts/{post_id}/` |
|
|
24
|
-
| User's joined-groups feed | `https://www.facebook.com/groups/feed/` |
|
|
25
|
-
| List of YOUR groups | `https://www.facebook.com/groups/joins/` |
|
|
26
|
-
|
|
27
|
-
The `?sorting_setting=CHRONOLOGICAL` flag matters — without it, FB inserts an
|
|
28
|
-
algorithmic ranking that hides older posts and shows the same handful of "popular"
|
|
29
|
-
items every visit, which kills monitoring use cases.
|
|
30
|
-
|
|
31
|
-
## DOM anchors (verified 2026-04-18)
|
|
32
|
-
|
|
33
|
-
FB rewrites class names every few weeks but ARIA roles and stable URL patterns
|
|
34
|
-
hold up well. Anchor on those, not on hashed CSS classes.
|
|
35
|
-
|
|
36
|
-
| Anchor | Selector | Notes |
|
|
37
|
-
|--------|----------|-------|
|
|
38
|
-
| Each post container | `div[role="article"]` | Stable. One per visible post. |
|
|
39
|
-
| Post permalink | `a[href*="/groups/"][href*="/posts/"], a[href*="/groups/"][href*="/permalink/"]` | First match per article = the post link |
|
|
40
|
-
| Post body text | `div[data-ad-preview="message"], div[data-ad-comet-preview="message"]` | One of these is the visible body |
|
|
41
|
-
| Post author | `h3 a, h4 a` (first inside the article) | Falls back to `strong a` |
|
|
42
|
-
| Post timestamp | `a[href*="/posts/"] abbr, a[role="link"] > span > span` (relative time text) | Hover gets the absolute time but the relative string is fine for sorting |
|
|
43
|
-
| External link (FB redirector) | `a[href^="https://l.facebook.com/l.php?u="]` | Decode the `u=` param to get the real URL |
|
|
44
|
-
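| "See more" button on long posts | `div[role="button"]:has(span:contains("See more"))` (note: `:contains` is jQuery-only and is rejected by `querySelectorAll`, so in the browser use an XPath fallback such as `//div[@role="button"][.//span[contains(., "See more")]]`; the same fallback applies if `:has` is unsupported) | Click before reading body or posts get truncated |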
|
|
45
|
-
|
|
46
|
-
If selectors stop returning results, run the self-inspection block at the bottom
|
|
47
|
-
of this file and update this table — that's the workflow, not a fallback.
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
## Scrolling the feed (lazy load)
|
|
51
|
-
|
|
52
|
-
FB virtualizes the feed: scrolled-past posts get unmounted from the DOM. So
|
|
53
|
-
"scroll then collect" misses old posts. Pattern that works: **collect-as-you-go.**
|
|
54
|
-
|
|
55
|
-
```python
|
|
56
|
-
seen = {} # post_url -> dict
|
|
57
|
-
TARGET = 50 # how many posts to collect
|
|
58
|
-
MAX_SCROLLS = 30
|
|
59
|
-
|
|
60
|
-
for i in range(MAX_SCROLLS):
|
|
61
|
-
new_posts = js("""
|
|
62
|
-
Array.from(document.querySelectorAll('div[role="article"]')).map(el => {
|
|
63
|
-
const link = el.querySelector('a[href*="/groups/"][href*="/posts/"], a[href*="/groups/"][href*="/permalink/"]');
|
|
64
|
-
const body = el.querySelector('div[data-ad-preview="message"], div[data-ad-comet-preview="message"]');
|
|
65
|
-
const author = el.querySelector('h3 a, h4 a, strong a');
|
|
66
|
-
const time = el.querySelector('abbr, a[role="link"] > span > span');
|
|
67
|
-
const externals = Array.from(el.querySelectorAll('a[href^="https://l.facebook.com/l.php?u="]'))
|
|
68
|
-
.map(a => a.href);
|
|
69
|
-
return {
|
|
70
|
-
url: link?.href || null,
|
|
71
|
-
author: author?.innerText || null,
|
|
72
|
-
time: time?.innerText || null,
|
|
73
|
-
body: body?.innerText?.slice(0, 4000) || null,
|
|
74
|
-
externals: externals,
|
|
75
|
-
};
|
|
76
|
-
}).filter(p => p.url)
|
|
77
|
-
""") or []
|
|
78
|
-
for p in new_posts:
|
|
79
|
-
seen.setdefault(p["url"], p)
|
|
80
|
-
if len(seen) >= TARGET:
|
|
81
|
-
break
|
|
82
|
-
scroll(640, 400, dy=900) # scroll near middle of viewport
|
|
83
|
-
wait(2.5) # FB needs ~2s to render new batch + a little buffer
|
|
84
|
-
```
|
|
85
|
-
|
|
86
|
-
`wait(2.5)` is the floor. Faster than that and you'll see empty post containers
|
|
87
|
-
because React hasn't hydrated them yet.
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
## Decoding the external-URL redirector
|
|
91
|
-
|
|
92
|
-
Every external link gets wrapped in `https://l.facebook.com/l.php?u={URL-encoded real URL}&h=...`.
|
|
93
|
-
You want the real URL, not the redirector.
|
|
94
|
-
|
|
95
|
-
```python
|
|
96
|
-
from urllib.parse import urlparse, parse_qs, unquote
|
|
97
|
-
def decode_fb_link(href):
|
|
98
|
-
if not href.startswith("https://l.facebook.com/l.php"):
|
|
99
|
-
return href
|
|
100
|
-
q = parse_qs(urlparse(href).query)
|
|
101
|
-
return unquote(q["u"][0]) if "u" in q else href
|
|
102
|
-
```
|
|
103
|
-
|
|
104
|
-
## Handoff to Firecrawl (for the public outbound URLs)
|
|
105
|
-
|
|
106
|
-
Once you have the harvested external list, those URLs are outside FB's walled
|
|
107
|
-
garden — public, scrapable by anything. Firecrawl's schema-native extraction
|
|
108
|
-
shines here because you want typed results across heterogeneous sources.
|
|
109
|
-
|
|
110
|
-
```python
|
|
111
|
-
# After the scroll loop:
|
|
112
|
-
external_urls = []
|
|
113
|
-
for p in seen.values():
|
|
114
|
-
for raw in p["externals"]:
|
|
115
|
-
external_urls.append(decode_fb_link(raw))
|
|
116
|
-
external_urls = sorted(set(external_urls))
|
|
117
|
-
print(f"harvested {len(external_urls)} unique external URLs")
|
|
118
|
-
|
|
119
|
-
# Hand off to Firecrawl MCP in the calling conversation:
|
|
120
|
-
# firecrawl_extract(
|
|
121
|
-
# urls=external_urls,
|
|
122
|
-
# prompt="Extract product/listing name, price, location, year, and key features.",
|
|
123
|
-
# schema={...}
|
|
124
|
-
# )
|
|
125
|
-
```
|
|
126
|
-
|
|
127
|
-
When Firecrawl isn't available or the pages are simple, `http_get(url)` from
|
|
128
|
-
Harness itself is fine — it does a plain HTTP fetch without a browser, works
|
|
129
|
-
for static pages, and is the fastest option for bulk.
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
## Rate-limit discipline
|
|
133
|
-
|
|
134
|
-
FB notices automation patterns at the account level, not the IP level. Driving
|
|
135
|
-
a real logged-in session means Jay's account is the one getting rate-limited if
|
|
136
|
-
you get greedy. Keep these floors:
|
|
137
|
-
|
|
138
|
-
- **≥2 seconds between scrolls** in the collect loop (the `wait(2.5)` above)
|
|
139
|
-
- **≥3 seconds between groups** if you're sweeping multiple
|
|
140
|
-
- **No more than ~6 groups per hour** for sustained monitoring
|
|
141
|
-
- **Don't open the same group more than every 15 minutes** — repeated visits
|
|
142
|
-
within a short window trip a heuristic that triggers checkpoints
|
|
143
|
-
|
|
144
|
-
Symptoms of over-pacing: article containers start rendering with empty bodies,
|
|
145
|
-
`/groups/{id}/` redirects to `/checkpoint/`, or the account briefly gets asked
|
|
146
|
-
to re-verify a phone or confirm a login from a new device. If that happens,
|
|
147
|
-
**stop immediately** and let Jay deal with the UI — don't try to auto-resolve.
|
|
148
|
-
|
|
149
|
-
## Self-inspection block (run this when selectors stop working)
|
|
150
|
-
|
|
151
|
-
Paste this into a Harness stdin block to see what anchors currently exist in the
|
|
152
|
-
visible feed. Run it on a group you're a member of.
|
|
153
|
-
|
|
154
|
-
```python
|
|
155
|
-
print(js("""
|
|
156
|
-
({
|
|
157
|
-
articles: document.querySelectorAll('div[role="article"]').length,
|
|
158
|
-
body_preview_a: document.querySelectorAll('div[data-ad-preview="message"]').length,
|
|
159
|
-
body_preview_b: document.querySelectorAll('div[data-ad-comet-preview="message"]').length,
|
|
160
|
-
external_redirectors: document.querySelectorAll('a[href^="https://l.facebook.com/l.php?u="]').length,
|
|
161
|
-
permalink_posts: document.querySelectorAll('a[href*="/groups/"][href*="/posts/"]').length,
|
|
162
|
-
permalink_permalinks: document.querySelectorAll('a[href*="/groups/"][href*="/permalink/"]').length,
|
|
163
|
-
})
|
|
164
|
-
"""))
|
|
165
|
-
# If any count is 0, the selector drifted. Open DevTools, right-click a visible
|
|
166
|
-
# post, inspect, find the new stable attribute (aria-*, data-*), and update the
|
|
167
|
-
# DOM anchors table above.
|
|
168
|
-
```
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
## Full example — mine one group, emit JSON for downstream tools
|
|
172
|
-
|
|
173
|
-
```bash
|
|
174
|
-
cd ~/Developer/browser-harness && uv run browser-harness <<'PY'
|
|
175
|
-
import json, sys
|
|
176
|
-
from urllib.parse import urlparse, parse_qs, unquote
|
|
177
|
-
|
|
178
|
-
GROUP = "riceLakeBoating" # slug or numeric id
|
|
179
|
-
TARGET = 50 # how many posts to collect
|
|
180
|
-
MAX_SCROLLS = 30
|
|
181
|
-
|
|
182
|
-
goto_url(f"https://www.facebook.com/groups/{GROUP}/?sorting_setting=CHRONOLOGICAL")
|
|
183
|
-
wait_for_load()
|
|
184
|
-
wait(2)
|
|
185
|
-
|
|
186
|
-
# Abort if FB bounced us
|
|
187
|
-
info = page_info()
|
|
188
|
-
if "/checkpoint/" in info["url"] or "/login" in info["url"]:
|
|
189
|
-
sys.exit("AUTH_WALL — stop and have Jay re-verify the account.")
|
|
190
|
-
|
|
191
|
-
seen = {}
|
|
192
|
-
for _ in range(MAX_SCROLLS):
|
|
193
|
-
batch = js("""
|
|
194
|
-
Array.from(document.querySelectorAll('div[role="article"]')).map(el => {
|
|
195
|
-
const link = el.querySelector('a[href*="/groups/"][href*="/posts/"], a[href*="/groups/"][href*="/permalink/"]');
|
|
196
|
-
const body = el.querySelector('div[data-ad-preview="message"], div[data-ad-comet-preview="message"]');
|
|
197
|
-
const author = el.querySelector('h3 a, h4 a, strong a');
|
|
198
|
-
const time = el.querySelector('abbr, a[role="link"] > span > span');
|
|
199
|
-
const externals = Array.from(el.querySelectorAll('a[href^="https://l.facebook.com/l.php?u="]')).map(a => a.href);
|
|
200
|
-
return { url: link?.href, author: author?.innerText, time: time?.innerText,
|
|
201
|
-
body: body?.innerText?.slice(0, 4000), externals };
|
|
202
|
-
}).filter(p => p.url)
|
|
203
|
-
""") or []
|
|
204
|
-
for p in batch:
|
|
205
|
-
seen.setdefault(p["url"], p)
|
|
206
|
-
if len(seen) >= TARGET:
|
|
207
|
-
break
|
|
208
|
-
scroll(640, 400, dy=900)
|
|
209
|
-
wait(2.5)
|
|
210
|
-
|
|
211
|
-
def decode(u):
|
|
212
|
-
if not u.startswith("https://l.facebook.com/l.php"): return u
|
|
213
|
-
q = parse_qs(urlparse(u).query)
|
|
214
|
-
return unquote(q["u"][0]) if "u" in q else u
|
|
215
|
-
|
|
216
|
-
posts = list(seen.values())
|
|
217
|
-
all_externals = sorted({decode(x) for p in posts for x in p["externals"]})
|
|
218
|
-
capture_screenshot(f"/tmp/fb-group-{GROUP}.png", full=True)
|
|
219
|
-
print(json.dumps({
|
|
220
|
-
"group": GROUP,
|
|
221
|
-
"post_count": len(posts),
|
|
222
|
-
"posts": posts,
|
|
223
|
-
"external_urls": all_externals,
|
|
224
|
-
}, ensure_ascii=False))
|
|
225
|
-
PY
|
|
226
|
-
```
|
|
227
|
-
|
|
228
|
-
The JSON on stdout is the handoff payload — parse it in the calling agent and
|
|
229
|
-
route `external_urls` into `firecrawl_extract` with whatever schema matches the
|
|
230
|
-
downstream task (competitor inventory, pricing intel, boat listings, etc).
|
|
231
|
-
|
|
232
|
-
## Gotchas log (append when you hit something new)
|
|
233
|
-
|
|
234
|
-
- **2026-04-18:** Fresh install verified. People-search URL requires login;
|
|
235
|
-
page search `/search/pages/?q=` works the same way. Groups feed defaults to
|
|
236
|
-
algorithmic sort — always append `?sorting_setting=CHRONOLOGICAL`.
|
|
1
|
+
# Facebook Groups — mining feeds for posts + external URLs
|
|
2
|
+
|
|
3
|
+
Field-tested against a logged-in Jay account on 2026-04-18.
|
|
4
|
+
**Requires:** Browser Harness driving a real Chrome that is (a) signed into
|
|
5
|
+
Facebook and (b) already a member of the target group. Non-member or logged-out
|
|
6
|
+
views serve a stripped landing page with no post content.
|
|
7
|
+
|
|
8
|
+
## What this skill is for
|
|
9
|
+
|
|
10
|
+
1. Pull the N most recent posts from a named FB group
|
|
11
|
+
2. Harvest every external URL that members have shared
|
|
12
|
+
3. Hand that URL list to Firecrawl (or `http_get`) for structured scraping at scale
|
|
13
|
+
4. Cache post text + author + timestamp for downstream keyword matching
|
|
14
|
+
|
|
15
|
+
It is NOT for: replying in groups, DMing members, or any write action.
|
|
16
|
+
|
|
17
|
+
## URL patterns
|
|
18
|
+
|
|
19
|
+
| What | URL |
|
|
20
|
+
|------|-----|
|
|
21
|
+
| Group main feed | `https://www.facebook.com/groups/{id_or_slug}` |
|
|
22
|
+
| Group "Discussion" tab (canonical feed) | `https://www.facebook.com/groups/{id_or_slug}/?sorting_setting=CHRONOLOGICAL` |
|
|
23
|
+
| Single post (permalink) | `https://www.facebook.com/groups/{id_or_slug}/posts/{post_id}/` |
|
|
24
|
+
| User's joined-groups feed | `https://www.facebook.com/groups/feed/` |
|
|
25
|
+
| List of YOUR groups | `https://www.facebook.com/groups/joins/` |
|
|
26
|
+
|
|
27
|
+
The `?sorting_setting=CHRONOLOGICAL` flag matters — without it, FB inserts an
|
|
28
|
+
algorithmic ranking that hides older posts and shows the same handful of "popular"
|
|
29
|
+
items every visit, which kills monitoring use cases.
|
|
30
|
+
|
|
31
|
+
## DOM anchors (verified 2026-04-18)
|
|
32
|
+
|
|
33
|
+
FB rewrites class names every few weeks but ARIA roles and stable URL patterns
|
|
34
|
+
hold up well. Anchor on those, not on hashed CSS classes.
|
|
35
|
+
|
|
36
|
+
| Anchor | Selector | Notes |
|
|
37
|
+
|--------|----------|-------|
|
|
38
|
+
| Each post container | `div[role="article"]` | Stable. One per visible post. |
|
|
39
|
+
| Post permalink | `a[href*="/groups/"][href*="/posts/"], a[href*="/groups/"][href*="/permalink/"]` | First match per article = the post link |
|
|
40
|
+
| Post body text | `div[data-ad-preview="message"], div[data-ad-comet-preview="message"]` | One of these is the visible body |
|
|
41
|
+
| Post author | `h3 a, h4 a` (first inside the article) | Falls back to `strong a` |
|
|
42
|
+
| Post timestamp | `a[href*="/posts/"] abbr, a[role="link"] > span > span` (relative time text) | Hover gets the absolute time but the relative string is fine for sorting |
|
|
43
|
+
| External link (FB redirector) | `a[href^="https://l.facebook.com/l.php?u="]` | Decode the `u=` param to get the real URL |
|
|
44
|
+
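| "See more" button on long posts | `div[role="button"]:has(span:contains("See more"))` (jQuery-style only — `:contains()` is not valid CSS, so `querySelectorAll` rejects it; in the page, filter `div[role="button"]` by text content or use an XPath fallback) | Click before reading body or posts get truncated |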
|
|
45
|
+
|
|
46
|
+
If selectors stop returning results, run the self-inspection block in a later section
|
|
47
|
+
of this file and update this table — that's the workflow, not a fallback.
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
## Scrolling the feed (lazy load)
|
|
51
|
+
|
|
52
|
+
FB virtualizes the feed: scrolled-past posts get unmounted from the DOM. So
|
|
53
|
+
"scroll then collect" misses old posts. Pattern that works: **collect-as-you-go.**
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
seen = {} # post_url -> dict
|
|
57
|
+
TARGET = 50 # how many posts to collect
|
|
58
|
+
MAX_SCROLLS = 30
|
|
59
|
+
|
|
60
|
+
for i in range(MAX_SCROLLS):
|
|
61
|
+
new_posts = js("""
|
|
62
|
+
Array.from(document.querySelectorAll('div[role="article"]')).map(el => {
|
|
63
|
+
const link = el.querySelector('a[href*="/groups/"][href*="/posts/"], a[href*="/groups/"][href*="/permalink/"]');
|
|
64
|
+
const body = el.querySelector('div[data-ad-preview="message"], div[data-ad-comet-preview="message"]');
|
|
65
|
+
const author = el.querySelector('h3 a, h4 a, strong a');
|
|
66
|
+
const time = el.querySelector('abbr, a[role="link"] > span > span');
|
|
67
|
+
const externals = Array.from(el.querySelectorAll('a[href^="https://l.facebook.com/l.php?u="]'))
|
|
68
|
+
.map(a => a.href);
|
|
69
|
+
return {
|
|
70
|
+
url: link?.href || null,
|
|
71
|
+
author: author?.innerText || null,
|
|
72
|
+
time: time?.innerText || null,
|
|
73
|
+
body: body?.innerText?.slice(0, 4000) || null,
|
|
74
|
+
externals: externals,
|
|
75
|
+
};
|
|
76
|
+
}).filter(p => p.url)
|
|
77
|
+
""") or []
|
|
78
|
+
for p in new_posts:
|
|
79
|
+
seen.setdefault(p["url"], p)
|
|
80
|
+
if len(seen) >= TARGET:
|
|
81
|
+
break
|
|
82
|
+
scroll(640, 400, dy=900) # scroll near middle of viewport
|
|
83
|
+
wait(2.5) # FB needs ~2s to render new batch + a little buffer
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
`wait(2.5)` is the floor. Faster than that and you'll see empty post containers
|
|
87
|
+
because React hasn't hydrated them yet.
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
## Decoding the external-URL redirector
|
|
91
|
+
|
|
92
|
+
Every external link gets wrapped in `https://l.facebook.com/l.php?u={URL-encoded real URL}&h=...`.
|
|
93
|
+
You want the real URL, not the redirector.
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from urllib.parse import urlparse, parse_qs, unquote
|
|
97
|
+
def decode_fb_link(href):
|
|
98
|
+
if not href.startswith("https://l.facebook.com/l.php"):
|
|
99
|
+
return href
|
|
100
|
+
q = parse_qs(urlparse(href).query)
|
|
101
|
+
return unquote(q["u"][0]) if "u" in q else href
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
## Handoff to Firecrawl (for the public outbound URLs)
|
|
105
|
+
|
|
106
|
+
Once you have the harvested external list, those URLs are outside FB's walled
|
|
107
|
+
garden — public, scrapable by anything. Firecrawl's schema-native extraction
|
|
108
|
+
shines here because you want typed results across heterogeneous sources.
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
# After the scroll loop:
|
|
112
|
+
external_urls = []
|
|
113
|
+
for p in seen.values():
|
|
114
|
+
for raw in p["externals"]:
|
|
115
|
+
external_urls.append(decode_fb_link(raw))
|
|
116
|
+
external_urls = sorted(set(external_urls))
|
|
117
|
+
print(f"harvested {len(external_urls)} unique external URLs")
|
|
118
|
+
|
|
119
|
+
# Hand off to Firecrawl MCP in the calling conversation:
|
|
120
|
+
# firecrawl_extract(
|
|
121
|
+
# urls=external_urls,
|
|
122
|
+
# prompt="Extract product/listing name, price, location, year, and key features.",
|
|
123
|
+
# schema={...}
|
|
124
|
+
# )
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
When Firecrawl isn't available or the pages are simple, `http_get(url)` from
|
|
128
|
+
Harness itself is fine — it does a plain HTTP fetch without a browser, works
|
|
129
|
+
for static pages, and is the fastest option for bulk.
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
## Rate-limit discipline
|
|
133
|
+
|
|
134
|
+
FB notices automation patterns at the account level, not the IP level. Driving
|
|
135
|
+
a real logged-in session means Jay's account is the one getting rate-limited if
|
|
136
|
+
you get greedy. Keep these floors:
|
|
137
|
+
|
|
138
|
+
- **≥2 seconds between scrolls** in the collect loop (the `wait(2.5)` above)
|
|
139
|
+
- **≥3 seconds between groups** if you're sweeping multiple
|
|
140
|
+
- **No more than ~6 groups per hour** for sustained monitoring
|
|
141
|
+
- **Don't open the same group more than every 15 minutes** — repeated visits
|
|
142
|
+
within a short window trip a heuristic that triggers checkpoints
|
|
143
|
+
|
|
144
|
+
Symptoms of over-pacing: article containers start rendering with empty bodies,
|
|
145
|
+
`/groups/{id}/` redirects to `/checkpoint/`, or the account briefly gets asked
|
|
146
|
+
to re-verify a phone or confirm a login from a new device. If that happens,
|
|
147
|
+
**stop immediately** and let Jay deal with the UI — don't try to auto-resolve.
|
|
148
|
+
|
|
149
|
+
## Self-inspection block (run this when selectors stop working)
|
|
150
|
+
|
|
151
|
+
Paste this into a Harness stdin block to see what anchors currently exist in the
|
|
152
|
+
visible feed. Run it on a group you're a member of.
|
|
153
|
+
|
|
154
|
+
```python
|
|
155
|
+
print(js("""
|
|
156
|
+
({
|
|
157
|
+
articles: document.querySelectorAll('div[role="article"]').length,
|
|
158
|
+
body_preview_a: document.querySelectorAll('div[data-ad-preview="message"]').length,
|
|
159
|
+
body_preview_b: document.querySelectorAll('div[data-ad-comet-preview="message"]').length,
|
|
160
|
+
external_redirectors: document.querySelectorAll('a[href^="https://l.facebook.com/l.php?u="]').length,
|
|
161
|
+
permalink_posts: document.querySelectorAll('a[href*="/groups/"][href*="/posts/"]').length,
|
|
162
|
+
permalink_permalinks: document.querySelectorAll('a[href*="/groups/"][href*="/permalink/"]').length,
|
|
163
|
+
})
|
|
164
|
+
"""))
|
|
165
|
+
# If a count is 0 (for paired selectors: if both variants are 0), the selector drifted. Open DevTools, right-click a visible
|
|
166
|
+
# post, inspect, find the new stable attribute (aria-*, data-*), and update the
|
|
167
|
+
# DOM anchors table above.
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
## Full example — mine one group, emit JSON for downstream tools
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
cd ~/Developer/browser-harness && uv run browser-harness <<'PY'
|
|
175
|
+
import json, sys
|
|
176
|
+
from urllib.parse import urlparse, parse_qs, unquote
|
|
177
|
+
|
|
178
|
+
GROUP = "riceLakeBoating" # slug or numeric id
|
|
179
|
+
TARGET = 50 # how many posts to collect
|
|
180
|
+
MAX_SCROLLS = 30
|
|
181
|
+
|
|
182
|
+
goto_url(f"https://www.facebook.com/groups/{GROUP}/?sorting_setting=CHRONOLOGICAL")
|
|
183
|
+
wait_for_load()
|
|
184
|
+
wait(2)
|
|
185
|
+
|
|
186
|
+
# Abort if FB bounced us
|
|
187
|
+
info = page_info()
|
|
188
|
+
if "/checkpoint/" in info["url"] or "/login" in info["url"]:
|
|
189
|
+
sys.exit("AUTH_WALL — stop and have Jay re-verify the account.")
|
|
190
|
+
|
|
191
|
+
seen = {}
|
|
192
|
+
for _ in range(MAX_SCROLLS):
|
|
193
|
+
batch = js("""
|
|
194
|
+
Array.from(document.querySelectorAll('div[role="article"]')).map(el => {
|
|
195
|
+
const link = el.querySelector('a[href*="/groups/"][href*="/posts/"], a[href*="/groups/"][href*="/permalink/"]');
|
|
196
|
+
const body = el.querySelector('div[data-ad-preview="message"], div[data-ad-comet-preview="message"]');
|
|
197
|
+
const author = el.querySelector('h3 a, h4 a, strong a');
|
|
198
|
+
const time = el.querySelector('abbr, a[role="link"] > span > span');
|
|
199
|
+
const externals = Array.from(el.querySelectorAll('a[href^="https://l.facebook.com/l.php?u="]')).map(a => a.href);
|
|
200
|
+
return { url: link?.href, author: author?.innerText, time: time?.innerText,
|
|
201
|
+
body: body?.innerText?.slice(0, 4000), externals };
|
|
202
|
+
}).filter(p => p.url)
|
|
203
|
+
""") or []
|
|
204
|
+
for p in batch:
|
|
205
|
+
seen.setdefault(p["url"], p)
|
|
206
|
+
if len(seen) >= TARGET:
|
|
207
|
+
break
|
|
208
|
+
scroll(640, 400, dy=900)
|
|
209
|
+
wait(2.5)
|
|
210
|
+
|
|
211
|
+
def decode(u):
|
|
212
|
+
if not u.startswith("https://l.facebook.com/l.php"): return u
|
|
213
|
+
q = parse_qs(urlparse(u).query)
|
|
214
|
+
return unquote(q["u"][0]) if "u" in q else u
|
|
215
|
+
|
|
216
|
+
posts = list(seen.values())
|
|
217
|
+
all_externals = sorted({decode(x) for p in posts for x in p["externals"]})
|
|
218
|
+
capture_screenshot(f"/tmp/fb-group-{GROUP}.png", full=True)
|
|
219
|
+
print(json.dumps({
|
|
220
|
+
"group": GROUP,
|
|
221
|
+
"post_count": len(posts),
|
|
222
|
+
"posts": posts,
|
|
223
|
+
"external_urls": all_externals,
|
|
224
|
+
}, ensure_ascii=False))
|
|
225
|
+
PY
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
The JSON on stdout is the handoff payload — parse it in the calling agent and
|
|
229
|
+
route `external_urls` into `firecrawl_extract` with whatever schema matches the
|
|
230
|
+
downstream task (competitor inventory, pricing intel, boat listings, etc).
|
|
231
|
+
|
|
232
|
+
## Gotchas log (append when you hit something new)
|
|
233
|
+
|
|
234
|
+
- **2026-04-18:** Fresh install verified. People-search URL requires login;
|
|
235
|
+
page search `/search/pages/?q=` works the same way. Groups feed defaults to
|
|
236
|
+
algorithmic sort — always append `?sorting_setting=CHRONOLOGICAL`.
|