@pencil-agent/nano-pencil 2.0.1 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (188) hide show
  1. package/README.md +267 -267
  2. package/dist/build-meta.json +3 -3
  3. package/dist/core/export-html/AGENT.md +11 -11
  4. package/dist/core/export-html/template.css +971 -971
  5. package/dist/core/export-html/template.html +54 -54
  6. package/dist/core/model/custom-providers.js +1 -1
  7. package/dist/core/model-registry.js +5 -5
  8. package/dist/extensions/builtin/AGENT.md +115 -115
  9. package/dist/extensions/builtin/browser/AGENT.md +17 -17
  10. package/dist/extensions/builtin/browser/agent-workspace/agent_helpers.py +12 -12
  11. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/amazon/product-search.md +198 -198
  12. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/archive-org/scraping.md +341 -341
  13. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/arxiv/scraping.md +311 -311
  14. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/arxiv-bulk/scraping.md +333 -333
  15. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/atlas/overview.md +70 -70
  16. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/booking-com/scraping.md +578 -578
  17. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/capterra/scraping.md +440 -440
  18. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/centilebrain/generate-estimates.md +110 -110
  19. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/coingecko/scraping.md +325 -325
  20. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/coinmarketcap/scraping.md +463 -463
  21. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/coursera/scraping.md +360 -360
  22. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/craigslist/scraping.md +390 -390
  23. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/crossref/scraping.md +568 -568
  24. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/dev-to/scraping.md +323 -323
  25. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/duckduckgo/scraping.md +349 -349
  26. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/ebay/scraping.md +435 -435
  27. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/etsy/scraping.md +506 -506
  28. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/eventbrite/scraping.md +363 -363
  29. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/expedia/automation.md +168 -168
  30. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/facebook/groups.md +236 -236
  31. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/facebook/pages.md +295 -295
  32. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/framer/editor.md +108 -108
  33. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/fred/scraping.md +493 -493
  34. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/g2/scraping.md +580 -580
  35. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/genius/scraping.md +511 -511
  36. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/github/repo-actions.md +65 -65
  37. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/github/scraping.md +184 -184
  38. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/glassdoor/scraping.md +543 -543
  39. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/gmail/compose.md +122 -122
  40. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/goodreads/scraping.md +461 -461
  41. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/gutenberg/scraping.md +383 -383
  42. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/hackernews/scraping.md +243 -243
  43. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/howlongtobeat/scraping.md +473 -473
  44. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/imdb/scraping.md +271 -271
  45. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/itch-io/scraping.md +436 -436
  46. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/job-boards/indeed-glassdoor.md +1021 -1021
  47. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/letterboxd/scraping.md +349 -349
  48. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/linkedin/invitation-manager.md +109 -109
  49. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/loom/folder-enumeration.md +170 -170
  50. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/macrotrends/scraping.md +537 -537
  51. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/medium/article-hydration.md +120 -120
  52. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/medium/scraping.md +414 -414
  53. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/metacritic/scraping.md +477 -477
  54. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/musicbrainz/scraping.md +478 -478
  55. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/nasa/scraping.md +339 -339
  56. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/news-aggregation/multi-source.md +205 -205
  57. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/open-library/scraping.md +472 -472
  58. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/openalex/scraping.md +470 -470
  59. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/openstreetmap/scraping.md +490 -490
  60. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/package-registries/npm-pypi.md +478 -478
  61. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/polymarket/scraping.md +234 -234
  62. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/producthunt/scraping.md +307 -307
  63. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/pubmed/scraping.md +421 -421
  64. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/quora/scraping.md +364 -364
  65. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/rawg/scraping.md +352 -352
  66. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/reddit/scraping.md +124 -124
  67. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/rest-countries/scraping.md +233 -233
  68. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/sec-edgar/scraping.md +361 -361
  69. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/README.md +36 -36
  70. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/embedded-apps.md +72 -72
  71. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/knowledge-base.md +109 -109
  72. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/shopify-admin/polaris-inputs.md +137 -137
  73. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/soundcloud/scraping.md +362 -362
  74. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/spotify/scraping.md +339 -339
  75. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/stackoverflow/scraping.md +435 -435
  76. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/steam/scraping.md +575 -575
  77. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/substack/scraping.md +338 -338
  78. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/thetechgeeks/pricing.md +52 -52
  79. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/tiktok/upload.md +107 -107
  80. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/tradingview/scraping.md +309 -309
  81. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/trello/boards-and-lists.md +88 -88
  82. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/trustpilot/scraping.md +375 -375
  83. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/walmart/scraping.md +444 -444
  84. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/wayback-machine/scraping.md +306 -306
  85. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/weather/scraping.md +398 -398
  86. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/wellfound/scraping.md +596 -596
  87. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/world-bank/scraping.md +356 -356
  88. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/xiaohongshu/scraping.md +84 -84
  89. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/youtube/scraping.md +418 -418
  90. package/dist/extensions/builtin/browser/agent-workspace/domain-skills/zillow/scraping.md +433 -433
  91. package/dist/extensions/builtin/browser/browser.md +73 -73
  92. package/dist/extensions/builtin/browser/install.md +142 -142
  93. package/dist/extensions/builtin/browser/interaction-skills/connection.md +48 -48
  94. package/dist/extensions/builtin/browser/interaction-skills/cookies.md +3 -3
  95. package/dist/extensions/builtin/browser/interaction-skills/cross-origin-iframes.md +3 -3
  96. package/dist/extensions/builtin/browser/interaction-skills/dialogs.md +64 -64
  97. package/dist/extensions/builtin/browser/interaction-skills/downloads.md +3 -3
  98. package/dist/extensions/builtin/browser/interaction-skills/drag-and-drop.md +3 -3
  99. package/dist/extensions/builtin/browser/interaction-skills/dropdowns.md +3 -3
  100. package/dist/extensions/builtin/browser/interaction-skills/iframes.md +3 -3
  101. package/dist/extensions/builtin/browser/interaction-skills/network-requests.md +3 -3
  102. package/dist/extensions/builtin/browser/interaction-skills/print-as-pdf.md +3 -3
  103. package/dist/extensions/builtin/browser/interaction-skills/profile-sync.md +90 -90
  104. package/dist/extensions/builtin/browser/interaction-skills/screenshots.md +17 -17
  105. package/dist/extensions/builtin/browser/interaction-skills/scrolling.md +3 -3
  106. package/dist/extensions/builtin/browser/interaction-skills/shadow-dom.md +3 -3
  107. package/dist/extensions/builtin/browser/interaction-skills/tabs.md +69 -69
  108. package/dist/extensions/builtin/browser/interaction-skills/uploads.md +1 -1
  109. package/dist/extensions/builtin/browser/interaction-skills/viewport.md +3 -3
  110. package/dist/extensions/builtin/browser/src/browser_harness/AGENT.md +15 -15
  111. package/dist/extensions/builtin/browser/src/browser_harness/__init__.py +8 -8
  112. package/dist/extensions/builtin/browser/src/browser_harness/_ipc.py +90 -90
  113. package/dist/extensions/builtin/browser/src/browser_harness/admin.py +722 -722
  114. package/dist/extensions/builtin/browser/src/browser_harness/daemon.py +328 -328
  115. package/dist/extensions/builtin/browser/src/browser_harness/helpers.py +396 -396
  116. package/dist/extensions/builtin/browser/src/browser_harness/run.py +103 -103
  117. package/dist/extensions/builtin/debug/index.js +9 -9
  118. package/dist/extensions/builtin/discipline/skills/brainstorming/SKILL.md +33 -33
  119. package/dist/extensions/builtin/discipline/skills/executing-plans/SKILL.md +25 -25
  120. package/dist/extensions/builtin/discipline/skills/finishing-development-branch/SKILL.md +25 -25
  121. package/dist/extensions/builtin/discipline/skills/receiving-code-review/SKILL.md +22 -22
  122. package/dist/extensions/builtin/discipline/skills/requesting-code-review/SKILL.md +31 -31
  123. package/dist/extensions/builtin/discipline/skills/systematic-debugging/SKILL.md +28 -28
  124. package/dist/extensions/builtin/discipline/skills/test-driven-development/SKILL.md +32 -32
  125. package/dist/extensions/builtin/discipline/skills/using-git-worktrees/SKILL.md +25 -25
  126. package/dist/extensions/builtin/discipline/skills/verification-before-completion/SKILL.md +27 -27
  127. package/dist/extensions/builtin/discipline/skills/writing-plans/SKILL.md +26 -26
  128. package/dist/extensions/builtin/goal/README.md +67 -67
  129. package/dist/extensions/builtin/goal/index.js +6 -6
  130. package/dist/extensions/builtin/grub/README.md +112 -112
  131. package/dist/extensions/builtin/link-world/agent-workspace/README.md +16 -16
  132. package/dist/extensions/builtin/link-world/internet-search/internet-search.md +65 -65
  133. package/dist/extensions/builtin/link-world/link-world-agent.md +82 -82
  134. package/dist/extensions/builtin/link-world/linkworld.md +313 -313
  135. package/dist/extensions/builtin/link-world/network-routing/network-routing.md +67 -67
  136. package/dist/extensions/builtin/loop/README.md +92 -92
  137. package/dist/extensions/builtin/mcp/figma-design.md +68 -68
  138. package/dist/extensions/builtin/mcp/mcp-management.md +85 -85
  139. package/dist/extensions/builtin/recap/AGENT.md +15 -15
  140. package/dist/extensions/builtin/sal/README.md +72 -72
  141. package/dist/extensions/builtin/security-audit/README.md +289 -289
  142. package/dist/extensions/builtin/team/AGENT.md +112 -112
  143. package/dist/extensions/builtin/team/TESTING.md +299 -299
  144. package/dist/extensions/builtin/token-save/README.md +56 -56
  145. package/dist/extensions/optional/AGENT.md +10 -10
  146. package/dist/modes/interactive/controllers/input-submit-controller.js +2 -2
  147. package/dist/modes/interactive/controllers/stream-render-controller.js +2 -2
  148. package/dist/modes/interactive/interactive-mode.js +19 -19
  149. package/dist/modes/interactive/theme/dark.json +85 -85
  150. package/dist/modes/interactive/theme/light.json +84 -84
  151. package/dist/modes/interactive/theme/theme-schema.json +335 -335
  152. package/dist/modes/interactive/theme/warm.json +81 -81
  153. package/dist/node_modules/@pencil-agent/ai/dist/cli.js +0 -0
  154. package/dist/node_modules/@pencil-agent/ai/dist/models.generated.js +1 -1
  155. package/docs/ACP/345/215/217/350/256/256/351/233/206/346/210/220/345/274/200/345/217/221/346/226/207/346/241/243.md +851 -0
  156. package/docs/SDK-TESTING.md +364 -0
  157. package/docs/codex-goal-command-impl.md +1055 -1055
  158. package/docs/codex-goal-vs-grub.md +500 -500
  159. package/docs/custom-provider.md +27 -27
  160. package/docs/extensions.md +27 -27
  161. package/docs/keybindings.md +27 -27
  162. package/docs/loop 重构完成总结.md +250 -250
  163. package/docs/loop 重构完成报告.md +122 -122
  164. package/docs/loop 重构方案.md +1222 -1222
  165. package/docs/loop 重构方案实现报告.md +158 -158
  166. package/docs/loop 重构方案对比分析.md +128 -128
  167. package/docs/loop 重构计划.md +320 -320
  168. package/docs/loop-usage-examples.md +214 -214
  169. package/docs/mem-core/346/212/200/346/234/257/346/226/207/346/241/243.md +593 -0
  170. package/docs/models.md +27 -27
  171. package/docs/packages.md +27 -27
  172. package/docs/pi-design-philosophy.md +457 -457
  173. package/docs/planmode.md +1987 -1987
  174. package/docs/prompt-templates.md +27 -27
  175. package/docs/providers.md +27 -27
  176. package/docs/sdk.md +27 -27
  177. package/docs/skills.md +27 -27
  178. package/docs/startup-performance-optimization.md +301 -0
  179. package/docs/themes.md +27 -27
  180. package/docs/tui.md +27 -27
  181. package/docs/认知地图.md +47 -0
  182. package/package.json +190 -190
  183. package/docs/cc-agent-design.md +0 -1297
  184. package/docs/cc-tui-design.md +0 -1333
  185. package/docs/nanoPencil-学习计划.md +0 -170
  186. package/docs/scan-report.md +0 -3820
  187. package/docs//345/257/271/346/240/207Claude-Code.md +0 -1775
  188. package/docs//351/230/277/351/207/214/345/267/264/345/267/264/350/264/242/346/212/245/345/210/206/346/236/220/344/271/246.md +0 -261
@@ -1,236 +1,236 @@
1
- # Facebook Groups — mining feeds for posts + external URLs
2
-
3
- Field-tested against a logged-in Jay account on 2026-04-18.
4
- **Requires:** Browser Harness driving a real Chrome that is (a) signed into
5
- Facebook and (b) already a member of the target group. Non-member or logged-out
6
- views serve a stripped landing page with no post content.
7
-
8
- ## What this skill is for
9
-
10
- 1. Pull the N most recent posts from a named FB group
11
- 2. Harvest every external URL that members have shared
12
- 3. Hand that URL list to Firecrawl (or `http_get`) for structured scraping at scale
13
- 4. Cache post text + author + timestamp for downstream keyword matching
14
-
15
- It is NOT for: replying in groups, DMing members, or any write action.
16
-
17
- ## URL patterns
18
-
19
- | What | URL |
20
- |------|-----|
21
- | Group main feed | `https://www.facebook.com/groups/{id_or_slug}` |
22
- | Group "Discussion" tab (canonical feed) | `https://www.facebook.com/groups/{id_or_slug}/?sorting_setting=CHRONOLOGICAL` |
23
- | Single post (permalink) | `https://www.facebook.com/groups/{id_or_slug}/posts/{post_id}/` |
24
- | User's joined-groups feed | `https://www.facebook.com/groups/feed/` |
25
- | List of YOUR groups | `https://www.facebook.com/groups/joins/` |
26
-
27
- The `?sorting_setting=CHRONOLOGICAL` flag matters — without it, FB inserts an
28
- algorithmic ranking that hides older posts and shows the same handful of "popular"
29
- items every visit, which kills monitoring use cases.
30
-
31
- ## DOM anchors (verified 2026-04-18)
32
-
33
- FB rewrites class names every few weeks but ARIA roles and stable URL patterns
34
- hold up well. Anchor on those, not on hashed CSS classes.
35
-
36
- | Anchor | Selector | Notes |
37
- |--------|----------|-------|
38
- | Each post container | `div[role="article"]` | Stable. One per visible post. |
39
- | Post permalink | `a[href*="/groups/"][href*="/posts/"], a[href*="/groups/"][href*="/permalink/"]` | First match per article = the post link |
40
- | Post body text | `div[data-ad-preview="message"], div[data-ad-comet-preview="message"]` | One of these is the visible body |
41
- | Post author | `h3 a, h4 a` (first inside the article) | Falls back to `strong a` |
42
- | Post timestamp | `a[href*="/posts/"] abbr, a[role="link"] > span > span` (relative time text) | Hover gets the absolute time but the relative string is fine for sorting |
43
- | External link (FB redirector) | `a[href^="https://l.facebook.com/l.php?u="]` | Decode the `u=` param to get the real URL |
44
- | "See more" button on long posts | `div[role="button"]:has(span:contains("See more"))` (use XPath fallback if `:has` is unsupported) | Click before reading body or posts get truncated |
45
-
46
- If selectors stop returning results, run the self-inspection block at the bottom
47
- of this file and update this table — that's the workflow, not a fallback.
48
-
49
-
50
- ## Scrolling the feed (lazy load)
51
-
52
- FB virtualizes the feed: scrolled-past posts get unmounted from the DOM. So
53
- "scroll then collect" misses old posts. Pattern that works: **collect-as-you-go.**
54
-
55
- ```python
56
- seen = {} # post_url -> dict
57
- TARGET = 50 # how many posts to collect
58
- MAX_SCROLLS = 30
59
-
60
- for i in range(MAX_SCROLLS):
61
- new_posts = js("""
62
- Array.from(document.querySelectorAll('div[role="article"]')).map(el => {
63
- const link = el.querySelector('a[href*="/groups/"][href*="/posts/"], a[href*="/groups/"][href*="/permalink/"]');
64
- const body = el.querySelector('div[data-ad-preview="message"], div[data-ad-comet-preview="message"]');
65
- const author = el.querySelector('h3 a, h4 a, strong a');
66
- const time = el.querySelector('abbr, a[role="link"] > span > span');
67
- const externals = Array.from(el.querySelectorAll('a[href^="https://l.facebook.com/l.php?u="]'))
68
- .map(a => a.href);
69
- return {
70
- url: link?.href || null,
71
- author: author?.innerText || null,
72
- time: time?.innerText || null,
73
- body: body?.innerText?.slice(0, 4000) || null,
74
- externals: externals,
75
- };
76
- }).filter(p => p.url)
77
- """) or []
78
- for p in new_posts:
79
- seen.setdefault(p["url"], p)
80
- if len(seen) >= TARGET:
81
- break
82
- scroll(640, 400, dy=900) # scroll near middle of viewport
83
- wait(2.5) # FB needs ~2s to render new batch + a little buffer
84
- ```
85
-
86
- `wait(2.5)` is the floor. Faster than that and you'll see empty post containers
87
- because React hasn't hydrated them yet.
88
-
89
-
90
- ## Decoding the external-URL redirector
91
-
92
- Every external link gets wrapped in `https://l.facebook.com/l.php?u={URL-encoded real URL}&h=...`.
93
- You want the real URL, not the redirector.
94
-
95
- ```python
96
- from urllib.parse import urlparse, parse_qs, unquote
97
- def decode_fb_link(href):
98
- if not href.startswith("https://l.facebook.com/l.php"):
99
- return href
100
- q = parse_qs(urlparse(href).query)
101
- return unquote(q["u"][0]) if "u" in q else href
102
- ```
103
-
104
- ## Handoff to Firecrawl (for the public outbound URLs)
105
-
106
- Once you have the harvested external list, those URLs are outside FB's walled
107
- garden — public, scrapable by anything. Firecrawl's schema-native extraction
108
- shines here because you want typed results across heterogeneous sources.
109
-
110
- ```python
111
- # After the scroll loop:
112
- external_urls = []
113
- for p in seen.values():
114
- for raw in p["externals"]:
115
- external_urls.append(decode_fb_link(raw))
116
- external_urls = sorted(set(external_urls))
117
- print(f"harvested {len(external_urls)} unique external URLs")
118
-
119
- # Hand off to Firecrawl MCP in the calling conversation:
120
- # firecrawl_extract(
121
- # urls=external_urls,
122
- # prompt="Extract product/listing name, price, location, year, and key features.",
123
- # schema={...}
124
- # )
125
- ```
126
-
127
- When Firecrawl isn't available or the pages are simple, `http_get(url)` from
128
- Harness itself is fine — it does a plain HTTP fetch without a browser, works
129
- for static pages, and is the fastest option for bulk.
130
-
131
-
132
- ## Rate-limit discipline
133
-
134
- FB notices automation patterns at the account level, not the IP level. Driving
135
- a real logged-in session means Jay's account is the one getting rate-limited if
136
- you get greedy. Keep these floors:
137
-
138
- - **≥2 seconds between scrolls** in the collect loop (the `wait(2.5)` above)
139
- - **≥3 seconds between groups** if you're sweeping multiple
140
- - **No more than ~6 groups per hour** for sustained monitoring
141
- - **Don't open the same group more than once every 15 minutes** — repeated visits
142
- within a short window are a heuristic that triggers checkpoints
143
-
144
- Symptoms of over-pacing: article containers start rendering with empty bodies,
145
- `/groups/{id}/` redirects to `/checkpoint/`, or the account briefly gets asked
146
- to re-verify a phone or confirm a login from a new device. If that happens,
147
- **stop immediately** and let Jay deal with the UI — don't try to auto-resolve.
148
-
149
- ## Self-inspection block (run this when selectors stop working)
150
-
151
- Paste this into a Harness stdin block to see what anchors currently exist in the
152
- visible feed. Run it on a group you're a member of.
153
-
154
- ```python
155
- print(js("""
156
- ({
157
- articles: document.querySelectorAll('div[role="article"]').length,
158
- body_preview_a: document.querySelectorAll('div[data-ad-preview="message"]').length,
159
- body_preview_b: document.querySelectorAll('div[data-ad-comet-preview="message"]').length,
160
- external_redirectors: document.querySelectorAll('a[href^="https://l.facebook.com/l.php?u="]').length,
161
- permalink_posts: document.querySelectorAll('a[href*="/groups/"][href*="/posts/"]').length,
162
- permalink_permalinks: document.querySelectorAll('a[href*="/groups/"][href*="/permalink/"]').length,
163
- })
164
- """))
165
- # If any count is 0, the selector drifted. Open DevTools, right-click a visible
166
- # post, inspect, find the new stable attribute (aria-*, data-*), and update the
167
- # DOM anchors table above.
168
- ```
169
-
170
-
171
- ## Full example — mine one group, emit JSON for downstream tools
172
-
173
- ```bash
174
- cd ~/Developer/browser-harness && uv run browser-harness <<'PY'
175
- import json, sys
176
- from urllib.parse import urlparse, parse_qs, unquote
177
-
178
- GROUP = "riceLakeBoating" # slug or numeric id
179
- TARGET = 50 # how many posts to collect
180
- MAX_SCROLLS = 30
181
-
182
- goto_url(f"https://www.facebook.com/groups/{GROUP}/?sorting_setting=CHRONOLOGICAL")
183
- wait_for_load()
184
- wait(2)
185
-
186
- # Abort if FB bounced us
187
- info = page_info()
188
- if "/checkpoint/" in info["url"] or "/login" in info["url"]:
189
- sys.exit("AUTH_WALL — stop and have Jay re-verify the account.")
190
-
191
- seen = {}
192
- for _ in range(MAX_SCROLLS):
193
- batch = js("""
194
- Array.from(document.querySelectorAll('div[role="article"]')).map(el => {
195
- const link = el.querySelector('a[href*="/groups/"][href*="/posts/"], a[href*="/groups/"][href*="/permalink/"]');
196
- const body = el.querySelector('div[data-ad-preview="message"], div[data-ad-comet-preview="message"]');
197
- const author = el.querySelector('h3 a, h4 a, strong a');
198
- const time = el.querySelector('abbr, a[role="link"] > span > span');
199
- const externals = Array.from(el.querySelectorAll('a[href^="https://l.facebook.com/l.php?u="]')).map(a => a.href);
200
- return { url: link?.href, author: author?.innerText, time: time?.innerText,
201
- body: body?.innerText?.slice(0, 4000), externals };
202
- }).filter(p => p.url)
203
- """) or []
204
- for p in batch:
205
- seen.setdefault(p["url"], p)
206
- if len(seen) >= TARGET:
207
- break
208
- scroll(640, 400, dy=900)
209
- wait(2.5)
210
-
211
- def decode(u):
212
- if not u.startswith("https://l.facebook.com/l.php"): return u
213
- q = parse_qs(urlparse(u).query)
214
- return unquote(q["u"][0]) if "u" in q else u
215
-
216
- posts = list(seen.values())
217
- all_externals = sorted({decode(x) for p in posts for x in p["externals"]})
218
- capture_screenshot(f"/tmp/fb-group-{GROUP}.png", full=True)
219
- print(json.dumps({
220
- "group": GROUP,
221
- "post_count": len(posts),
222
- "posts": posts,
223
- "external_urls": all_externals,
224
- }, ensure_ascii=False))
225
- PY
226
- ```
227
-
228
- The JSON on stdout is the handoff payload — parse it in the calling agent and
229
- route `external_urls` into `firecrawl_extract` with whatever schema matches the
230
- downstream task (competitor inventory, pricing intel, boat listings, etc).
231
-
232
- ## Gotchas log (append when you hit something new)
233
-
234
- - **2026-04-18:** Fresh install verified. People-search URL requires login;
235
- page search `/search/pages/?q=` works the same way. Groups feed defaults to
236
- algorithmic sort — always append `?sorting_setting=CHRONOLOGICAL`.
1
+ # Facebook Groups — mining feeds for posts + external URLs
2
+
3
+ Field-tested against a logged-in Jay account on 2026-04-18.
4
+ **Requires:** Browser Harness driving a real Chrome that is (a) signed into
5
+ Facebook and (b) already a member of the target group. Non-member or logged-out
6
+ views serve a stripped landing page with no post content.
7
+
8
+ ## What this skill is for
9
+
10
+ 1. Pull the N most recent posts from a named FB group
11
+ 2. Harvest every external URL that members have shared
12
+ 3. Hand that URL list to Firecrawl (or `http_get`) for structured scraping at scale
13
+ 4. Cache post text + author + timestamp for downstream keyword matching
14
+
15
+ It is NOT for: replying in groups, DMing members, or any write action.
16
+
17
+ ## URL patterns
18
+
19
+ | What | URL |
20
+ |------|-----|
21
+ | Group main feed | `https://www.facebook.com/groups/{id_or_slug}` |
22
+ | Group "Discussion" tab (canonical feed) | `https://www.facebook.com/groups/{id_or_slug}/?sorting_setting=CHRONOLOGICAL` |
23
+ | Single post (permalink) | `https://www.facebook.com/groups/{id_or_slug}/posts/{post_id}/` |
24
+ | User's joined-groups feed | `https://www.facebook.com/groups/feed/` |
25
+ | List of YOUR groups | `https://www.facebook.com/groups/joins/` |
26
+
27
+ The `?sorting_setting=CHRONOLOGICAL` flag matters — without it, FB inserts an
28
+ algorithmic ranking that hides older posts and shows the same handful of "popular"
29
+ items every visit, which kills monitoring use cases.
30
+
31
+ ## DOM anchors (verified 2026-04-18)
32
+
33
+ FB rewrites class names every few weeks but ARIA roles and stable URL patterns
34
+ hold up well. Anchor on those, not on hashed CSS classes.
35
+
36
+ | Anchor | Selector | Notes |
37
+ |--------|----------|-------|
38
+ | Each post container | `div[role="article"]` | Stable. One per visible post. |
39
+ | Post permalink | `a[href*="/groups/"][href*="/posts/"], a[href*="/groups/"][href*="/permalink/"]` | First match per article = the post link |
40
+ | Post body text | `div[data-ad-preview="message"], div[data-ad-comet-preview="message"]` | One of these is the visible body |
41
+ | Post author | `h3 a, h4 a` (first inside the article) | Falls back to `strong a` |
42
+ | Post timestamp | `a[href*="/posts/"] abbr, a[role="link"] > span > span` (relative time text) | Hover gets the absolute time but the relative string is fine for sorting |
43
+ | External link (FB redirector) | `a[href^="https://l.facebook.com/l.php?u="]` | Decode the `u=` param to get the real URL |
44
+ | "See more" button on long posts | `div[role="button"]:has(span:contains("See more"))` (use XPath fallback if `:has` is unsupported) | Click before reading body or posts get truncated |
45
+
46
+ If selectors stop returning results, run the self-inspection block at the bottom
47
+ of this file and update this table — that's the workflow, not a fallback.
48
+
49
+
50
+ ## Scrolling the feed (lazy load)
51
+
52
+ FB virtualizes the feed: scrolled-past posts get unmounted from the DOM. So
53
+ "scroll then collect" misses old posts. Pattern that works: **collect-as-you-go.**
54
+
55
+ ```python
56
+ seen = {} # post_url -> dict
57
+ TARGET = 50 # how many posts to collect
58
+ MAX_SCROLLS = 30
59
+
60
+ for i in range(MAX_SCROLLS):
61
+ new_posts = js("""
62
+ Array.from(document.querySelectorAll('div[role="article"]')).map(el => {
63
+ const link = el.querySelector('a[href*="/groups/"][href*="/posts/"], a[href*="/groups/"][href*="/permalink/"]');
64
+ const body = el.querySelector('div[data-ad-preview="message"], div[data-ad-comet-preview="message"]');
65
+ const author = el.querySelector('h3 a, h4 a, strong a');
66
+ const time = el.querySelector('abbr, a[role="link"] > span > span');
67
+ const externals = Array.from(el.querySelectorAll('a[href^="https://l.facebook.com/l.php?u="]'))
68
+ .map(a => a.href);
69
+ return {
70
+ url: link?.href || null,
71
+ author: author?.innerText || null,
72
+ time: time?.innerText || null,
73
+ body: body?.innerText?.slice(0, 4000) || null,
74
+ externals: externals,
75
+ };
76
+ }).filter(p => p.url)
77
+ """) or []
78
+ for p in new_posts:
79
+ seen.setdefault(p["url"], p)
80
+ if len(seen) >= TARGET:
81
+ break
82
+ scroll(640, 400, dy=900) # scroll near middle of viewport
83
+ wait(2.5) # FB needs ~2s to render new batch + a little buffer
84
+ ```
85
+
86
+ `wait(2.5)` is the floor. Faster than that and you'll see empty post containers
87
+ because React hasn't hydrated them yet.
88
+
89
+
90
+ ## Decoding the external-URL redirector
91
+
92
+ Every external link gets wrapped in `https://l.facebook.com/l.php?u={URL-encoded real URL}&h=...`.
93
+ You want the real URL, not the redirector.
94
+
95
+ ```python
96
+ from urllib.parse import urlparse, parse_qs, unquote
97
def decode_fb_link(href):
    """Return the real destination of a Facebook outbound-link redirector URL.

    Args:
        href: Absolute URL string taken from an anchor's ``href``.

    Returns:
        The value of the ``u`` query parameter when ``href`` is an
        ``https://l.facebook.com/l.php`` redirector that carries one;
        otherwise ``href`` unchanged.
    """
    if not href.startswith("https://l.facebook.com/l.php"):
        return href
    params = parse_qs(urlparse(href).query)
    # parse_qs already percent-decodes values; unquoting a second time would
    # corrupt escapes that belong to the target URL itself (e.g. "%20", "%26").
    return params["u"][0] if "u" in params else href
102
+ ```
103
+
104
+ ## Handoff to Firecrawl (for the public outbound URLs)
105
+
106
+ Once you have the harvested external list, those URLs are outside FB's walled
107
+ garden — public, scrapable by anything. Firecrawl's schema-native extraction
108
+ shines here because you want typed results across heterogeneous sources.
109
+
110
+ ```python
111
+ # After the scroll loop:
112
+ external_urls = []
113
+ for p in seen.values():
114
+ for raw in p["externals"]:
115
+ external_urls.append(decode_fb_link(raw))
116
+ external_urls = sorted(set(external_urls))
117
+ print(f"harvested {len(external_urls)} unique external URLs")
118
+
119
+ # Hand off to Firecrawl MCP in the calling conversation:
120
+ # firecrawl_extract(
121
+ # urls=external_urls,
122
+ # prompt="Extract product/listing name, price, location, year, and key features.",
123
+ # schema={...}
124
+ # )
125
+ ```
126
+
127
+ When Firecrawl isn't available or the pages are simple, `http_get(url)` from
128
+ Harness itself is fine — it does a plain HTTP fetch without a browser, works
129
+ for static pages, and is the fastest option for bulk.
130
+
131
+
132
+ ## Rate-limit discipline
133
+
134
+ FB notices automation patterns at the account level, not the IP level. Driving
135
+ a real logged-in session means Jay's account is the one getting rate-limited if
136
+ you get greedy. Keep these floors:
137
+
138
+ - **≥2 seconds between scrolls** in the collect loop (the `wait(2.5)` above)
139
+ - **≥3 seconds between groups** if you're sweeping multiple
140
+ - **No more than ~6 groups per hour** for sustained monitoring
141
+ - **Don't open the same group more than once every 15 minutes** — repeated visits
142
+ within a short window are a heuristic that triggers checkpoints
143
+
144
+ Symptoms of over-pacing: article containers start rendering with empty bodies,
145
+ `/groups/{id}/` redirects to `/checkpoint/`, or the account briefly gets asked
146
+ to re-verify a phone or confirm a login from a new device. If that happens,
147
+ **stop immediately** and let Jay deal with the UI — don't try to auto-resolve.
148
+
149
+ ## Self-inspection block (run this when selectors stop working)
150
+
151
+ Paste this into a Harness stdin block to see what anchors currently exist in the
152
+ visible feed. Run it on a group you're a member of.
153
+
154
+ ```python
155
+ print(js("""
156
+ ({
157
+ articles: document.querySelectorAll('div[role="article"]').length,
158
+ body_preview_a: document.querySelectorAll('div[data-ad-preview="message"]').length,
159
+ body_preview_b: document.querySelectorAll('div[data-ad-comet-preview="message"]').length,
160
+ external_redirectors: document.querySelectorAll('a[href^="https://l.facebook.com/l.php?u="]').length,
161
+ permalink_posts: document.querySelectorAll('a[href*="/groups/"][href*="/posts/"]').length,
162
+ permalink_permalinks: document.querySelectorAll('a[href*="/groups/"][href*="/permalink/"]').length,
163
+ })
164
+ """))
165
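+ # If a count is 0, the selector likely drifted (body_preview_a/b and the two
166
+ # permalink_* counts are alternative pairs, so only both being 0 matters). Open DevTools,
167
+ # right-click a visible post, inspect, find the new stable attribute (aria-*, data-*), and update the DOM anchors table above.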
168
+ ```
169
+
170
+
171
+ ## Full example — mine one group, emit JSON for downstream tools
172
+
173
+ ```bash
174
+ cd ~/Developer/browser-harness && uv run browser-harness <<'PY'
175
+ import json, sys
176
+ from urllib.parse import urlparse, parse_qs, unquote
177
+
178
+ GROUP = "riceLakeBoating" # slug or numeric id
179
+ TARGET = 50 # how many posts to collect
180
+ MAX_SCROLLS = 30
181
+
182
+ goto_url(f"https://www.facebook.com/groups/{GROUP}/?sorting_setting=CHRONOLOGICAL")
183
+ wait_for_load()
184
+ wait(2)
185
+
186
+ # Abort if FB bounced us
187
+ info = page_info()
188
+ if "/checkpoint/" in info["url"] or "/login" in info["url"]:
189
+ sys.exit("AUTH_WALL — stop and have Jay re-verify the account.")
190
+
191
+ seen = {}
192
+ for _ in range(MAX_SCROLLS):
193
+ batch = js("""
194
+ Array.from(document.querySelectorAll('div[role="article"]')).map(el => {
195
+ const link = el.querySelector('a[href*="/groups/"][href*="/posts/"], a[href*="/groups/"][href*="/permalink/"]');
196
+ const body = el.querySelector('div[data-ad-preview="message"], div[data-ad-comet-preview="message"]');
197
+ const author = el.querySelector('h3 a, h4 a, strong a');
198
+ const time = el.querySelector('abbr, a[role="link"] > span > span');
199
+ const externals = Array.from(el.querySelectorAll('a[href^="https://l.facebook.com/l.php?u="]')).map(a => a.href);
200
+ return { url: link?.href, author: author?.innerText, time: time?.innerText,
201
+ body: body?.innerText?.slice(0, 4000), externals };
202
+ }).filter(p => p.url)
203
+ """) or []
204
+ for p in batch:
205
+ seen.setdefault(p["url"], p)
206
+ if len(seen) >= TARGET:
207
+ break
208
+ scroll(640, 400, dy=900)
209
+ wait(2.5)
210
+
211
def decode(u):
    """Unwrap a Facebook ``l.php`` redirector URL to its real destination.

    Returns the ``u`` query parameter when ``u`` (the argument) is an
    ``https://l.facebook.com/l.php`` link that carries one; any other URL is
    returned unchanged.
    """
    if not u.startswith("https://l.facebook.com/l.php"):
        return u
    q = parse_qs(urlparse(u).query)
    # parse_qs has already percent-decoded the value; a second unquote() would
    # mangle escapes that are part of the destination URL (e.g. "%20", "%26").
    return q["u"][0] if "u" in q else u
215
+
216
+ posts = list(seen.values())
217
+ all_externals = sorted({decode(x) for p in posts for x in p["externals"]})
218
+ capture_screenshot(f"/tmp/fb-group-{GROUP}.png", full=True)
219
+ print(json.dumps({
220
+ "group": GROUP,
221
+ "post_count": len(posts),
222
+ "posts": posts,
223
+ "external_urls": all_externals,
224
+ }, ensure_ascii=False))
225
+ PY
226
+ ```
227
+
228
+ The JSON on stdout is the handoff payload — parse it in the calling agent and
229
+ route `external_urls` into `firecrawl_extract` with whatever schema matches the
230
+ downstream task (competitor inventory, pricing intel, boat listings, etc).
231
+
232
+ ## Gotchas log (append when you hit something new)
233
+
234
+ - **2026-04-18:** Fresh install verified. People-search URL requires login;
235
+ page search `/search/pages/?q=` works the same way. Groups feed defaults to
236
+ algorithmic sort — always append `?sorting_setting=CHRONOLOGICAL`.