@lobu/cli 6.0.1 → 7.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (265)
  1. package/README.md +20 -27
  2. package/dist/bundled-skills/lobu/SKILL.md +11 -11
  3. package/dist/commands/_lib/apply/apply-cmd.d.ts +38 -0
  4. package/dist/commands/_lib/apply/apply-cmd.d.ts.map +1 -1
  5. package/dist/commands/_lib/apply/apply-cmd.js +574 -40
  6. package/dist/commands/_lib/apply/apply-cmd.js.map +1 -1
  7. package/dist/commands/_lib/apply/client.d.ts +180 -1
  8. package/dist/commands/_lib/apply/client.d.ts.map +1 -1
  9. package/dist/commands/_lib/apply/client.js +308 -28
  10. package/dist/commands/_lib/apply/client.js.map +1 -1
  11. package/dist/commands/_lib/apply/desired-state.d.ts +134 -3
  12. package/dist/commands/_lib/apply/desired-state.d.ts.map +1 -1
  13. package/dist/commands/_lib/apply/desired-state.js +703 -89
  14. package/dist/commands/_lib/apply/desired-state.js.map +1 -1
  15. package/dist/commands/_lib/apply/diff.d.ts +61 -3
  16. package/dist/commands/_lib/apply/diff.d.ts.map +1 -1
  17. package/dist/commands/_lib/apply/diff.js +382 -92
  18. package/dist/commands/_lib/apply/diff.js.map +1 -1
  19. package/dist/commands/_lib/apply/prompt.d.ts +6 -0
  20. package/dist/commands/_lib/apply/prompt.d.ts.map +1 -1
  21. package/dist/commands/_lib/apply/prompt.js +16 -0
  22. package/dist/commands/_lib/apply/prompt.js.map +1 -1
  23. package/dist/commands/_lib/apply/render.d.ts +9 -0
  24. package/dist/commands/_lib/apply/render.d.ts.map +1 -1
  25. package/dist/commands/_lib/apply/render.js +80 -3
  26. package/dist/commands/_lib/apply/render.js.map +1 -1
  27. package/dist/commands/agent.d.ts +7 -0
  28. package/dist/commands/agent.d.ts.map +1 -1
  29. package/dist/commands/agent.js +65 -1
  30. package/dist/commands/agent.js.map +1 -1
  31. package/dist/commands/chat.d.ts +12 -9
  32. package/dist/commands/chat.d.ts.map +1 -1
  33. package/dist/commands/chat.js +125 -57
  34. package/dist/commands/chat.js.map +1 -1
  35. package/dist/commands/dev.d.ts +23 -7
  36. package/dist/commands/dev.d.ts.map +1 -1
  37. package/dist/commands/dev.js +197 -49
  38. package/dist/commands/dev.js.map +1 -1
  39. package/dist/commands/doctor.d.ts +1 -0
  40. package/dist/commands/doctor.d.ts.map +1 -1
  41. package/dist/commands/doctor.js +136 -0
  42. package/dist/commands/doctor.js.map +1 -1
  43. package/dist/commands/eval.d.ts +8 -0
  44. package/dist/commands/eval.d.ts.map +1 -1
  45. package/dist/commands/eval.js +72 -6
  46. package/dist/commands/eval.js.map +1 -1
  47. package/dist/commands/init.d.ts +22 -5
  48. package/dist/commands/init.d.ts.map +1 -1
  49. package/dist/commands/init.js +355 -182
  50. package/dist/commands/init.js.map +1 -1
  51. package/dist/commands/link.d.ts +11 -0
  52. package/dist/commands/link.d.ts.map +1 -0
  53. package/dist/commands/link.js +28 -0
  54. package/dist/commands/link.js.map +1 -0
  55. package/dist/commands/login.d.ts.map +1 -1
  56. package/dist/commands/login.js +14 -2
  57. package/dist/commands/login.js.map +1 -1
  58. package/dist/commands/memory/_lib/browser-auth-cmd.d.ts.map +1 -1
  59. package/dist/commands/memory/_lib/browser-auth-cmd.js +3 -3
  60. package/dist/commands/memory/_lib/browser-auth-cmd.js.map +1 -1
  61. package/dist/commands/memory/_lib/mcp.d.ts +2 -2
  62. package/dist/commands/memory/_lib/mcp.d.ts.map +1 -1
  63. package/dist/commands/memory/_lib/mcp.js +24 -12
  64. package/dist/commands/memory/_lib/mcp.js.map +1 -1
  65. package/dist/commands/memory/_lib/openclaw-auth.d.ts +1 -0
  66. package/dist/commands/memory/_lib/openclaw-auth.d.ts.map +1 -1
  67. package/dist/commands/memory/_lib/openclaw-auth.js +14 -3
  68. package/dist/commands/memory/_lib/openclaw-auth.js.map +1 -1
  69. package/dist/commands/memory/_lib/openclaw-cmd.js +1 -1
  70. package/dist/commands/memory/_lib/openclaw-cmd.js.map +1 -1
  71. package/dist/commands/memory/_lib/schema.d.ts +29 -2
  72. package/dist/commands/memory/_lib/schema.d.ts.map +1 -1
  73. package/dist/commands/memory/_lib/schema.js +121 -5
  74. package/dist/commands/memory/_lib/schema.js.map +1 -1
  75. package/dist/commands/memory/_lib/seed-cmd.d.ts.map +1 -1
  76. package/dist/commands/memory/_lib/seed-cmd.js +46 -24
  77. package/dist/commands/memory/_lib/seed-cmd.js.map +1 -1
  78. package/dist/commands/memory/run.d.ts.map +1 -1
  79. package/dist/commands/memory/run.js +2 -2
  80. package/dist/commands/memory/run.js.map +1 -1
  81. package/dist/commands/org.d.ts +4 -0
  82. package/dist/commands/org.d.ts.map +1 -1
  83. package/dist/commands/org.js +10 -0
  84. package/dist/commands/org.js.map +1 -1
  85. package/dist/commands/platforms/platform-prompts.d.ts +0 -1
  86. package/dist/commands/platforms/platform-prompts.d.ts.map +1 -1
  87. package/dist/commands/platforms/platform-prompts.js +54 -8
  88. package/dist/commands/platforms/platform-prompts.js.map +1 -1
  89. package/dist/commands/telemetry.d.ts +10 -0
  90. package/dist/commands/telemetry.d.ts.map +1 -0
  91. package/dist/commands/telemetry.js +68 -0
  92. package/dist/commands/telemetry.js.map +1 -0
  93. package/dist/commands/token.d.ts +9 -0
  94. package/dist/commands/token.d.ts.map +1 -1
  95. package/dist/commands/token.js +54 -0
  96. package/dist/commands/token.js.map +1 -1
  97. package/dist/commands/whoami.d.ts.map +1 -1
  98. package/dist/commands/whoami.js +1 -1
  99. package/dist/commands/whoami.js.map +1 -1
  100. package/dist/connectors/README.md +534 -0
  101. package/dist/connectors/__tests__/browser-scraper-utils.test.ts +186 -0
  102. package/dist/connectors/apple_health.ts +138 -0
  103. package/dist/connectors/apple_screen_time.ts +82 -0
  104. package/dist/connectors/browser-scraper-utils.ts +246 -0
  105. package/dist/connectors/capterra.ts +277 -0
  106. package/dist/connectors/g2.ts +290 -0
  107. package/dist/connectors/github.ts +1530 -0
  108. package/dist/connectors/glassdoor.ts +295 -0
  109. package/dist/connectors/gmaps.ts +197 -0
  110. package/dist/connectors/google_calendar.ts +641 -0
  111. package/dist/connectors/google_gmail.ts +754 -0
  112. package/dist/connectors/google_photos.ts +776 -0
  113. package/dist/connectors/google_play.ts +349 -0
  114. package/dist/connectors/hackernews.ts +471 -0
  115. package/dist/connectors/index.ts +28 -0
  116. package/dist/connectors/ios_appstore.ts +226 -0
  117. package/dist/connectors/linkedin.ts +494 -0
  118. package/dist/connectors/local_directory.ts +91 -0
  119. package/dist/connectors/microsoft_outlook.ts +410 -0
  120. package/dist/connectors/producthunt.ts +471 -0
  121. package/dist/connectors/reddit.ts +600 -0
  122. package/dist/connectors/revolut.ts +572 -0
  123. package/dist/connectors/rss.ts +448 -0
  124. package/dist/connectors/spotify.ts +590 -0
  125. package/dist/connectors/trustpilot.ts +203 -0
  126. package/dist/connectors/website.ts +629 -0
  127. package/dist/connectors/whatsapp.ts +1081 -0
  128. package/dist/connectors/whatsapp_local.ts +125 -0
  129. package/dist/connectors/x.ts +536 -0
  130. package/dist/connectors/youtube.ts +666 -0
  131. package/dist/db/migrations/00000000000000_baseline.sql +4867 -0
  132. package/dist/db/migrations/20260405193000_add_mcp_sessions.sql +33 -0
  133. package/dist/db/migrations/20260408120000_remove_system_connectors.sql +48 -0
  134. package/dist/db/migrations/20260408120001_optional_compiled_code.sql +6 -0
  135. package/dist/db/migrations/20260409110000_add_active_watcher_run_index.sql +9 -0
  136. package/dist/db/migrations/20260409130000_connector_default_config.sql +5 -0
  137. package/dist/db/migrations/20260410120000_add_agent_secrets.sql +25 -0
  138. package/dist/db/migrations/20260413170000_add_watcher_group_id.sql +67 -0
  139. package/dist/db/migrations/20260416120000_add_entity_wa_jid_index.sql +14 -0
  140. package/dist/db/migrations/20260417100000_add_entity_identities.sql +77 -0
  141. package/dist/db/migrations/20260418100000_add_auth_runs.sql +83 -0
  142. package/dist/db/migrations/20260418110000_add_runs_created_by_user.sql +18 -0
  143. package/dist/db/migrations/20260419120000_add_event_identity_indexes.sql +56 -0
  144. package/dist/db/migrations/20260420120000_extend_reserved_org_slugs.sql +56 -0
  145. package/dist/db/migrations/20260424030000_add_watcher_run_correlation.sql +52 -0
  146. package/dist/db/migrations/20260424130000_relax_events_client_id_fk.sql +47 -0
  147. package/dist/db/migrations/20260425100000_normalize_watcher_feedback.sql +91 -0
  148. package/dist/db/migrations/20260425120000_add_run_diagnostics.sql +20 -0
  149. package/dist/db/migrations/20260425130000_add_repair_agent_plumbing.sql +46 -0
  150. package/dist/db/migrations/20260426120000_entities_entity_type_fk.sql +101 -0
  151. package/dist/db/migrations/20260426130000_db_integrity_cleanup.sql +104 -0
  152. package/dist/db/migrations/20260426130001_db_integrity_cleanup_concurrent.sql +187 -0
  153. package/dist/db/migrations/20260427133000_events_created_by_nullable.sql +74 -0
  154. package/dist/db/migrations/20260427140000_identity_engine_indexes.sql +140 -0
  155. package/dist/db/migrations/20260427150000_drop_events_source_id.sql +177 -0
  156. package/dist/db/migrations/20260427160000_drop_dead_schema.sql +76 -0
  157. package/dist/db/migrations/20260427170000_market_founder_to_member.sql +364 -0
  158. package/dist/db/migrations/20260428040000_cascade_events_watchers_org_fk.sql +66 -0
  159. package/dist/db/migrations/20260428050000_add_runs_approved_input.sql +9 -0
  160. package/dist/db/migrations/20260429010000_auth_profile_tenant_scoped_fk.sql +79 -0
  161. package/dist/db/migrations/20260429060000_extend_runs_for_lobu_queue.sql +108 -0
  162. package/dist/db/migrations/20260429120000_agent_changed_notify.sql +97 -0
  163. package/dist/db/migrations/20260429120100_user_auth_profiles_and_model_prefs.sql +36 -0
  164. package/dist/db/migrations/20260429120200_fix_notify_old_keys.sql +130 -0
  165. package/dist/db/migrations/20260429130000_oauth_states_cli_sessions_rate_limits.sql +83 -0
  166. package/dist/db/migrations/20260429140000_phase8_grants_chat_connections_mcp_sessions.sql +84 -0
  167. package/dist/db/migrations/20260429140100_runs_priority_expires_at_retry_delay.sql +44 -0
  168. package/dist/db/migrations/20260429180000_drop_invalidatable_cache_triggers.sql +25 -0
  169. package/dist/db/migrations/20260430005614_agents_apply_fields.sql +21 -0
  170. package/dist/db/migrations/20260430022231_fix_connection_config_encryption.sql +69 -0
  171. package/dist/db/migrations/20260430151215_add_task_run_type.sql +77 -0
  172. package/dist/db/migrations/20260501000000_drop_cli_sessions.sql +27 -0
  173. package/dist/db/migrations/20260501133000_lobu_memory_mcp_id.sql +117 -0
  174. package/dist/db/migrations/20260502000000_drop_chat_connections.sql +60 -0
  175. package/dist/db/migrations/20260503000000_agent_secrets_org_scope.sql +56 -0
  176. package/dist/db/migrations/20260504000000_flatten_agents_drop_sandbox_model.sql +48 -0
  177. package/dist/db/migrations/20260510220000_connector_required_capability.sql +47 -0
  178. package/dist/db/migrations/20260512000000_device_worker_connection_binding.sql +113 -0
  179. package/dist/db/migrations/20260512131703_connections_slug.sql +131 -0
  180. package/dist/db/migrations/20260513000000_chat_user_identities.sql +24 -0
  181. package/dist/db/migrations/20260513120000_auth_profiles_device_binding.sql +50 -0
  182. package/dist/db/migrations/20260513150000_auth_profiles_cdp_url.sql +43 -0
  183. package/dist/db/migrations/20260513200000_notifications_as_events.sql +86 -0
  184. package/dist/db/migrations/20260514000000_scheduled_jobs.sql +97 -0
  185. package/dist/db/migrations/20260514120000_auth_profiles_connector_key_nullable.sql +42 -0
  186. package/dist/eval/types.d.ts +2 -0
  187. package/dist/eval/types.d.ts.map +1 -1
  188. package/dist/index.d.ts +11 -0
  189. package/dist/index.d.ts.map +1 -1
  190. package/dist/index.js +210 -132
  191. package/dist/index.js.map +1 -1
  192. package/dist/internal/api-client.d.ts +4 -8
  193. package/dist/internal/api-client.d.ts.map +1 -1
  194. package/dist/internal/api-client.js +1 -1
  195. package/dist/internal/api-client.js.map +1 -1
  196. package/dist/internal/context.js +2 -2
  197. package/dist/internal/context.js.map +1 -1
  198. package/dist/internal/credentials.d.ts.map +1 -1
  199. package/dist/internal/credentials.js +6 -1
  200. package/dist/internal/credentials.js.map +1 -1
  201. package/dist/internal/gateway-url.d.ts +14 -0
  202. package/dist/internal/gateway-url.d.ts.map +1 -1
  203. package/dist/internal/gateway-url.js +19 -0
  204. package/dist/internal/gateway-url.js.map +1 -1
  205. package/dist/internal/index.d.ts +3 -4
  206. package/dist/internal/index.d.ts.map +1 -1
  207. package/dist/internal/index.js +3 -3
  208. package/dist/internal/index.js.map +1 -1
  209. package/dist/internal/oauth.d.ts +6 -5
  210. package/dist/internal/oauth.d.ts.map +1 -1
  211. package/dist/internal/oauth.js +2 -2
  212. package/dist/internal/project-link.d.ts +10 -0
  213. package/dist/internal/project-link.d.ts.map +1 -0
  214. package/dist/internal/project-link.js +48 -0
  215. package/dist/internal/project-link.js.map +1 -0
  216. package/dist/providers.json +2 -2
  217. package/dist/server.bundle.mjs +31654 -30866
  218. package/dist/start-local.bundle.mjs +74409 -0
  219. package/dist/templates/README.md.tmpl +10 -11
  220. package/dist/templates/TESTING.md.tmpl +9 -9
  221. package/package.json +15 -13
  222. package/dist/__tests__/chat.integration.test.d.ts +0 -2
  223. package/dist/__tests__/chat.integration.test.d.ts.map +0 -1
  224. package/dist/__tests__/chat.integration.test.js +0 -337
  225. package/dist/__tests__/chat.integration.test.js.map +0 -1
  226. package/dist/__tests__/dev.test.d.ts +0 -2
  227. package/dist/__tests__/dev.test.d.ts.map +0 -1
  228. package/dist/__tests__/dev.test.js +0 -25
  229. package/dist/__tests__/dev.test.js.map +0 -1
  230. package/dist/__tests__/init-memory.test.d.ts +0 -2
  231. package/dist/__tests__/init-memory.test.d.ts.map +0 -1
  232. package/dist/__tests__/init-memory.test.js +0 -45
  233. package/dist/__tests__/init-memory.test.js.map +0 -1
  234. package/dist/__tests__/token.test.d.ts +0 -2
  235. package/dist/__tests__/token.test.d.ts.map +0 -1
  236. package/dist/__tests__/token.test.js +0 -52
  237. package/dist/__tests__/token.test.js.map +0 -1
  238. package/dist/commands/_lib/apply/__tests__/client.test.d.ts +0 -2
  239. package/dist/commands/_lib/apply/__tests__/client.test.d.ts.map +0 -1
  240. package/dist/commands/_lib/apply/__tests__/client.test.js +0 -23
  241. package/dist/commands/_lib/apply/__tests__/client.test.js.map +0 -1
  242. package/dist/commands/_lib/apply/__tests__/desired-state.test.d.ts +0 -2
  243. package/dist/commands/_lib/apply/__tests__/desired-state.test.d.ts.map +0 -1
  244. package/dist/commands/_lib/apply/__tests__/desired-state.test.js +0 -140
  245. package/dist/commands/_lib/apply/__tests__/desired-state.test.js.map +0 -1
  246. package/dist/commands/_lib/apply/__tests__/diff.test.d.ts +0 -2
  247. package/dist/commands/_lib/apply/__tests__/diff.test.d.ts.map +0 -1
  248. package/dist/commands/_lib/apply/__tests__/diff.test.js +0 -378
  249. package/dist/commands/_lib/apply/__tests__/diff.test.js.map +0 -1
  250. package/dist/commands/apply.d.ts +0 -3
  251. package/dist/commands/apply.d.ts.map +0 -1
  252. package/dist/commands/apply.js +0 -5
  253. package/dist/commands/apply.js.map +0 -1
  254. package/dist/commands/memory/_lib/openclaw-auth.test.d.ts +0 -2
  255. package/dist/commands/memory/_lib/openclaw-auth.test.d.ts.map +0 -1
  256. package/dist/commands/memory/_lib/openclaw-auth.test.js +0 -9
  257. package/dist/commands/memory/_lib/openclaw-auth.test.js.map +0 -1
  258. package/dist/internal/__tests__/api-client.test.d.ts +0 -2
  259. package/dist/internal/__tests__/api-client.test.d.ts.map +0 -1
  260. package/dist/internal/__tests__/api-client.test.js +0 -95
  261. package/dist/internal/__tests__/api-client.test.js.map +0 -1
  262. package/dist/internal/__tests__/context.test.d.ts +0 -2
  263. package/dist/internal/__tests__/context.test.d.ts.map +0 -1
  264. package/dist/internal/__tests__/context.test.js +0 -77
  265. package/dist/internal/__tests__/context.test.js.map +0 -1
@@ -0,0 +1,629 @@
1
+ /**
2
+ * Website Connector
3
+ *
4
+ * Scrapes web pages using Playwright for JS rendering.
5
+ * Supports sitemap.xml discovery or explicit URL list.
6
+ * Converts HTML → Markdown, splits into hierarchical sections.
7
+ * Tracks changes between syncs via content hashing.
8
+ */
9
+
10
+ import { createHash } from 'node:crypto';
11
+ import TurndownService from 'turndown';
12
+ import {
13
+ type ActionContext,
14
+ type ActionResult,
15
+ type ConnectorDefinition,
16
+ ConnectorRuntime,
17
+ type EventEnvelope,
18
+ launchBrowser,
19
+ type SyncContext,
20
+ type SyncResult,
21
+ } from '@lobu/connector-sdk';
22
+ import type { Page } from 'playwright';
23
+
24
/**
 * One heading-delimited slice of a page's markdown, as consumed by `sync`.
 */
interface PageSection {
  /** Heading text; used as the emitted event's title. */
  heading: string;
  /** Heading depth; reported as `heading_level` in event metadata. */
  level: number;
  /** Section body; hashed for change detection and sent as the event payload. */
  content: string;
  /** Fragment identifier appended to the page URL to address this section. */
  anchor: string;
  /**
   * Anchor of the enclosing section, when there is one. `sync` reads this to
   * derive `origin_parent_id`, so it has to be part of the declared shape.
   * NOTE(review): the producer of this field is outside the visible part of
   * the file - confirm it actually populates it.
   */
  parentAnchor?: string;
}
30
+
31
/**
 * Case-insensitive phrases typical of cookie / consent banners.
 * None of the patterns uses the `g` or `y` flag, so `RegExp#test` keeps no
 * state between calls.
 */
const COOKIE_BANNER_PATTERNS: RegExp[] = [
  // Direct mentions of cookies or consent
  /\bcookie(s)?\b/i,
  /\bconsent\b/i,
  // Typical banner buttons
  /\baccept all\b/i,
  /\breject all\b/i,
  /\bmanage (my )?preferences\b/i,
  /\bprivacy preferences\b/i,
  // Typical consent category labels
  /\bmarketing\b/i,
  /\bmeasurement\b/i,
  /\bnecessary\b/i,
];
42
+
43
/**
 * Counts how many of the given patterns match `text` at least once.
 *
 * @param text - Text to test.
 * @param patterns - Patterns to try; each contributes at most 1 to the total.
 * @returns Number of patterns for which `pattern.test(text)` is true.
 */
function countPatternMatches(text: string, patterns: RegExp[]): number {
  let hits = 0;
  for (const pattern of patterns) {
    if (pattern.test(text)) {
      hits += 1;
    }
  }
  return hits;
}
46
+
47
/**
 * Decides whether `text` looks like cookie-banner boilerplate.
 *
 * Whitespace runs are collapsed first; empty text is never skipped. The text
 * is treated as a banner when at least three distinct entries of
 * `COOKIE_BANNER_PATTERNS` match it.
 *
 * @param text - Candidate page or section text.
 * @returns `true` when the text should be dropped as banner content.
 */
function shouldSkipCookieBannerText(text: string): boolean {
  const collapsed = text.replace(/\s+/g, ' ').trim();
  if (collapsed.length === 0) {
    return false;
  }
  const matched = countPatternMatches(collapsed, COOKIE_BANNER_PATTERNS);
  return matched >= 3;
}
52
+
53
/**
 * Validates that a URL is safe for server-side fetching.
 *
 * Rejects anything that is not http(s), and hosts that are textually
 * recognisable as loopback, private, link-local or internal. Trailing dots of
 * a fully-qualified host name (`localhost.`, `svc.internal.`) are ignored so
 * they cannot be used to slip past the name checks.
 *
 * This is a syntactic check only: it performs no DNS resolution, so a public
 * name that resolves to a private address is not detected here.
 *
 * @param url - Absolute URL to check.
 * @throws Error if the URL cannot be parsed, uses another protocol, or points
 *   at a blocked host.
 */
function validatePublicUrl(url: string): void {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    throw new Error(`Invalid URL: ${url}`);
  }

  if (parsed.protocol !== 'https:' && parsed.protocol !== 'http:') {
    throw new Error(`URL must use http: or https: protocol, got ${parsed.protocol}`);
  }

  // "localhost." and "localhost" name the same host; drop root-label dots
  // before comparing so the FQDN spelling is not a bypass.
  const hostname = parsed.hostname.toLowerCase().replace(/\.+$/, '');

  // Block localhost variants
  if (hostname === 'localhost' || hostname === '[::1]' || hostname.endsWith('.localhost')) {
    throw new Error(`URL must not point to localhost: ${hostname}`);
  }

  // Block private/internal IPv4 ranges
  const ipv4Match = hostname.match(/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/);
  if (ipv4Match) {
    const a = Number(ipv4Match[1]);
    const b = Number(ipv4Match[2]);
    if (
      a === 127 || // 127.0.0.0/8 loopback
      a === 10 || // 10.0.0.0/8 private
      (a === 172 && b >= 16 && b <= 31) || // 172.16.0.0/12 private
      (a === 192 && b === 168) || // 192.168.0.0/16 private
      (a === 169 && b === 254) || // 169.254.0.0/16 link-local
      a === 0 // 0.0.0.0/8
    ) {
      throw new Error(`URL must not point to a private/internal IP address: ${hostname}`);
    }
  }

  // Block IPv6 private ranges (URL hostnames keep the surrounding brackets)
  if (hostname.startsWith('[')) {
    const ipv6 = hostname.slice(1, -1).toLowerCase();
    if (
      ipv6 === '::1' ||
      /^fe[89ab][0-9a-f]:/.test(ipv6) || // link-local, full fe80::/10 (fe80-febf)
      ipv6.startsWith('fc') || // unique local (fc00::/7)
      ipv6.startsWith('fd') || // unique local (fc00::/7)
      ipv6 === '::' || // unspecified
      ipv6.startsWith('::ffff:') // IPv4-mapped IPv6
    ) {
      throw new Error(`URL must not point to a private/internal IPv6 address: ${hostname}`);
    }
  }

  // Block common internal hostnames
  if (
    hostname.endsWith('.internal') ||
    hostname.endsWith('.local') ||
    hostname.endsWith('.corp') ||
    hostname.endsWith('.lan')
  ) {
    throw new Error(`URL must not point to an internal hostname: ${hostname}`);
  }
}
118
+
119
+ export default class WebsiteConnector extends ConnectorRuntime {
120
/**
 * Static connector descriptor: identity, auth requirements, and the single
 * `pages` feed with its config schema and the event kinds it emits.
 */
readonly definition: ConnectorDefinition = {
  key: 'website',
  name: 'Website',
  description:
    'Scrapes web pages with JS rendering via Playwright. Supports sitemap.xml for auto-discovery. Converts to markdown sections and tracks changes.',
  version: '1.0.0',
  faviconDomain: 'google.com',
  // No credentials are needed.
  authSchema: {
    methods: [{ type: 'none' }],
  },
  feeds: {
    pages: {
      key: 'pages',
      name: 'Web Pages',
      description: 'Scrape and parse web pages into structured content.',
      // User-facing feed options; `sync` reads each of these from ctx.config.
      configSchema: {
        type: 'object',
        properties: {
          sitemap_url: {
            type: 'string',
            format: 'uri',
            description:
              'URL to sitemap.xml. All URLs from the sitemap will be scraped. Takes priority over urls.',
          },
          urls: {
            type: 'array',
            items: { type: 'string', format: 'uri' },
            description: 'Explicit list of URLs to scrape. Ignored if sitemap_url is set.',
          },
          max_pages: {
            type: 'integer',
            minimum: 1,
            maximum: 100,
            default: 20,
            description: 'Maximum number of pages to scrape per sync (default: 20)',
          },
          parse_sections: {
            type: 'boolean',
            default: true,
            description:
              'Split page into sections by headings (h1-h3). If false, one event per page.',
          },
          wait_for_selector: {
            type: 'string',
            description:
              'CSS selector to wait for before extracting content (e.g. "main", "#content"). Useful for SPAs.',
          },
        },
      },
      eventKinds: {
        // Emitted when parse_sections is false: one event per page.
        page: {
          description: 'Full page content',
          metadataSchema: {
            type: 'object',
            properties: {
              content_hash: { type: 'string' },
              meta_title: { type: 'string' },
              meta_description: { type: 'string' },
              og_image: { type: 'string' },
              word_count: { type: 'number' },
            },
          },
        },
        // Emitted when parse_sections is true: one event per changed section.
        section: {
          description: 'A section of a page (split by headings)',
          metadataSchema: {
            type: 'object',
            properties: {
              heading: { type: 'string' },
              heading_level: { type: 'number' },
              anchor: { type: 'string' },
              section_index: { type: 'number' },
              page_url: { type: 'string' },
              content_hash: { type: 'string' },
            },
          },
        },
      },
    },
  },
};
201
+
202
/** HTML-to-Markdown converter shared by every page in a sync. */
private turndown: TurndownService;
/** Navigation timeout passed to `page.goto`, in milliseconds. */
private readonly PAGE_TIMEOUT = 30_000;
/** Pause between consecutive page loads, in milliseconds. */
private readonly PAGE_DELAY_MS = 2_000;

/**
 * Sets up the Markdown converter with ATX (`#`) headings and fenced code blocks.
 */
constructor() {
  super();
  const markdownOptions = {
    headingStyle: 'atx',
    codeBlockStyle: 'fenced',
  } as const;
  this.turndown = new TurndownService(markdownOptions);
}
213
+
214
/**
 * Scrapes the configured pages and emits events for content that changed
 * since the previous sync.
 *
 * URLs come from `sitemap_url` (preferred) or `urls`, capped at `max_pages`.
 * Each page is rendered in a browser, cleaned, converted to Markdown and
 * hashed; the hashes are compared with `ctx.checkpoint.hashes` so unchanged
 * pages and sections produce no events. A failure on one page is logged and
 * does not abort the remaining pages.
 *
 * @param ctx - Sync context supplying `config`, the previous `checkpoint`
 *   and an optional `log` callback.
 * @returns Emitted events, the new hash checkpoint, and summary metadata. If
 *   neither `sitemap_url` nor `urls` is configured, returns no events and
 *   the unchanged checkpoint.
 */
async sync(ctx: SyncContext): Promise<SyncResult> {
  const sitemapUrl = ctx.config.sitemap_url as string | undefined;
  const explicitUrls = ctx.config.urls as string[] | undefined;
  const maxPages = (ctx.config.max_pages as number) ?? 20;
  const parseSections = (ctx.config.parse_sections as boolean) ?? true;
  const waitForSelector = ctx.config.wait_for_selector as string | undefined;
  const previousHashes = (ctx.checkpoint?.hashes as Record<string, string>) ?? {};

  // Resolve URLs from sitemap or explicit list
  let urls: string[];
  if (sitemapUrl) {
    validatePublicUrl(sitemapUrl);
    urls = await this.fetchSitemap(sitemapUrl);
    ctx.log?.(`Sitemap: found ${urls.length} URLs`);
  } else if (explicitUrls?.length) {
    urls = explicitUrls;
  } else {
    return {
      events: [],
      checkpoint: ctx.checkpoint,
      metadata: { error: 'No sitemap_url or urls configured' },
    };
  }

  urls = urls.slice(0, maxPages);

  // Launch browser
  const { browser } = await launchBrowser({ stealth: false });
  const events: EventEnvelope[] = [];
  const newHashes: Record<string, string> = {};

  // Copies a page's previously recorded hashes (page-level and per-section)
  // into the new checkpoint, so section-level change detection keeps working
  // on the next sync even though the page was not re-processed this time.
  const carryForwardHashes = (pageUrl: string): void => {
    const sectionPrefix = `${pageUrl}#`;
    for (const [key, value] of Object.entries(previousHashes)) {
      if (key === pageUrl || key.startsWith(sectionPrefix)) {
        newHashes[key] = value;
      }
    }
  };

  try {
    for (const [i, url] of urls.entries()) {
      // Pause between pages. Doing this at the top of the iteration means
      // the `continue` paths below (skipped / unchanged pages) cannot bypass it.
      if (i > 0) {
        await this.sleep(this.PAGE_DELAY_MS);
      }

      try {
        validatePublicUrl(url);
        const page = (await browser.newPage()) as Page;
        try {
          await page.goto(url, { waitUntil: 'networkidle', timeout: this.PAGE_TIMEOUT });

          // Navigation may have been redirected; re-check the landing URL so
          // content from an internal address is never extracted and emitted.
          const finalUrl = page.url();
          validatePublicUrl(finalUrl);

          await this.dismissOverlays(page);

          if (waitForSelector) {
            await page.waitForSelector(waitForSelector, { timeout: 10000 }).catch(() => {
              ctx.log?.(`Selector "${waitForSelector}" not found on ${url}, continuing anyway`);
            });
          }

          await this.removeHiddenElements(page);
          const html = await page.content();
          const meta = this.extractMeta(html);
          const cleanHtml = this.stripNonContent(html);
          const markdown = this.deduplicateMarkdown(this.turndown.turndown(cleanHtml).trim());
          if (!markdown || shouldSkipCookieBannerText(`${meta.title ?? ''}\n${markdown}`)) {
            ctx.log?.(`Skipping low-signal page content for ${finalUrl}`);
            continue;
          }
          const contentHash = this.hash(markdown);

          if (previousHashes[url] === contentHash) {
            // Unchanged page: keep its page hash and its section hashes.
            carryForwardHashes(url);
            continue;
          }
          newHashes[url] = contentHash;

          if (parseSections) {
            const sections = this.parseSections(markdown);
            for (let si = 0; si < sections.length; si++) {
              const section = sections[si];
              const sectionHash = this.hash(section.content);
              const sectionKey = `${url}#${section.anchor}`;

              if (previousHashes[sectionKey] === sectionHash) {
                newHashes[sectionKey] = sectionHash;
                continue;
              }
              newHashes[sectionKey] = sectionHash;
              if (shouldSkipCookieBannerText(`${section.heading}\n${section.content}`)) {
                continue;
              }

              const parentKey = section.parentAnchor
                ? `${url}#${section.parentAnchor}`
                : undefined;
              events.push({
                origin_id: `web_section_${this.hash(sectionKey)}`,
                title: section.heading,
                payload_text: section.content,
                source_url: `${finalUrl}#${section.anchor}`,
                occurred_at: new Date(),
                origin_type: 'section',
                semantic_type: 'section',
                score: 50,
                origin_parent_id: parentKey ? `web_section_${this.hash(parentKey)}` : undefined,
                metadata: {
                  heading: section.heading,
                  heading_level: section.level,
                  anchor: section.anchor,
                  section_index: si,
                  page_url: finalUrl,
                  content_hash: sectionHash,
                },
              });
            }
          } else {
            events.push({
              origin_id: `web_page_${this.hash(url)}`,
              title: meta.title || finalUrl,
              payload_text: markdown,
              source_url: finalUrl,
              occurred_at: new Date(),
              origin_type: 'page',
              semantic_type: 'page',
              score: 50,
              metadata: {
                content_hash: contentHash,
                meta_title: meta.title,
                meta_description: meta.description,
                og_image: meta.ogImage,
                word_count: markdown.split(/\s+/).length,
              },
            });
          }
        } finally {
          await page.close();
        }
      } catch (err) {
        // A page that failed before being hashed keeps its old hashes, so a
        // transient error does not make everything look new on the next sync.
        if (newHashes[url] === undefined) {
          carryForwardHashes(url);
        }
        ctx.log?.(`Failed to scrape ${url}: ${err instanceof Error ? err.message : String(err)}`);
      }
    }
  } finally {
    await browser.close();
  }

  return {
    events,
    checkpoint: { hashes: newHashes, last_sync_at: new Date().toISOString() },
    metadata: { pages_scraped: urls.length, events_created: events.length },
  };
}
358
+
359
/**
 * This connector exposes no actions; every call reports failure.
 *
 * @param _ctx - Ignored.
 * @returns A failed result with the message `Actions not supported`.
 */
async execute(_ctx: ActionContext): Promise<ActionResult> {
  const unsupported: ActionResult = { success: false, error: 'Actions not supported' };
  return unsupported;
}
362
+
363
/**
 * Best-effort removal of cookie / consent overlays before content extraction.
 *
 * First clicks the first visible button whose accessible name exactly matches
 * one of a fixed list of labels (case-insensitive), then deletes remaining
 * cookie/consent/dialog-like elements whose text mentions cookie, consent or
 * privacy. Errors in either step are swallowed.
 *
 * @param page - Page to clean up in place.
 */
private async dismissOverlays(page: Page): Promise<void> {
  const dismissLabels = [
    'Accept',
    'Accept all',
    'I agree',
    'Allow all',
    'Got it',
    'Continue',
    'Close',
  ];

  for (const label of dismissLabels) {
    try {
      const exactLabel = new RegExp(`^${label}$`, 'i');
      const candidate = page.getByRole('button', { name: exactLabel }).first();
      const visible = await candidate.isVisible({ timeout: 500 });
      if (!visible) {
        continue;
      }
      await candidate.click({ timeout: 1000 });
      // One successful click is enough.
      break;
    } catch {
      // Keep trying other labels/selectors.
    }
  }

  try {
    await page.evaluate(() => {
      const overlaySelector = [
        '[id*="cookie" i]',
        '[class*="cookie" i]',
        '[id*="consent" i]',
        '[class*="consent" i]',
        '[id*="onetrust" i]',
        '[class*="onetrust" i]',
        '[aria-modal="true"]',
        '[role="dialog"]',
      ].join(',');

      for (const node of document.querySelectorAll(overlaySelector)) {
        const visibleText = (node as HTMLElement).innerText || '';
        // Only drop elements whose own text confirms they are consent UI.
        if (/cookie|consent|privacy/i.test(visibleText)) {
          node.remove();
        }
      }
    });
  } catch {
    // DOM cleanup is best-effort.
  }
}
410
+
411
/**
 * Removes elements under `document.body` whose computed style is
 * `display: none` or `visibility: hidden`.
 *
 * This handles responsive duplicates where the same content is rendered for
 * desktop and mobile (e.g. Tailwind `hidden md:block` / `md:hidden`).
 * SCRIPT, STYLE, LINK, META, BR and HR elements are left alone. Any failure
 * is ignored and the DOM is used as-is.
 *
 * @param page - Page whose DOM is pruned in place.
 */
private async removeHiddenElements(page: Page): Promise<void> {
  try {
    await page.evaluate(() => {
      // Elements that can't meaningfully contain scraped content
      const ignoredTags = ['SCRIPT', 'STYLE', 'LINK', 'META', 'BR', 'HR'];
      const hidden: Element[] = [];
      const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_ELEMENT);

      // Collect first, remove afterwards, so the walk is not disturbed.
      for (let node = walker.nextNode(); node; node = walker.nextNode()) {
        const el = node as HTMLElement;
        if (ignoredTags.includes(el.tagName)) {
          continue;
        }
        const { display, visibility } = getComputedStyle(el);
        if (display === 'none' || visibility === 'hidden') {
          hidden.push(el);
        }
      }

      hidden.forEach((el) => {
        el.remove();
      });
    });
  } catch {
    // Best-effort — continue with the full DOM if this fails.
  }
}
438
+
439
/**
 * Drops repeated long lines from Markdown, keeping the first occurrence.
 *
 * Animation containers and responsive layouts often emit the same image or
 * link line several times. Only lines whose trimmed length is at least 80
 * characters are deduplicated; shorter lines (blank lines, list markers, …)
 * may repeat freely.
 *
 * @param markdown - Markdown text to filter.
 * @returns The text with later duplicates of long lines removed.
 */
private deduplicateMarkdown(markdown: string): string {
  const longLinesSeen = new Set<string>();
  return markdown
    .split('\n')
    .filter((line) => {
      const key = line.trim();
      if (key.length < 80) {
        return true;
      }
      if (longLinesSeen.has(key)) {
        return false;
      }
      longLinesSeen.add(key);
      return true;
    })
    .join('\n');
}
459
+
460
/**
 * Downloads a sitemap and returns the page URLs it lists.
 *
 * `<loc>` values are XML-unescaped (entities and CDATA) before use. URLs
 * that end in a non-HTML extension or contain a fragment are skipped. When
 * no page URL is found the document is treated as a sitemap index: up to
 * five child sitemaps are validated and fetched recursively, with the
 * nesting depth capped so a self-referencing or cyclic index terminates.
 *
 * @param sitemapUrl - URL of the sitemap; callers validate it beforehand.
 * @param depth - Current index nesting level; callers normally omit it.
 * @returns Page URLs in document order (not deduplicated).
 * @throws Error on a non-2xx response, or when a child sitemap URL fails
 *   `validatePublicUrl`.
 */
private async fetchSitemap(sitemapUrl: string, depth = 0): Promise<string[]> {
  // Root index -> child index -> sitemap is the deepest chain followed.
  const maxIndexDepth = 2;

  const response = await fetch(sitemapUrl, {
    headers: { 'User-Agent': 'Mozilla/5.0 (compatible; LobuBot/1.0)' },
  });

  if (!response.ok) {
    throw new Error(`Sitemap fetch failed: HTTP ${response.status}`);
  }

  const xml = await response.text();
  const urls: string[] = [];

  // Sitemap URLs are XML text, so `&` arrives as `&amp;` (or inside CDATA).
  // `&amp;` is decoded last so that `&amp;lt;` is not decoded twice.
  const decodeLoc = (raw: string): string =>
    raw
      .trim()
      .replace(/^<!\[CDATA\[(.*?)\]\]>$/, '$1')
      .trim()
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&apos;/g, "'")
      .replace(/&amp;/g, '&');

  // Parse <loc> tags from sitemap XML
  for (const match of xml.matchAll(/<loc>\s*(.*?)\s*<\/loc>/gi)) {
    const url = decodeLoc(match[1] ?? '');
    // Skip non-HTML resources and anchor fragment URLs
    if (
      url &&
      !/\.(pdf|jpg|jpeg|png|gif|svg|css|js|xml|json|zip|gz)$/i.test(url) &&
      !url.includes('#')
    ) {
      urls.push(url);
    }
  }

  // Handle sitemap index (sitemaps linking to other sitemaps)
  if (urls.length === 0 && depth < maxIndexDepth) {
    const childSitemaps: string[] = [];
    for (const match of xml.matchAll(/<sitemap>\s*<loc>\s*(.*?)\s*<\/loc>/gi)) {
      const childUrl = decodeLoc(match[1] ?? '');
      // An index that lists itself would only fetch the same document again.
      if (childUrl && childUrl !== sitemapUrl) {
        childSitemaps.push(childUrl);
      }
    }
    for (const childUrl of childSitemaps.slice(0, 5)) {
      validatePublicUrl(childUrl);
      const childUrls = await this.fetchSitemap(childUrl, depth + 1);
      urls.push(...childUrls);
    }
  }

  return urls;
}
503
+
504
/**
 * Extract the page title, meta description and Open Graph image from raw HTML.
 *
 * Meta tags are located with a quote-aware pattern and their attributes are
 * parsed individually, so attribute order does not matter and a `content`
 * value may contain the other quote character (e.g. `content="It's great"`)
 * or a `>` without being truncated.
 *
 * @param html Raw HTML of the page.
 * @returns The trimmed `<title>` text, the `content` of the first
 *   `<meta name="description">` and of the first `<meta property="og:image">`
 *   that has non-empty content; each field is `undefined` when not found.
 */
private extractMeta(html: string): { title?: string; description?: string; ogImage?: string } {
  const titleMatch = html.match(/<title[^>]*>(.*?)<\/title>/is);

  // Return the trimmed, non-empty `content` of the first <meta> tag whose
  // `keyAttr` attribute equals `keyValue` (compared case-insensitively).
  const readMetaContent = (keyAttr: 'name' | 'property', keyValue: string): string | undefined => {
    // Quoted attribute values are consumed as a unit so a '>' inside them
    // does not end the tag early.
    const tagPattern = /<meta\b((?:"[^"]*"|'[^']*'|[^'">])*)>/gi;
    let tag: RegExpExecArray | null;
    while ((tag = tagPattern.exec(html)) !== null) {
      const attributeText = tag[1] ?? '';
      const attributes = new Map<string, string>();
      const attributePattern = /([^\s"'=<>\/]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))/g;
      let attribute: RegExpExecArray | null;
      while ((attribute = attributePattern.exec(attributeText)) !== null) {
        const attributeName = (attribute[1] ?? '').toLowerCase();
        // First occurrence wins, matching how browsers treat duplicate attributes.
        if (!attributes.has(attributeName)) {
          attributes.set(attributeName, attribute[2] ?? attribute[3] ?? attribute[4] ?? '');
        }
      }

      if (attributes.get(keyAttr)?.toLowerCase() !== keyValue) continue;
      const content = attributes.get('content')?.trim();
      if (content) return content;
    }
    return undefined;
  };

  return {
    title: titleMatch?.[1]?.trim(),
    description: readMetaContent('name', 'description'),
    ogImage: readMetaContent('property', 'og:image'),
  };
}
519
+
520
/**
 * Remove markup that carries no readable page content.
 *
 * Paired elements (scripts, styles, navigation chrome, embedded media, ...)
 * are removed together with everything between their opening and closing
 * tags, one tag name at a time in list order. Afterwards stand-alone
 * `link`, `meta` and `input` tags are dropped.
 *
 * @param html Raw HTML.
 * @returns The HTML with the listed elements removed.
 */
private stripNonContent(html: string): string {
  const containerTags = [
    'script',
    'style',
    'noscript',
    'nav',
    'header',
    'footer',
    'aside',
    'iframe',
    'svg',
    'canvas',
    'video',
    'audio',
    'menu',
    'dialog',
    'embed',
    'object',
    'applet',
  ];

  // Strip each paired element in turn; the order of the list is preserved
  // because it affects the result when tags are improperly nested.
  const withoutContainers = containerTags.reduce(
    (remaining, tagName) =>
      remaining.replace(new RegExp(`<${tagName}\\b[^>]*>[\\s\\S]*?<\\/${tagName}>`, 'gi'), ''),
    html,
  );

  // Remove self-closing / void elements that add noise
  return withoutContainers.replace(/<(link|meta|input)\b[^>]*\/?>/gi, '');
}
548
+
549
/**
 * Split markdown into sections at level 1-3 headings.
 *
 * Text before the first heading is emitted under the synthetic heading
 * "Introduction" (level 1). A section is only emitted when it has non-empty
 * content. Each emitted section gets a slug-based anchor that is unique
 * within the document (`slug`, `slug-1`, `slug-2`, ...), and a
 * `parentAnchor` pointing at the nearest enclosing heading that was itself
 * emitted as a section (or `undefined` when there is none).
 *
 * @param markdown Markdown text to split.
 * @returns The sections in document order.
 */
private parseSections(markdown: string): (PageSection & { parentAnchor?: string })[] {
  const lines = markdown.split('\n');
  const sections: (PageSection & { parentAnchor?: string })[] = [];
  let currentHeading = 'Introduction';
  let currentLevel = 1;
  let currentLines: string[] = [];

  // Per-slug counters so anchors stay stable when unrelated sections change.
  // Only incremented when a section is emitted, not for heading stack entries.
  const slugCounts = new Map<string, number>();

  const makeAnchor = (heading: string): string => {
    const slug = this.slugify(heading);
    const count = slugCounts.get(slug) ?? 0;
    slugCounts.set(slug, count + 1);
    return count === 0 ? slug : `${slug}-${count}`;
  };

  // Heading hierarchy: ancestors of the current heading followed by the
  // current heading itself (always the last entry once a heading was seen).
  // Anchors are assigned lazily when the heading's section is emitted.
  const headingStack: { heading: string; level: number; anchor?: string }[] = [];

  const emitSection = (heading: string, level: number, content: string) => {
    const anchor = makeAnchor(heading);

    // The section being emitted always belongs to the top stack entry; the
    // stack is only empty for the leading "Introduction" section.
    const ownIndex = headingStack.length - 1;
    const ownEntry = headingStack[ownIndex];
    if (ownEntry) ownEntry.anchor = anchor;

    // The parent is the nearest ancestor below the top entry that has an
    // anchor. Ancestors without content were never emitted and have none,
    // so they are skipped rather than producing a dangling reference.
    let parentAnchor: string | undefined;
    for (let i = ownIndex - 1; i >= 0 && parentAnchor === undefined; i--) {
      parentAnchor = headingStack[i]?.anchor;
    }

    sections.push({ heading, level, content, anchor, parentAnchor });
  };

  for (const line of lines) {
    const headingMatch = line.match(/^(#{1,3})\s+(.+)$/);
    if (headingMatch) {
      // A new heading closes the section collected so far.
      const content = currentLines.join('\n').trim();
      if (content.length > 0) {
        emitSection(currentHeading, currentLevel, content);
      }

      const newLevel = headingMatch[1].length;
      const newHeading = headingMatch[2].trim();

      // Pop stack until we find a parent with a lower level
      while (headingStack.length > 0 && headingStack[headingStack.length - 1].level >= newLevel) {
        headingStack.pop();
      }
      headingStack.push({ heading: newHeading, level: newLevel });

      currentHeading = newHeading;
      currentLevel = newLevel;
      currentLines = [];
    } else {
      currentLines.push(line);
    }
  }

  // Flush the trailing section.
  const content = currentLines.join('\n').trim();
  if (content.length > 0) {
    emitSection(currentHeading, currentLevel, content);
  }

  return sections;
}
613
+
614
/**
 * Turn arbitrary text into a lowercase, hyphen-separated slug of at most
 * 60 characters.
 *
 * Every run of characters outside `a-z0-9` becomes a single hyphen. The
 * trailing hyphen is stripped after truncation, so cutting the slug at a
 * word boundary never leaves a dangling `-` at the end.
 *
 * @param text Text to convert (e.g. a heading).
 * @returns The slug; empty when the text has no ASCII letters or digits.
 */
private slugify(text: string): string {
  return text
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-/, '')
    .substring(0, 60)
    .replace(/-$/, '');
}
621
+
622
/**
 * Compute a short content fingerprint.
 *
 * @param text Text to fingerprint.
 * @returns The first 16 hex characters of the SHA-256 digest of `text`.
 */
private hash(text: string): string {
  const hasher = createHash('sha256');
  hasher.update(text);
  const hexDigest = hasher.digest('hex');
  return hexDigest.substring(0, 16);
}
625
+
626
/**
 * Pause for a given duration.
 *
 * @param ms Delay in milliseconds.
 * @returns A promise that resolves once the timer set for `ms` fires.
 */
private sleep(ms: number): Promise<void> {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
629
+ }