@lobu/cli 6.0.0 → 6.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -27
- package/dist/bundled-skills/lobu/SKILL.md +12 -12
- package/dist/commands/_lib/apply/apply-cmd.d.ts +2 -0
- package/dist/commands/_lib/apply/apply-cmd.d.ts.map +1 -1
- package/dist/commands/_lib/apply/apply-cmd.js +26 -0
- package/dist/commands/_lib/apply/apply-cmd.js.map +1 -1
- package/dist/commands/_lib/apply/client.d.ts +1 -1
- package/dist/commands/_lib/apply/client.d.ts.map +1 -1
- package/dist/commands/_lib/apply/desired-state.js +6 -6
- package/dist/commands/_lib/apply/desired-state.js.map +1 -1
- package/dist/commands/agent.d.ts +7 -0
- package/dist/commands/agent.d.ts.map +1 -1
- package/dist/commands/agent.js +65 -1
- package/dist/commands/agent.js.map +1 -1
- package/dist/commands/chat.d.ts +12 -9
- package/dist/commands/chat.d.ts.map +1 -1
- package/dist/commands/chat.js +117 -56
- package/dist/commands/chat.js.map +1 -1
- package/dist/commands/dev.d.ts +15 -7
- package/dist/commands/dev.d.ts.map +1 -1
- package/dist/commands/dev.js +79 -44
- package/dist/commands/dev.js.map +1 -1
- package/dist/commands/doctor.d.ts +1 -0
- package/dist/commands/doctor.d.ts.map +1 -1
- package/dist/commands/doctor.js +136 -0
- package/dist/commands/doctor.js.map +1 -1
- package/dist/commands/eval.d.ts +8 -0
- package/dist/commands/eval.d.ts.map +1 -1
- package/dist/commands/eval.js +56 -1
- package/dist/commands/eval.js.map +1 -1
- package/dist/commands/init.d.ts +20 -5
- package/dist/commands/init.d.ts.map +1 -1
- package/dist/commands/init.js +332 -183
- package/dist/commands/init.js.map +1 -1
- package/dist/commands/link.d.ts +11 -0
- package/dist/commands/link.d.ts.map +1 -0
- package/dist/commands/link.js +28 -0
- package/dist/commands/link.js.map +1 -0
- package/dist/commands/login.d.ts.map +1 -1
- package/dist/commands/login.js +14 -2
- package/dist/commands/login.js.map +1 -1
- package/dist/commands/memory/_lib/browser-auth-cmd.d.ts.map +1 -1
- package/dist/commands/memory/_lib/browser-auth-cmd.js +4 -4
- package/dist/commands/memory/_lib/browser-auth-cmd.js.map +1 -1
- package/dist/commands/memory/_lib/install-targets.d.ts.map +1 -1
- package/dist/commands/memory/_lib/install-targets.js +1 -5
- package/dist/commands/memory/_lib/install-targets.js.map +1 -1
- package/dist/commands/memory/_lib/mcp.d.ts +2 -2
- package/dist/commands/memory/_lib/mcp.d.ts.map +1 -1
- package/dist/commands/memory/_lib/mcp.js +24 -12
- package/dist/commands/memory/_lib/mcp.js.map +1 -1
- package/dist/commands/memory/_lib/openclaw-auth.d.ts +1 -0
- package/dist/commands/memory/_lib/openclaw-auth.d.ts.map +1 -1
- package/dist/commands/memory/_lib/openclaw-auth.js +14 -3
- package/dist/commands/memory/_lib/openclaw-auth.js.map +1 -1
- package/dist/commands/memory/_lib/openclaw-cmd.js +1 -1
- package/dist/commands/memory/_lib/openclaw-cmd.js.map +1 -1
- package/dist/commands/memory/_lib/schema.d.ts +2 -2
- package/dist/commands/memory/_lib/schema.d.ts.map +1 -1
- package/dist/commands/memory/_lib/schema.js +3 -3
- package/dist/commands/memory/_lib/schema.js.map +1 -1
- package/dist/commands/memory/_lib/seed-cmd.d.ts.map +1 -1
- package/dist/commands/memory/_lib/seed-cmd.js +5 -6
- package/dist/commands/memory/_lib/seed-cmd.js.map +1 -1
- package/dist/commands/memory/run.d.ts.map +1 -1
- package/dist/commands/memory/run.js +2 -2
- package/dist/commands/memory/run.js.map +1 -1
- package/dist/commands/platforms/platform-prompts.d.ts +0 -1
- package/dist/commands/platforms/platform-prompts.d.ts.map +1 -1
- package/dist/commands/platforms/platform-prompts.js +54 -8
- package/dist/commands/platforms/platform-prompts.js.map +1 -1
- package/dist/commands/telemetry.d.ts +10 -0
- package/dist/commands/telemetry.d.ts.map +1 -0
- package/dist/commands/telemetry.js +68 -0
- package/dist/commands/telemetry.js.map +1 -0
- package/dist/commands/whoami.d.ts.map +1 -1
- package/dist/commands/whoami.js +1 -1
- package/dist/commands/whoami.js.map +1 -1
- package/dist/connectors/README.md +534 -0
- package/dist/connectors/__tests__/browser-scraper-utils.test.ts +186 -0
- package/dist/connectors/browser-scraper-utils.ts +214 -0
- package/dist/connectors/capterra.ts +273 -0
- package/dist/connectors/g2.ts +286 -0
- package/dist/connectors/github.ts +1553 -0
- package/dist/connectors/glassdoor.ts +291 -0
- package/dist/connectors/gmaps.ts +197 -0
- package/dist/connectors/google_calendar.ts +631 -0
- package/dist/connectors/google_gmail.ts +751 -0
- package/dist/connectors/google_photos.ts +776 -0
- package/dist/connectors/google_play.ts +342 -0
- package/dist/connectors/hackernews.ts +471 -0
- package/dist/connectors/index.ts +23 -0
- package/dist/connectors/ios_appstore.ts +226 -0
- package/dist/connectors/linkedin.ts +471 -0
- package/dist/connectors/microsoft_outlook.ts +410 -0
- package/dist/connectors/producthunt.ts +471 -0
- package/dist/connectors/reddit.ts +600 -0
- package/dist/connectors/rss.ts +448 -0
- package/dist/connectors/spotify.ts +590 -0
- package/dist/connectors/trustpilot.ts +199 -0
- package/dist/connectors/website.ts +629 -0
- package/dist/connectors/whatsapp.ts +1073 -0
- package/dist/connectors/x.ts +526 -0
- package/dist/connectors/youtube.ts +666 -0
- package/dist/db/migrations/00000000000000_baseline.sql +4867 -0
- package/dist/db/migrations/20260405193000_add_mcp_sessions.sql +33 -0
- package/dist/db/migrations/20260408120000_remove_system_connectors.sql +48 -0
- package/dist/db/migrations/20260408120001_optional_compiled_code.sql +6 -0
- package/dist/db/migrations/20260409110000_add_active_watcher_run_index.sql +9 -0
- package/dist/db/migrations/20260409130000_connector_default_config.sql +5 -0
- package/dist/db/migrations/20260410120000_add_agent_secrets.sql +25 -0
- package/dist/db/migrations/20260413170000_add_watcher_group_id.sql +67 -0
- package/dist/db/migrations/20260416120000_add_entity_wa_jid_index.sql +14 -0
- package/dist/db/migrations/20260417100000_add_entity_identities.sql +77 -0
- package/dist/db/migrations/20260418100000_add_auth_runs.sql +83 -0
- package/dist/db/migrations/20260418110000_add_runs_created_by_user.sql +18 -0
- package/dist/db/migrations/20260419120000_add_event_identity_indexes.sql +56 -0
- package/dist/db/migrations/20260420120000_extend_reserved_org_slugs.sql +56 -0
- package/dist/db/migrations/20260424030000_add_watcher_run_correlation.sql +52 -0
- package/dist/db/migrations/20260424130000_relax_events_client_id_fk.sql +47 -0
- package/dist/db/migrations/20260425100000_normalize_watcher_feedback.sql +91 -0
- package/dist/db/migrations/20260425120000_add_run_diagnostics.sql +20 -0
- package/dist/db/migrations/20260425130000_add_repair_agent_plumbing.sql +46 -0
- package/dist/db/migrations/20260426120000_entities_entity_type_fk.sql +101 -0
- package/dist/db/migrations/20260426130000_db_integrity_cleanup.sql +104 -0
- package/dist/db/migrations/20260426130001_db_integrity_cleanup_concurrent.sql +187 -0
- package/dist/db/migrations/20260427133000_events_created_by_nullable.sql +74 -0
- package/dist/db/migrations/20260427140000_identity_engine_indexes.sql +140 -0
- package/dist/db/migrations/20260427150000_drop_events_source_id.sql +177 -0
- package/dist/db/migrations/20260427160000_drop_dead_schema.sql +76 -0
- package/dist/db/migrations/20260427170000_market_founder_to_member.sql +364 -0
- package/dist/db/migrations/20260428040000_cascade_events_watchers_org_fk.sql +66 -0
- package/dist/db/migrations/20260428050000_add_runs_approved_input.sql +9 -0
- package/dist/db/migrations/20260429010000_auth_profile_tenant_scoped_fk.sql +79 -0
- package/dist/db/migrations/20260429060000_extend_runs_for_lobu_queue.sql +108 -0
- package/dist/db/migrations/20260429120000_agent_changed_notify.sql +97 -0
- package/dist/db/migrations/20260429120100_user_auth_profiles_and_model_prefs.sql +36 -0
- package/dist/db/migrations/20260429120200_fix_notify_old_keys.sql +130 -0
- package/dist/db/migrations/20260429130000_oauth_states_cli_sessions_rate_limits.sql +83 -0
- package/dist/db/migrations/20260429140000_phase8_grants_chat_connections_mcp_sessions.sql +84 -0
- package/dist/db/migrations/20260429140100_runs_priority_expires_at_retry_delay.sql +44 -0
- package/dist/db/migrations/20260429180000_drop_invalidatable_cache_triggers.sql +25 -0
- package/dist/db/migrations/20260430005614_agents_apply_fields.sql +21 -0
- package/dist/db/migrations/20260430022231_fix_connection_config_encryption.sql +69 -0
- package/dist/db/migrations/20260430151215_add_task_run_type.sql +77 -0
- package/dist/db/migrations/20260501000000_drop_cli_sessions.sql +27 -0
- package/dist/db/migrations/20260501133000_lobu_memory_mcp_id.sql +117 -0
- package/dist/db/migrations/20260502000000_drop_chat_connections.sql +60 -0
- package/dist/db/migrations/20260503000000_agent_secrets_org_scope.sql +56 -0
- package/dist/db/migrations/20260504000000_flatten_agents_drop_sandbox_model.sql +48 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +147 -23
- package/dist/index.js.map +1 -1
- package/dist/internal/api-client.d.ts +4 -8
- package/dist/internal/api-client.d.ts.map +1 -1
- package/dist/internal/api-client.js +1 -1
- package/dist/internal/api-client.js.map +1 -1
- package/dist/internal/context.js +2 -2
- package/dist/internal/context.js.map +1 -1
- package/dist/internal/credentials.d.ts.map +1 -1
- package/dist/internal/credentials.js +6 -1
- package/dist/internal/credentials.js.map +1 -1
- package/dist/internal/index.d.ts +2 -3
- package/dist/internal/index.d.ts.map +1 -1
- package/dist/internal/index.js +2 -2
- package/dist/internal/index.js.map +1 -1
- package/dist/internal/oauth.d.ts +7 -6
- package/dist/internal/oauth.d.ts.map +1 -1
- package/dist/internal/oauth.js +3 -3
- package/dist/internal/project-link.d.ts +10 -0
- package/dist/internal/project-link.d.ts.map +1 -0
- package/dist/internal/project-link.js +48 -0
- package/dist/internal/project-link.js.map +1 -0
- package/dist/providers.json +2 -2
- package/dist/server.bundle.mjs +3173 -4404
- package/dist/start-local.bundle.mjs +71481 -0
- package/dist/templates/README.md.tmpl +10 -11
- package/package.json +14 -12
- package/dist/__tests__/chat.integration.test.d.ts +0 -2
- package/dist/__tests__/chat.integration.test.d.ts.map +0 -1
- package/dist/__tests__/chat.integration.test.js +0 -337
- package/dist/__tests__/chat.integration.test.js.map +0 -1
- package/dist/__tests__/dev.test.d.ts +0 -2
- package/dist/__tests__/dev.test.d.ts.map +0 -1
- package/dist/__tests__/dev.test.js +0 -25
- package/dist/__tests__/dev.test.js.map +0 -1
- package/dist/__tests__/init-memory.test.d.ts +0 -2
- package/dist/__tests__/init-memory.test.d.ts.map +0 -1
- package/dist/__tests__/init-memory.test.js +0 -45
- package/dist/__tests__/init-memory.test.js.map +0 -1
- package/dist/__tests__/token.test.d.ts +0 -2
- package/dist/__tests__/token.test.d.ts.map +0 -1
- package/dist/__tests__/token.test.js +0 -52
- package/dist/__tests__/token.test.js.map +0 -1
- package/dist/commands/_lib/apply/__tests__/client.test.d.ts +0 -2
- package/dist/commands/_lib/apply/__tests__/client.test.d.ts.map +0 -1
- package/dist/commands/_lib/apply/__tests__/client.test.js +0 -23
- package/dist/commands/_lib/apply/__tests__/client.test.js.map +0 -1
- package/dist/commands/_lib/apply/__tests__/desired-state.test.d.ts +0 -2
- package/dist/commands/_lib/apply/__tests__/desired-state.test.d.ts.map +0 -1
- package/dist/commands/_lib/apply/__tests__/desired-state.test.js +0 -140
- package/dist/commands/_lib/apply/__tests__/desired-state.test.js.map +0 -1
- package/dist/commands/_lib/apply/__tests__/diff.test.d.ts +0 -2
- package/dist/commands/_lib/apply/__tests__/diff.test.d.ts.map +0 -1
- package/dist/commands/_lib/apply/__tests__/diff.test.js +0 -378
- package/dist/commands/_lib/apply/__tests__/diff.test.js.map +0 -1
- package/dist/commands/apply.d.ts +0 -3
- package/dist/commands/apply.d.ts.map +0 -1
- package/dist/commands/apply.js +0 -5
- package/dist/commands/apply.js.map +0 -1
- package/dist/commands/memory/_lib/openclaw-auth.test.d.ts +0 -2
- package/dist/commands/memory/_lib/openclaw-auth.test.d.ts.map +0 -1
- package/dist/commands/memory/_lib/openclaw-auth.test.js +0 -9
- package/dist/commands/memory/_lib/openclaw-auth.test.js.map +0 -1
- package/dist/internal/__tests__/api-client.test.d.ts +0 -2
- package/dist/internal/__tests__/api-client.test.d.ts.map +0 -1
- package/dist/internal/__tests__/api-client.test.js +0 -95
- package/dist/internal/__tests__/api-client.test.js.map +0 -1
- package/dist/internal/__tests__/context.test.d.ts +0 -2
- package/dist/internal/__tests__/context.test.d.ts.map +0 -1
- package/dist/internal/__tests__/context.test.js +0 -77
- package/dist/internal/__tests__/context.test.js.map +0 -1
|
@@ -0,0 +1,629 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Website Connector
|
|
3
|
+
*
|
|
4
|
+
* Scrapes web pages using Playwright for JS rendering.
|
|
5
|
+
* Supports sitemap.xml discovery or explicit URL list.
|
|
6
|
+
* Converts HTML → Markdown, splits into hierarchical sections.
|
|
7
|
+
* Tracks changes between syncs via content hashing.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { createHash } from 'node:crypto';
|
|
11
|
+
import TurndownService from 'turndown';
|
|
12
|
+
import {
|
|
13
|
+
type ActionContext,
|
|
14
|
+
type ActionResult,
|
|
15
|
+
type ConnectorDefinition,
|
|
16
|
+
ConnectorRuntime,
|
|
17
|
+
type EventEnvelope,
|
|
18
|
+
launchBrowser,
|
|
19
|
+
type SyncContext,
|
|
20
|
+
type SyncResult,
|
|
21
|
+
} from '@lobu/connector-sdk';
|
|
22
|
+
import type { Page } from 'playwright';
|
|
23
|
+
|
|
24
|
+
/** One heading-delimited slice of a page's markdown. */
interface PageSection {
  /** Heading text of the section. */
  heading: string;
  /** Heading depth of the section. */
  level: number;
  /** Markdown body belonging to the section. */
  content: string;
  /** Fragment identifier appended to the page URL to address this section. */
  anchor: string;
}
|
|
30
|
+
|
|
31
|
+
// Vocabulary used to recognise cookie/consent banner text.
// Every pattern is case-insensitive and anchored on word boundaries.
const COOKIE_BANNER_PATTERNS: RegExp[] = [
  /\bcookie(s)?\b/i,
  /\bconsent\b/i,
  /\baccept all\b/i,
  /\breject all\b/i,
  /\bmanage (my )?preferences\b/i,
  /\bprivacy preferences\b/i,
  /\bmarketing\b/i,
  /\bmeasurement\b/i,
  /\bnecessary\b/i,
];
|
|
42
|
+
|
|
43
|
+
/**
 * Counts how many of the given patterns match `text` at least once.
 *
 * @param text - The text to scan.
 * @param patterns - Regular expressions to try; each contributes at most 1 to the result.
 * @returns The number of patterns that matched.
 */
function countPatternMatches(text: string, patterns: RegExp[]): number {
  let matched = 0;
  for (const pattern of patterns) {
    // `search` always scans from index 0 and restores `lastIndex` afterwards,
    // so a pattern carrying the `g` flag gives the same answer on every call.
    // `RegExp.prototype.test` would resume from the previous match position
    // and alternate between true and false for the same input.
    if (text.search(pattern) !== -1) {
      matched += 1;
    }
  }
  return matched;
}
|
|
46
|
+
|
|
47
|
+
/**
 * Decides whether a piece of text looks like cookie-banner boilerplate.
 *
 * Whitespace runs are collapsed first. Empty text is never skipped. Otherwise
 * the text is skipped when at least three of the banner patterns match it.
 */
function shouldSkipCookieBannerText(text: string): boolean {
  const collapsed = text.replace(/\s+/g, ' ').trim();
  if (collapsed.length === 0) {
    return false;
  }
  const hits = countPatternMatches(collapsed, COOKIE_BANNER_PATTERNS);
  return hits >= 3;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Validates that a URL is safe for server-side fetching.
 *
 * Rejects non-HTTP(S) schemes, localhost names, literal private/internal IPv4
 * and IPv6 addresses, and common internal hostname suffixes, to reduce the risk
 * of SSRF.
 *
 * This is a purely lexical check on the parsed URL. It performs no DNS lookup,
 * so a public-looking hostname that resolves to an internal address is not
 * detected here.
 *
 * @param url - The URL string to validate.
 * @throws Error when the URL cannot be parsed or points at a disallowed target.
 */
function validatePublicUrl(url: string): void {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    throw new Error(`Invalid URL: ${url}`);
  }

  if (parsed.protocol !== 'https:' && parsed.protocol !== 'http:') {
    throw new Error(`URL must use http: or https: protocol, got ${parsed.protocol}`);
  }

  // "localhost." and "api.internal." (trailing root dot) name the same hosts as
  // the bare forms, so strip trailing dots before any name comparison.
  const hostname = parsed.hostname.toLowerCase().replace(/\.+$/, '');

  // Block localhost variants
  if (hostname === 'localhost' || hostname === '[::1]' || hostname.endsWith('.localhost')) {
    throw new Error(`URL must not point to localhost: ${hostname}`);
  }

  // Block private/internal IPv4 ranges. The URL parser has already normalised
  // the host, so a dotted-quad match is sufficient here.
  const ipv4Match = hostname.match(/^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/);
  if (ipv4Match) {
    const a = Number(ipv4Match[1]);
    const b = Number(ipv4Match[2]);
    if (
      a === 0 || // 0.0.0.0/8
      a === 10 || // 10.0.0.0/8 private
      a === 127 || // 127.0.0.0/8 loopback
      (a === 100 && b >= 64 && b <= 127) || // 100.64.0.0/10 carrier-grade NAT
      (a === 169 && b === 254) || // 169.254.0.0/16 link-local
      (a === 172 && b >= 16 && b <= 31) || // 172.16.0.0/12 private
      (a === 192 && b === 168) || // 192.168.0.0/16 private
      a >= 224 // 224.0.0.0/3 multicast, reserved and broadcast
    ) {
      throw new Error(`URL must not point to a private/internal IP address: ${hostname}`);
    }
  }

  // Block IPv6 private ranges (bracketed notation in URLs)
  if (hostname.startsWith('[')) {
    const ipv6 = hostname.slice(1, -1);
    // First 16-bit group; an address starting with "::" has an empty first group (0).
    const firstHextet = parseInt(ipv6.split(':')[0] || '0', 16);
    if (
      ipv6 === '::1' || // loopback
      ipv6 === '::' || // unspecified
      (firstHextet & 0xffc0) === 0xfe80 || // link-local, the whole fe80::/10 range
      ipv6.startsWith('fc') || // unique local (fc00::/7)
      ipv6.startsWith('fd') || // unique local (fc00::/7)
      ipv6.startsWith('::ffff:') // IPv4-mapped IPv6
    ) {
      throw new Error(`URL must not point to a private/internal IPv6 address: ${hostname}`);
    }
  }

  // Block common internal hostnames
  if (
    hostname.endsWith('.internal') ||
    hostname.endsWith('.local') ||
    hostname.endsWith('.corp') ||
    hostname.endsWith('.lan')
  ) {
    throw new Error(`URL must not point to an internal hostname: ${hostname}`);
  }
}
|
|
118
|
+
|
|
119
|
+
export default class WebsiteConnector extends ConnectorRuntime {
|
|
120
|
+
/** Static connector manifest: identity, auth requirements and the single `pages` feed. */
readonly definition: ConnectorDefinition = {
  key: 'website',
  name: 'Website',
  description:
    'Scrapes web pages with JS rendering via Playwright. Supports sitemap.xml for auto-discovery. Converts to markdown sections and tracks changes.',
  version: '1.0.0',
  faviconDomain: 'google.com',
  // No credentials are required.
  authSchema: {
    methods: [{ type: 'none' }],
  },
  feeds: {
    pages: {
      key: 'pages',
      name: 'Web Pages',
      description: 'Scrape and parse web pages into structured content.',
      // User-facing feed options.
      configSchema: {
        type: 'object',
        properties: {
          sitemap_url: {
            type: 'string',
            format: 'uri',
            description:
              'URL to sitemap.xml. All URLs from the sitemap will be scraped. Takes priority over urls.',
          },
          urls: {
            type: 'array',
            items: { type: 'string', format: 'uri' },
            description: 'Explicit list of URLs to scrape. Ignored if sitemap_url is set.',
          },
          max_pages: {
            type: 'integer',
            minimum: 1,
            maximum: 100,
            default: 20,
            description: 'Maximum number of pages to scrape per sync (default: 20)',
          },
          parse_sections: {
            type: 'boolean',
            default: true,
            description:
              'Split page into sections by headings (h1-h3). If false, one event per page.',
          },
          wait_for_selector: {
            type: 'string',
            description:
              'CSS selector to wait for before extracting content (e.g. "main", "#content"). Useful for SPAs.',
          },
        },
      },
      // Event shapes this feed emits.
      eventKinds: {
        page: {
          description: 'Full page content',
          metadataSchema: {
            type: 'object',
            properties: {
              content_hash: { type: 'string' },
              meta_title: { type: 'string' },
              meta_description: { type: 'string' },
              og_image: { type: 'string' },
              word_count: { type: 'number' },
            },
          },
        },
        section: {
          description: 'A section of a page (split by headings)',
          metadataSchema: {
            type: 'object',
            properties: {
              heading: { type: 'string' },
              heading_level: { type: 'number' },
              anchor: { type: 'string' },
              section_index: { type: 'number' },
              page_url: { type: 'string' },
              content_hash: { type: 'string' },
            },
          },
        },
      },
    },
  },
};
|
|
201
|
+
|
|
202
|
+
/** HTML-to-Markdown converter, configured in the constructor. */
private turndown: TurndownService;
/** Navigation timeout in milliseconds. */
private readonly PAGE_TIMEOUT = 30000;
/** Pause between consecutive page loads, in milliseconds. */
private readonly PAGE_DELAY_MS = 2000;

/** Sets up the Markdown converter with ATX headings and fenced code blocks. */
constructor() {
  super();
  const markdownOptions = { headingStyle: 'atx', codeBlockStyle: 'fenced' } as const;
  this.turndown = new TurndownService(markdownOptions);
}
|
|
213
|
+
|
|
214
|
+
/**
 * Scrapes the configured pages and emits events for content whose hash differs
 * from the previous checkpoint.
 *
 * URLs come from `sitemap_url` when set, otherwise from `urls`; at most
 * `max_pages` of them are visited. A failure on one page is logged and does not
 * abort the run.
 *
 * Differences from a naive loop:
 * - The inter-page delay is applied after every visited URL, including pages
 *   that turn out to be unchanged or low-signal.
 * - When a page is unchanged, skipped or fails, the hashes previously stored
 *   for it (page and sections) are carried into the new checkpoint, so a later
 *   sync does not re-emit content that never changed.
 *
 * @param ctx - Sync context providing `config`, the previous `checkpoint` and an optional `log`.
 * @returns The emitted events, the new checkpoint (`hashes`, `last_sync_at`) and run metadata.
 */
async sync(ctx: SyncContext): Promise<SyncResult> {
  const sitemapUrl = ctx.config.sitemap_url as string | undefined;
  const explicitUrls = ctx.config.urls as string[] | undefined;
  const maxPages = (ctx.config.max_pages as number) ?? 20;
  const parseSections = (ctx.config.parse_sections as boolean) ?? true;
  const waitForSelector = ctx.config.wait_for_selector as string | undefined;
  const previousHashes = (ctx.checkpoint?.hashes as Record<string, string>) ?? {};

  // Resolve URLs from sitemap or explicit list
  let urls: string[];
  if (sitemapUrl) {
    validatePublicUrl(sitemapUrl);
    urls = await this.fetchSitemap(sitemapUrl);
    ctx.log?.(`Sitemap: found ${urls.length} URLs`);
  } else if (explicitUrls?.length) {
    urls = explicitUrls;
  } else {
    return {
      events: [],
      checkpoint: ctx.checkpoint,
      metadata: { error: 'No sitemap_url or urls configured' },
    };
  }

  urls = urls.slice(0, maxPages);

  // Launch browser
  const { browser } = await launchBrowser({} as any, { stealth: false });
  const events: EventEnvelope[] = [];
  const newHashes: Record<string, string> = {};

  try {
    for (let i = 0; i < urls.length; i++) {
      const url = urls[i];
      // True only when the page was re-hashed in full (page + current sections).
      let fullyRecorded = false;

      try {
        validatePublicUrl(url);
        const page = (await browser.newPage()) as Page;
        try {
          fullyRecorded = await this.scrapeSinglePage(page, url, ctx, {
            parseSections,
            waitForSelector,
            previousHashes,
            newHashes,
            events,
          });
        } finally {
          await page.close();
        }
      } catch (err) {
        ctx.log?.(`Failed to scrape ${url}: ${err instanceof Error ? err.message : String(err)}`);
      }

      if (!fullyRecorded) {
        this.carryForwardHashes(url, previousHashes, newHashes);
      }

      // Pause between pages no matter how this page ended.
      if (i < urls.length - 1) {
        await this.sleep(this.PAGE_DELAY_MS);
      }
    }
  } finally {
    await browser.close();
  }

  return {
    events,
    checkpoint: { hashes: newHashes, last_sync_at: new Date().toISOString() },
    metadata: { pages_scraped: urls.length, events_created: events.length },
  };
}

/**
 * Copies the hashes stored for `url` (the page key and every `url#anchor`
 * section key) from `previous` into `next`, without overwriting keys already
 * written during this run.
 */
private carryForwardHashes(
  url: string,
  previous: Record<string, string>,
  next: Record<string, string>,
): void {
  const sectionPrefix = `${url}#`;
  for (const [key, value] of Object.entries(previous)) {
    const belongsToPage = key === url || key.startsWith(sectionPrefix);
    if (belongsToPage && !(key in next)) {
      next[key] = value;
    }
  }
}

/**
 * Loads one page, converts it to markdown and appends events for changed content.
 *
 * @returns `true` when the page differed from its stored hash and was processed
 *   in full; `false` when it was low-signal or unchanged, in which case the
 *   caller should carry the previous hashes forward.
 */
private async scrapeSinglePage(
  page: Page,
  url: string,
  ctx: SyncContext,
  run: {
    parseSections: boolean;
    waitForSelector?: string;
    previousHashes: Record<string, string>;
    newHashes: Record<string, string>;
    events: EventEnvelope[];
  },
): Promise<boolean> {
  const { previousHashes, newHashes, events, waitForSelector } = run;

  await page.goto(url, { waitUntil: 'networkidle', timeout: this.PAGE_TIMEOUT });
  // The requested URL was validated by the caller, but the page may have
  // redirected; reject the final location if it is not a public address.
  validatePublicUrl(page.url());
  await this.dismissOverlays(page);

  if (waitForSelector) {
    await page.waitForSelector(waitForSelector, { timeout: 10000 }).catch(() => {
      ctx.log?.(`Selector "${waitForSelector}" not found on ${url}, continuing anyway`);
    });
  }

  await this.removeHiddenElements(page);
  const html = await page.content();
  const finalUrl = page.url();
  const meta = this.extractMeta(html);
  const cleanHtml = this.stripNonContent(html);
  const markdown = this.deduplicateMarkdown(this.turndown.turndown(cleanHtml).trim());
  if (!markdown || shouldSkipCookieBannerText(`${meta.title ?? ''}\n${markdown}`)) {
    ctx.log?.(`Skipping low-signal page content for ${finalUrl}`);
    return false;
  }

  const contentHash = this.hash(markdown);
  newHashes[url] = contentHash;
  if (previousHashes[url] === contentHash) {
    return false;
  }

  if (!run.parseSections) {
    events.push({
      origin_id: `web_page_${this.hash(url)}`,
      title: meta.title || finalUrl,
      payload_text: markdown,
      source_url: finalUrl,
      occurred_at: new Date(),
      origin_type: 'page',
      semantic_type: 'page',
      score: 50,
      metadata: {
        content_hash: contentHash,
        meta_title: meta.title,
        meta_description: meta.description,
        og_image: meta.ogImage,
        word_count: markdown.split(/\s+/).length,
      },
    });
    return true;
  }

  const sections = this.parseSections(markdown);
  for (let si = 0; si < sections.length; si++) {
    const section = sections[si];
    const sectionHash = this.hash(section.content);
    const sectionKey = `${url}#${section.anchor}`;
    const unchanged = previousHashes[sectionKey] === sectionHash;

    // The hash is recorded even for sections that produce no event.
    newHashes[sectionKey] = sectionHash;
    if (unchanged || shouldSkipCookieBannerText(`${section.heading}\n${section.content}`)) {
      continue;
    }

    const parentKey = section.parentAnchor ? `${url}#${section.parentAnchor}` : undefined;
    events.push({
      origin_id: `web_section_${this.hash(sectionKey)}`,
      title: section.heading,
      payload_text: section.content,
      source_url: `${finalUrl}#${section.anchor}`,
      occurred_at: new Date(),
      origin_type: 'section',
      semantic_type: 'section',
      score: 50,
      origin_parent_id: parentKey ? `web_section_${this.hash(parentKey)}` : undefined,
      metadata: {
        heading: section.heading,
        heading_level: section.level,
        anchor: section.anchor,
        section_index: si,
        page_url: finalUrl,
        content_hash: sectionHash,
      },
    });
  }
  return true;
}
|
|
358
|
+
|
|
359
|
+
/** Always reports failure: this connector defines no actions. */
async execute(_ctx: ActionContext): Promise<ActionResult> {
  const unsupported: ActionResult = { success: false, error: 'Actions not supported' };
  return unsupported;
}
|
|
362
|
+
|
|
363
|
+
/**
 * Best-effort removal of consent banners and modal overlays.
 *
 * First clicks the first visible button whose accessible name exactly matches
 * one of a fixed list of labels (case-insensitive). Then deletes DOM elements
 * that match cookie/consent/dialog selectors and whose text mentions cookies,
 * consent or privacy. Every failure is ignored.
 */
private async dismissOverlays(page: Page): Promise<void> {
  const buttonNames = ['Accept', 'Accept all', 'I agree', 'Allow all', 'Got it', 'Continue', 'Close'];

  for (const buttonName of buttonNames) {
    try {
      const exactName = new RegExp(`^${buttonName}$`, 'i');
      const candidate = page.getByRole('button', { name: exactName }).first();
      const visible = await candidate.isVisible({ timeout: 500 });
      if (!visible) {
        continue;
      }
      await candidate.click({ timeout: 1000 });
      // One successful click is enough.
      break;
    } catch {
      // Keep trying other labels/selectors.
    }
  }

  await page
    .evaluate(() => {
      const overlaySelector = [
        '[id*="cookie" i]',
        '[class*="cookie" i]',
        '[id*="consent" i]',
        '[class*="consent" i]',
        '[id*="onetrust" i]',
        '[class*="onetrust" i]',
        '[aria-modal="true"]',
        '[role="dialog"]',
      ].join(',');

      document.querySelectorAll(overlaySelector).forEach((node) => {
        const visibleText = (node as HTMLElement).innerText || '';
        if (/cookie|consent|privacy/i.test(visibleText)) {
          node.remove();
        }
      });
    })
    .catch(() => {
      // DOM cleanup is best-effort.
    });
}
|
|
410
|
+
|
|
411
|
+
/**
 * Deletes every element under `<body>` whose computed style is `display: none`
 * or `visibility: hidden`. Elements that are merely zero-sized are not detected.
 *
 * Intended for responsive duplicates, where the same content is rendered once
 * for desktop and once for mobile (e.g. Tailwind `hidden md:block` / `md:hidden`).
 * Failures are ignored and the DOM is left as it was.
 */
private async removeHiddenElements(page: Page): Promise<void> {
  await page
    .evaluate(() => {
      // Tags that can't meaningfully contain scraped content are never inspected.
      const ignoredTags = new Set(['SCRIPT', 'STYLE', 'LINK', 'META', 'BR', 'HR']);
      const hidden: Element[] = [];
      const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_ELEMENT);

      for (let node = walker.nextNode(); node !== null; node = walker.nextNode()) {
        const element = node as HTMLElement;
        if (ignoredTags.has(element.tagName)) continue;
        const { display, visibility } = getComputedStyle(element);
        if (display === 'none' || visibility === 'hidden') {
          hidden.push(element);
        }
      }

      // Remove only after the walk so the traversal is not disturbed.
      hidden.forEach((element) => element.remove());
    })
    .catch(() => {
      // Best-effort — continue with the full DOM if this fails.
    });
}
|
|
438
|
+
|
|
439
|
+
/**
 * Drops repeated long lines from markdown, keeping the first occurrence.
 *
 * Lines are compared after trimming. Only lines of 80 or more trimmed
 * characters are deduplicated; shorter lines (blank lines, list markers and
 * the like) may repeat freely.
 */
private deduplicateMarkdown(markdown: string): string {
  const alreadyEmitted = new Set<string>();
  return markdown
    .split('\n')
    .filter((line) => {
      const key = line.trim();
      if (key.length < 80) return true;
      if (alreadyEmitted.has(key)) return false;
      alreadyEmitted.add(key);
      return true;
    })
    .join('\n');
}
|
|
459
|
+
|
|
460
|
+
/**
 * Downloads a sitemap and returns the page URLs it lists, without duplicates.
 *
 * A `<urlset>` document yields its `<loc>` values, minus obvious non-HTML
 * resources and fragment URLs. A `<sitemapindex>` document (or one that yields
 * no page URLs) is treated as an index: up to five child sitemaps are validated
 * and fetched recursively.
 *
 * @param sitemapUrl - Sitemap location; the caller must already have validated it.
 * @param depth - Nesting level of this call; callers outside the recursion omit it.
 * @returns Page URLs in document order.
 * @throws Error on a non-2xx response, on fetch timeout, or when a child
 *   sitemap URL fails `validatePublicUrl`.
 */
private async fetchSitemap(sitemapUrl: string, depth = 0): Promise<string[]> {
  // Bounds recursion so an index that references itself cannot loop forever.
  const MAX_INDEX_DEPTH = 2;
  const MAX_CHILD_SITEMAPS = 5;
  const XML_ENTITIES: Record<string, string> = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
  };

  const response = await fetch(sitemapUrl, {
    headers: { 'User-Agent': 'Mozilla/5.0 (compatible; LobuBot/1.0)' },
    // Without a deadline, an unresponsive server would stall the whole sync.
    signal: AbortSignal.timeout(this.PAGE_TIMEOUT),
  });

  if (!response.ok) {
    throw new Error(`Sitemap fetch failed: HTTP ${response.status}`);
  }

  const xml = await response.text();

  // <loc> text is XML: unwrap CDATA, otherwise decode the predefined entities
  // (a literal "&" in a URL must be written as "&amp;").
  const readLoc = (raw: string): string => {
    const text = raw.trim();
    const cdata = /^<!\[CDATA\[([\s\S]*?)\]\]>$/.exec(text);
    if (cdata) return cdata[1].trim();
    return text.replace(/&(amp|lt|gt|quot|apos);/g, (_entity, name: string) => XML_ENTITIES[name]);
  };

  const isIndex = /<sitemapindex[\s>]/i.test(xml);
  let match: RegExpExecArray | null;

  if (!isIndex) {
    const pages: string[] = [];
    const locPattern = /<loc>\s*([\s\S]*?)\s*<\/loc>/gi;
    while ((match = locPattern.exec(xml)) !== null) {
      const url = readLoc(match[1]);
      // Skip non-HTML resources and anchor fragment URLs
      if (
        url &&
        !/\.(pdf|jpg|jpeg|png|gif|svg|css|js|xml|json|zip|gz)$/i.test(url) &&
        !url.includes('#')
      ) {
        pages.push(url);
      }
    }
    if (pages.length > 0) {
      return Array.from(new Set(pages));
    }
  }

  // Handle sitemap index (sitemaps linking to other sitemaps)
  if (depth >= MAX_INDEX_DEPTH) {
    return [];
  }

  const childSitemaps: string[] = [];
  const sitemapPattern = /<sitemap>\s*<loc>\s*([\s\S]*?)\s*<\/loc>/gi;
  while ((match = sitemapPattern.exec(xml)) !== null) {
    childSitemaps.push(readLoc(match[1]));
  }

  const collected: string[] = [];
  for (const childUrl of childSitemaps.slice(0, MAX_CHILD_SITEMAPS)) {
    validatePublicUrl(childUrl);
    const childUrls = await this.fetchSitemap(childUrl, depth + 1);
    collected.push(...childUrls);
  }

  return Array.from(new Set(collected));
}
|
|
503
|
+
|
|
504
|
+
/**
 * Extract the page title, meta description and Open Graph image from raw HTML.
 *
 * Meta tags are read with quote-aware attribute parsing, so a value such as
 * `content="It's great"` is returned whole instead of being cut at the apostrophe,
 * and attribute order inside the tag does not matter.
 *
 * @param html - Raw HTML of the page.
 * @returns Trimmed values; a field is `undefined` when the page does not provide it.
 */
private extractMeta(html: string): { title?: string; description?: string; ogImage?: string } {
  const titleMatch = html.match(/<title[^>]*>(.*?)<\/title>/is);

  // A <meta> tag body: plain characters or complete quoted strings, so a `>` or
  // the other quote character inside an attribute value does not end the match.
  const tagPattern = /<meta\b((?:[^>"']|"[^"]*"|'[^']*')*)>/gi;
  const attrPattern = /([^\s"'=<>\/]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;

  // Returns the non-empty `content` of the first <meta> whose `key` attribute
  // equals `wanted` (compared case-insensitively).
  const findMetaContent = (key: 'name' | 'property', wanted: string): string | undefined => {
    tagPattern.lastIndex = 0;
    let tag: RegExpExecArray | null;
    while ((tag = tagPattern.exec(html)) !== null) {
      const attrs = new Map<string, string>();
      attrPattern.lastIndex = 0;
      let attr: RegExpExecArray | null;
      while ((attr = attrPattern.exec(tag[1])) !== null) {
        const attrName = attr[1].toLowerCase();
        // First occurrence wins, matching how browsers treat duplicate attributes.
        if (!attrs.has(attrName)) {
          attrs.set(attrName, attr[2] ?? attr[3] ?? '');
        }
      }
      const content = attrs.get('content');
      if (attrs.get(key)?.toLowerCase() === wanted && content) {
        return content;
      }
    }
    return undefined;
  };

  return {
    title: titleMatch?.[1]?.trim(),
    description: findMetaContent('name', 'description')?.trim(),
    ogImage: findMetaContent('property', 'og:image')?.trim(),
  };
}
|
|
519
|
+
|
|
520
|
+
/**
 * Remove markup that carries no readable page content.
 *
 * Paired elements in the list below are dropped together with everything between
 * their opening and closing tags; `<link>`, `<meta>` and `<input>` tags are then
 * removed on their own.
 *
 * @param html - Raw HTML.
 * @returns The HTML with the non-content elements removed.
 */
private stripNonContent(html: string): string {
  const removableElements = [
    'script',
    'style',
    'noscript',
    'nav',
    'header',
    'footer',
    'aside',
    'iframe',
    'svg',
    'canvas',
    'video',
    'audio',
    'menu',
    'dialog',
    'embed',
    'object',
    'applet',
  ];

  // Elements are removed one tag name at a time, in list order.
  const withoutBlocks = removableElements.reduce(
    (markup, name) =>
      markup.replace(new RegExp(`<${name}\\b[^>]*>[\\s\\S]*?<\\/${name}>`, 'gi'), ''),
    html
  );

  // Void / self-closing elements that only add noise.
  return withoutBlocks.replace(/<(link|meta|input)\b[^>]*\/?>/gi, '');
}
|
|
548
|
+
|
|
549
|
+
/**
 * Split markdown into sections at level 1-3 ATX headings (`#`, `##`, `###`).
 *
 * Text before the first heading is emitted under the implicit heading
 * "Introduction" (level 1). A heading whose body is empty produces no section and
 * therefore has no anchor. Anchors are slugs of the heading text; repeated slugs
 * get a numeric suffix (`slug`, `slug-1`, `slug-2`, ...).
 *
 * `parentAnchor` is the anchor of the closest enclosing heading with a lower level,
 * or `undefined` when there is none or when that heading produced no section.
 *
 * @param markdown - Markdown text to split.
 * @returns Sections in document order.
 */
private parseSections(markdown: string): (PageSection & { parentAnchor?: string })[] {
  type StackEntry = { heading: string; level: number; anchor?: string };

  const lines = markdown.split('\n');
  const sections: (PageSection & { parentAnchor?: string })[] = [];
  let currentHeading = 'Introduction';
  let currentLevel = 1;
  let currentLines: string[] = [];

  // Per-slug counters so anchors stay stable when unrelated sections change.
  // Only incremented when a section is emitted, not for heading stack entries.
  const slugCounts = new Map<string, number>();

  const makeAnchor = (heading: string): string => {
    const slug = this.slugify(heading);
    const count = slugCounts.get(slug) ?? 0;
    slugCounts.set(slug, count + 1);
    return count === 0 ? slug : `${slug}-${count}`;
  };

  // Track the chain of open headings for hierarchy. While a section is being
  // collected, the top entry is that section's own heading and the entry beneath
  // it is its parent. The stack is empty only for the implicit "Introduction".
  // Anchors are assigned lazily when the heading's section is emitted.
  const headingStack: StackEntry[] = [];

  const emitSection = (heading: string, level: number, content: string) => {
    const anchor = makeAnchor(heading);
    const self = headingStack[headingStack.length - 1];
    const parent = headingStack[headingStack.length - 2];
    // Record the anchor on this heading's own entry (never on a same-named
    // ancestor) so that child sections can reference it.
    if (self) self.anchor = anchor;
    sections.push({ heading, level, content, anchor, parentAnchor: parent?.anchor });
  };

  for (const line of lines) {
    const headingMatch = line.match(/^(#{1,3})\s+(.+)$/);
    if (headingMatch) {
      // A new heading closes the section collected so far.
      const content = currentLines.join('\n').trim();
      if (content.length > 0) {
        emitSection(currentHeading, currentLevel, content);
      }

      const newLevel = headingMatch[1].length;
      const newHeading = headingMatch[2].trim();

      // Pop stack until we find a parent with a lower level
      while (headingStack.length > 0 && headingStack[headingStack.length - 1].level >= newLevel) {
        headingStack.pop();
      }
      headingStack.push({ heading: newHeading, level: newLevel });

      currentHeading = newHeading;
      currentLevel = newLevel;
      currentLines = [];
    } else {
      currentLines.push(line);
    }
  }

  // Flush the trailing section.
  const content = currentLines.join('\n').trim();
  if (content.length > 0) {
    emitSection(currentHeading, currentLevel, content);
  }

  return sections;
}
|
|
613
|
+
|
|
614
|
+
/**
 * Turn heading text into a URL-friendly slug.
 *
 * The text is lower-cased, every run of characters outside `a-z0-9` becomes a
 * single hyphen, a leading or trailing hyphen is removed, and the result is
 * limited to 60 characters. A hyphen left at the end by the truncation is removed
 * as well, so the slug never ends with `-`.
 *
 * @param text - Heading text.
 * @returns The slug; empty when the text contains no ASCII letters or digits.
 */
private slugify(text: string): string {
  return text
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '')
    .substring(0, 60)
    // The cut at 60 characters may land right after a separator.
    .replace(/-$/, '');
}
|
|
621
|
+
|
|
622
|
+
/**
 * Compute a short content fingerprint.
 *
 * @param text - Text to fingerprint.
 * @returns The first 16 hexadecimal characters of the SHA-256 digest of `text`.
 */
private hash(text: string): string {
  const hasher = createHash('sha256');
  hasher.update(text);
  const hexDigest = hasher.digest('hex');
  return hexDigest.substring(0, 16);
}
|
|
625
|
+
|
|
626
|
+
/**
 * Wait for a given amount of time.
 *
 * @param ms - Delay in milliseconds, passed to `setTimeout`.
 * @returns A promise that resolves once the timer fires.
 */
private sleep(ms: number): Promise<void> {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
629
|
+
}
|