owletto 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json ADDED
@@ -0,0 +1,58 @@
1
+ {
2
+ "name": "owletto",
3
+ "version": "1.0.0",
4
+ "description": "Owletto SDK - build pluggable connectors for the Owletto integration platform",
5
+ "type": "module",
6
+ "main": "dist/index.js",
7
+ "types": "dist/index.d.ts",
8
+ "exports": {
9
+ ".": {
10
+ "import": "./src/index.ts",
11
+ "types": "./src/index.ts"
12
+ }
13
+ },
14
+ "bin": {
15
+ "owletto-sdk": "./src/cli.ts"
16
+ },
17
+ "files": [
18
+ "dist",
19
+ "src"
20
+ ],
21
+ "scripts": {
22
+ "build": "tsc",
23
+ "typecheck": "tsc --noEmit",
24
+ "clean": "rm -rf dist"
25
+ },
26
+ "dependencies": {
27
+ "@sinclair/typebox": "^0.34.33",
28
+ "esbuild": "^0.27.0",
29
+ "ky": "^1.14.0",
30
+ "p-retry": "^7.1.0",
31
+ "pino": "^10.1.0"
32
+ },
33
+ "peerDependencies": {
34
+ "playwright": "^1.40.0"
35
+ },
36
+ "peerDependenciesMeta": {
37
+ "playwright": {
38
+ "optional": true
39
+ }
40
+ },
41
+ "devDependencies": {
42
+ "@types/node": "^20.10.0",
43
+ "typescript": "^5.3.3"
44
+ },
45
+ "engines": {
46
+ "node": ">=18"
47
+ },
48
+ "license": "MIT",
49
+ "publishConfig": {
50
+ "access": "public"
51
+ },
52
+ "homepage": "https://github.com/lobu-ai/owletto",
53
+ "repository": {
54
+ "type": "git",
55
+ "url": "https://github.com/lobu-ai/owletto.git",
56
+ "directory": "packages/sdk"
57
+ }
58
+ }
@@ -0,0 +1,185 @@
1
+ /**
2
+ * API Paginated Crawler Base Class
3
+ *
4
+ * Extends PaginatedCrawler for HTTP/REST API-based crawlers.
5
+ * Provides HTTP client setup and common API patterns.
6
+ */
7
+
8
+ import type { KyInstance } from 'ky';
9
+ import { HTTPError } from 'ky';
10
+ import { RateLimitError } from './base.js';
11
+ import { createAuthenticatedClient, createHttpClient, httpClient } from './http.js';
12
+ import { sdkLogger } from './logger.js';
13
+ import type { PageFetchResult, PaginatedCheckpoint } from './paginated.js';
14
+ import { PaginatedCrawler } from './paginated.js';
15
+ import { withHttpRetry } from './retry.js';
16
+ import type { CrawlerOptions, Env, SessionState } from './types.js';
17
+
18
/**
 * Session state carried by crawlers that authenticate with a token or an API key.
 *
 * Builds on the generic SessionState; every field declared here is optional.
 */
export interface ApiSessionState extends SessionState {
  /** Access token (OAuth or plain API token) */
  access_token?: string;
  /** Refresh token, for flows that renew the access token */
  refresh_token?: string;
  /** Scheme placed in front of the access token, e.g. 'Bearer' */
  token_type?: string;
  /** Moment the token expires, as an ISO string */
  expires_at?: string;
  /** Extra headers to send with every request */
  headers?: { [name: string]: string };
  /** API key, used when no OAuth-style token is available */
  api_key?: string;
}
35
+
36
/**
 * Convert a `Retry-After` header value into a delay in milliseconds.
 *
 * The header may carry either a number of seconds or an HTTP date. Returns
 * `undefined` when the header is absent, unparseable, non-finite, or does not
 * describe a strictly positive delay, so callers never receive a value that is
 * unusable as a wait time.
 */
function parseRetryAfterMs(headerValue: string | null): number | undefined {
  if (!headerValue) return undefined;

  const seconds = Number(headerValue);
  // Numeric form is seconds; anything else is tried as a date relative to now.
  const delayMs = Number.isNaN(seconds)
    ? Date.parse(headerValue) - Date.now()
    : seconds * 1000;

  // isFinite rejects both NaN (unparseable date) and +/-Infinity ("Infinity" header).
  return Number.isFinite(delayMs) && delayMs > 0 ? delayMs : undefined;
}

/**
 * Base class for API-based crawlers with HTTP pagination.
 *
 * Subclasses describe how to build the URL for a page (`buildPageUrl`) and how
 * to turn the decoded response into items (`parseResponse`); the default
 * `fetchPage` performs the GET with retry and maps HTTP failures to errors.
 *
 * @typeParam TItem - Item type produced by `parseResponse`
 * @typeParam TResponse - Shape of the decoded JSON response body
 * @typeParam TCheckpoint - Checkpoint type forwarded to PaginatedCrawler
 */
export abstract class ApiPaginatedCrawler<
  TItem,
  TResponse = unknown,
  TCheckpoint extends PaginatedCheckpoint = PaginatedCheckpoint,
> extends PaginatedCrawler<TItem, TCheckpoint> {
  readonly apiType = 'api' as const;

  /**
   * Session state for this crawl session (null until `setSessionState` is called)
   */
  protected _sessionState: ApiSessionState | null = null;

  /**
   * Store the session state for this crawl session.
   * `null`/`undefined` clears it. Only the presence of credentials is logged,
   * never their values.
   */
  protected setSessionState(sessionState: SessionState | null | undefined): void {
    this._sessionState = (sessionState as ApiSessionState | null | undefined) ?? null;
    if (this._sessionState) {
      sdkLogger.debug(
        { hasToken: !!this._sessionState.access_token, hasApiKey: !!this._sessionState.api_key },
        `[${this.type}] Session state set`
      );
    }
  }

  /**
   * Get current session state
   */
  protected getSessionState(): ApiSessionState | null {
    return this._sessionState;
  }

  /**
   * ABSTRACT: Build URL for fetching a specific page
   *
   * @param cursor - Pagination cursor, or null for the first page
   * @param options - Crawler options for this run
   */
  protected abstract buildPageUrl(cursor: string | null, options: CrawlerOptions): string;

  /**
   * ABSTRACT: Parse API response into items and next token
   */
  protected abstract parseResponse(
    response: TResponse,
    options: CrawlerOptions
  ): PageFetchResult<TItem>;

  /**
   * Get configured HTTP client for this crawler.
   *
   * The default returns the shared `httpClient` and does not consult the
   * session state; override it (for example with
   * `createClientFromSessionState`) when requests must be authenticated.
   */
  protected getHttpClient(_env: Env): KyInstance {
    return httpClient;
  }

  /**
   * Create HTTP client with Bearer token authentication
   */
  protected createBearerClient(
    token: string,
    additionalHeaders?: Record<string, string>
  ): KyInstance {
    return createAuthenticatedClient(`Bearer ${token}`, additionalHeaders);
  }

  /**
   * Create HTTP client with custom headers
   */
  protected createClientWithHeaders(headers: Record<string, string>): KyInstance {
    return createHttpClient({ headers });
  }

  /**
   * Create HTTP client from session state.
   *
   * Resolution order:
   * 1. no session state: client with `additionalHeaders`, or the shared client
   * 2. `access_token`: authenticated as `<token_type | 'Bearer'> <access_token>`
   * 3. `api_key`: the key is handed to `createAuthenticatedClient` verbatim
   * 4. otherwise: client with the merged headers, or the shared client if none
   *
   * `additionalHeaders` override same-named headers from the session state.
   */
  protected createClientFromSessionState(additionalHeaders?: Record<string, string>): KyInstance {
    if (!this._sessionState) {
      return additionalHeaders ? createHttpClient({ headers: additionalHeaders }) : httpClient;
    }

    const headers: Record<string, string> = {
      ...this._sessionState.headers,
      ...additionalHeaders,
    };

    if (this._sessionState.access_token) {
      const tokenType = this._sessionState.token_type || 'Bearer';
      return createAuthenticatedClient(`${tokenType} ${this._sessionState.access_token}`, headers);
    }

    if (this._sessionState.api_key) {
      return createAuthenticatedClient(this._sessionState.api_key, headers);
    }

    return Object.keys(headers).length > 0 ? createHttpClient({ headers }) : httpClient;
  }

  /**
   * Handle HTTP errors with platform-specific messages.
   * Delegates to `handleHTTPError` with the response status; never returns.
   */
  protected handleHttpError(error: HTTPError, url: string): never {
    this.handleHTTPError(error.response.status, url);
  }

  /**
   * Default fetchPage implementation using HTTP client with retry.
   *
   * @throws RateLimitError when the final response is a 429; carries the
   *   `Retry-After` delay in milliseconds when the header yields a usable one
   * @throws Error for any other HTTP error status (via `handleHttpError`);
   *   non-HTTP failures are logged and rethrown unchanged
   */
  protected async fetchPage(
    cursor: string | null,
    options: CrawlerOptions,
    env: Env
  ): Promise<PageFetchResult<TItem>> {
    const client = this.getHttpClient(env);
    const url = this.buildPageUrl(cursor, options);

    try {
      const response = await withHttpRetry(async () => client.get(url).json<TResponse>(), {
        operation: `${this.type} API fetch`,
        context: { url, cursor },
      });

      return this.parseResponse(response, options);
    } catch (error) {
      if (error instanceof HTTPError) {
        if (error.response.status === 429) {
          throw new RateLimitError(
            `${this.displayName} rate limit exceeded. Please wait before retrying.`,
            parseRetryAfterMs(error.response.headers.get('retry-after'))
          );
        }
        this.handleHttpError(error, url);
      }
      sdkLogger.error({ error, url }, `[${this.type}] API fetch failed`);
      throw error;
    }
  }
}
package/src/base.ts ADDED
@@ -0,0 +1,173 @@
1
+ import type { TObject } from '@sinclair/typebox';
2
+ import { Value } from '@sinclair/typebox/value';
3
+ import { sdkLogger } from './logger.js';
4
+ import type {
5
+ Checkpoint,
6
+ Content,
7
+ CrawlerAuthSchema,
8
+ CrawlerOptions,
9
+ CrawlResult,
10
+ Env,
11
+ ICrawler,
12
+ ParentSourceDefinition,
13
+ ScoringConfig,
14
+ SessionState,
15
+ } from './types.js';
16
+
17
/**
 * Error raised when a platform signals that requests are being rate limited.
 *
 * `retryAfterMs`, when present, is the suggested wait in milliseconds before
 * trying again.
 */
export class RateLimitError extends Error {
  readonly retryAfterMs?: number;

  constructor(message: string, retryAfterMs?: number) {
    super(message);
    const errorName = 'RateLimitError';
    this.name = errorName;
    this.retryAfterMs = retryAfterMs;
  }
}
26
+
27
/**
 * Base crawler implementation with common functionality.
 * All platform-specific crawlers should extend this class.
 *
 * Subclasses supply the identifying metadata, the options schema, scoring
 * defaults and the `pull` implementation; this class contributes option
 * validation, date/checkpoint helpers, de-duplication and HTTP error mapping.
 */
export abstract class BaseCrawler implements ICrawler {
  abstract readonly type: string;
  abstract readonly displayName: string;
  abstract readonly apiType: 'api' | 'browser';
  abstract readonly crawlerType: 'entity' | 'search';
  abstract readonly optionsSchema: TObject;
  abstract readonly defaultScoringConfig: ScoringConfig;
  abstract readonly defaultScoringFormula: string;

  /** Authentication methods offered by the crawler; defaults to "none" only */
  readonly authSchema: CrawlerAuthSchema = { methods: [{ type: 'none' }] };

  abstract pull(
    options: CrawlerOptions,
    checkpoint: Checkpoint | null,
    env: Env,
    sessionState?: SessionState | null,
    updateCheckpointFn?: (checkpoint: Checkpoint) => Promise<void>
  ): Promise<CrawlResult>;

  abstract urlFromOptions(options: CrawlerOptions): string;

  abstract displayLabelFromOptions(options: CrawlerOptions): string;

  /** Returns an error message, or null when the options are acceptable */
  abstract validateOptions(options: CrawlerOptions): string | null;

  /** Parent sources implied by the options; none by default */
  getParentSourceDefinitions(_options: CrawlerOptions): ParentSourceDefinition[] {
    return [];
  }

  /**
   * Validate options using TypeBox schema.
   * Subclasses can call this for schema validation before adding custom business logic.
   *
   * @returns A message describing the first schema violation, or null when
   *   the options satisfy `optionsSchema`. If validation itself throws, the
   *   failure is logged and a generic message is returned.
   */
  protected validateWithSchema(options: CrawlerOptions): string | null {
    try {
      // Only the first error is reported, so stop after it instead of
      // materialising the full error list.
      const firstError = Value.Errors(this.optionsSchema, options).First();
      if (!firstError) return null;

      // Paths are JSON-pointer style ("/field"); drop the leading slash for display.
      const field = firstError.path.replace(/^\//, '');
      const subject = field ? `Invalid option "${field}"` : 'Invalid option';
      return `${subject}: ${firstError.message}`;
    } catch (error) {
      sdkLogger.error({ error }, '[BaseCrawler] Schema validation error:');
      return 'Invalid crawler options format';
    }
  }

  /**
   * Get rate limit information for this platform.
   * Override this method in platform-specific crawlers.
   * Default is conservative: 10 requests per minute.
   */
  getRateLimit() {
    return {
      requests_per_minute: 10,
      recommended_interval_ms: 6000, // 6 seconds
    };
  }

  /**
   * Helper to check if content is newer than checkpoint.
   * Content counts as new when there is no checkpoint or it has no timestamp.
   */
  protected isNewerThan(contentDate: Date, checkpoint: Checkpoint | null): boolean {
    if (!checkpoint || !checkpoint.last_timestamp) return true;
    // NOTE(review): going through Date keeps the comparison chronological even if
    // the checkpoint was restored from storage with a string timestamp - confirm
    // how checkpoints are persisted. For Date values this is equivalent to `>`.
    return contentDate.getTime() > new Date(checkpoint.last_timestamp).getTime();
  }

  /**
   * Calculate lookback date from options.
   * @param options - Crawler options with optional lookback_days
   * @param defaultDays - Default lookback period (default: 365); also used
   *   when lookback_days is missing or 0
   */
  protected getLookbackDate(options: CrawlerOptions, defaultDays: number = 365): Date {
    const lookbackDays = options.lookback_days || defaultDays;
    return new Date(Date.now() - lookbackDays * 24 * 60 * 60 * 1000);
  }

  /**
   * Sleep for specified milliseconds (for rate limiting)
   */
  protected sleep(ms: number): Promise<void> {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Check if crawler is in incremental mode: a checkpoint timestamp exists
   * and no pagination token is pending.
   */
  protected isIncrementalMode(
    checkpoint: Checkpoint | null,
    paginationToken?: string | null
  ): boolean {
    return !!checkpoint?.last_timestamp && !paginationToken;
  }

  /**
   * Helper to deduplicate content by external_id.
   * Items without an external_id are dropped. `seenIds` is mutated: every
   * id that is kept is added to it.
   */
  protected deduplicate(contents: Content[], seenIds: Set<string>): Content[] {
    return contents.filter((c) => {
      if (!c.external_id) return false;
      if (seenIds.has(c.external_id)) return false;
      seenIds.add(c.external_id);
      return true;
    });
  }

  /**
   * Handle HTTP errors with structured logging and platform-specific messages.
   *
   * @param status - HTTP status code of the failed response
   * @param context - What was being requested (used in the message and the log)
   * @param platformName - Name used in messages; defaults to `displayName`
   * @throws RateLimitError for status 429
   * @throws Error for every other status
   */
  protected handleHTTPError(status: number, context: string, platformName?: string): never {
    const platform = platformName || this.displayName;

    sdkLogger.error(
      {
        status,
        context,
        platform: this.type,
        timestamp: new Date().toISOString(),
      },
      `[${platform}Crawler] HTTP ${status} error:`
    );

    const errorMessages: Record<number, string> = {
      400: `Bad request to ${platform}: ${context}. Check your crawler options.`,
      401: `Authentication failed for ${platform}. Check your API credentials.`,
      403: `Access forbidden to ${platform} resource: ${context}. The resource may be private or require authentication.`,
      404: `Resource not found on ${platform}: ${context}. Verify the resource exists.`,
      422: `Invalid request to ${platform}: ${context}. Check your parameters.`,
      429: `${platform} rate limit exceeded. Please wait before retrying.`,
      500: `${platform} server error (${status}). This is temporary, please retry later.`,
      502: `${platform} bad gateway (${status}). This is temporary, please retry later.`,
      503: `${platform} service unavailable (${status}). This is temporary, please retry later.`,
    };

    const message = errorMessages[status] ?? `${platform} API error: ${status}`;
    if (status === 429) {
      throw new RateLimitError(message);
    }
    throw new Error(message);
  }
}
@@ -0,0 +1,213 @@
1
+ /// <reference lib="dom" />
2
+ /**
3
+ * Browser Launcher Utility
4
+ * Provides Playwright-based browser automation
5
+ */
6
+
7
+ import { mkdir, writeFile } from 'node:fs/promises';
8
+ import { join } from 'node:path';
9
+ import { sdkLogger } from '../logger.js';
10
+ import type { Env } from '../types.js';
11
+ import { launchStealthBrowser } from './stealth.js';
12
+
13
/**
 * Optional switches accepted by launchBrowser.
 * When a field is omitted, the launcher falls back to an environment variable.
 */
export interface BrowserLaunchOptions {
  /** Run headed with slow motion and devtools (fallback: BROWSER_DEBUG === '1') */
  debug?: boolean;
  /** Announce trace recording in the launcher log (fallback: BROWSER_TRACE === '1') */
  trace?: boolean;
  /** Directory for error artifacts (fallback: BROWSER_SCREENSHOT_DIR, then '/tmp/crawler-screenshots') */
  screenshotDir?: string;
  /** Launch through the stealth launcher (fallback: BROWSER_STEALTH === '1') */
  stealth?: boolean;
}
19
+
20
/**
 * Result of launchBrowser: the browser handle plus launch metadata.
 */
export interface EnhancedBrowser {
  /** Browser instance; its `newPage` is wrapped to add compatibility methods to pages */
  browser: any;
  /** Whether the browser is Playwright-backed */
  isPlaywright: boolean;
  /** Directory where error artifacts for this browser should be written */
  screenshotDir: string;
}
25
+
26
/**
 * Give a Playwright page the Puppeteer-style `setUserAgent` method.
 *
 * A page that already has `setUserAgent` is returned untouched. Otherwise a
 * shim is attached that sends the value as a `User-Agent` extra HTTP header.
 * The page is mutated in place and returned.
 */
function addCompatibilityMethods(page: any): any {
  if (page.setUserAgent) {
    return page;
  }

  const applyUserAgentHeader = async (userAgent: string) => {
    const headers = { 'User-Agent': userAgent };
    await page.setExtraHTTPHeaders(headers);
  };
  page.setUserAgent = applyUserAgentHeader;

  return page;
}
40
+
41
/**
 * Replace `browser.newPage` with a wrapper that adds the compatibility
 * methods to every page it creates. Mutates and returns the browser.
 */
function wrapNewPageWithCompatibility(browser: any): any {
  const originalNewPage = browser.newPage.bind(browser);
  browser.newPage = async () => {
    const page = await originalNewPage();
    return addCompatibilityMethods(page);
  };
  return browser;
}

/**
 * Decide whether a launch failure means Playwright or its Chromium build is
 * not installed. A failed dynamic `import()` reports `ERR_MODULE_NOT_FOUND`,
 * while a failed `require()` reports `MODULE_NOT_FOUND`; both are accepted so
 * the install hint is shown regardless of how the module was loaded.
 */
function isPlaywrightMissingError(message: string, code: unknown): boolean {
  return (
    message.includes("Executable doesn't exist") ||
    code === 'MODULE_NOT_FOUND' ||
    code === 'ERR_MODULE_NOT_FOUND'
  );
}

/**
 * Launch browser with Playwright.
 *
 * Each option falls back to an environment variable when omitted:
 * `BROWSER_DEBUG`, `BROWSER_TRACE`, `BROWSER_STEALTH` (enabled when equal to
 * '1') and `BROWSER_SCREENSHOT_DIR` (default '/tmp/crawler-screenshots').
 * In stealth mode the browser comes from `launchStealthBrowser`; otherwise
 * Chromium is launched through a dynamically imported `playwright`.
 *
 * @param _env - Unused by this implementation
 * @param options - Launch switches; see BrowserLaunchOptions
 * @returns The browser (with `newPage` wrapped) and the artifact directory
 * @throws Error with install instructions when Playwright or Chromium is
 *   missing, or a "Playwright launch failed" error for any other failure
 */
export async function launchBrowser(
  _env: Env,
  options: BrowserLaunchOptions = {}
): Promise<EnhancedBrowser> {
  const isDebug = options.debug ?? (process.env.BROWSER_DEBUG === '1');
  const enableTrace = options.trace ?? (process.env.BROWSER_TRACE === '1');
  const screenshotDir =
    options.screenshotDir ?? process.env.BROWSER_SCREENSHOT_DIR ?? '/tmp/crawler-screenshots';

  const useStealth = options.stealth ?? (process.env.BROWSER_STEALTH === '1');

  sdkLogger.info(
    `[BrowserLauncher] Using Playwright (local) - headless: ${!isDebug}, stealth: ${useStealth}`
  );

  try {
    if (useStealth) {
      const stealthBrowser = await launchStealthBrowser({
        headless: !isDebug,
        debug: isDebug,
      });

      const browser = wrapNewPageWithCompatibility(stealthBrowser.browser);

      sdkLogger.info('[BrowserLauncher] Stealth mode enabled - using anti-detection measures');

      return {
        browser,
        isPlaywright: true,
        screenshotDir,
      };
    }

    // The specifier is held in a variable so bundlers do not try to resolve
    // the optional peer dependency at build time.
    const playwrightModule = 'playwright';
    const { chromium } = await import(/* @vite-ignore */ playwrightModule);

    const browser = wrapNewPageWithCompatibility(
      await chromium.launch({
        headless: !isDebug,
        slowMo: isDebug ? 100 : 0,
        args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
        devtools: isDebug,
      })
    );

    if (isDebug) {
      sdkLogger.info('[BrowserLauncher] Debug mode enabled - browser visible, slow motion active');
    }

    if (enableTrace) {
      sdkLogger.info(
        '[BrowserLauncher] Trace recording enabled - artifacts will be saved on error'
      );
    }

    return {
      browser,
      isPlaywright: true,
      screenshotDir,
    };
  } catch (error: unknown) {
    // Anything can be thrown; read message/code defensively instead of
    // dereferencing the caught value directly.
    const details = (typeof error === 'object' && error !== null ? error : {}) as {
      message?: unknown;
      code?: unknown;
    };
    const message = typeof details.message === 'string' ? details.message : String(error);

    if (isPlaywrightMissingError(message, details.code)) {
      throw new Error(
        'Playwright not installed or Chromium browser missing.\n' +
          'Install with: npm install -D playwright && npx playwright install chromium'
      );
    }

    sdkLogger.error({ error }, '[BrowserLauncher] Failed to launch Playwright browser:');
    throw new Error(`Playwright launch failed: ${message}`);
  }
}
126
+
127
/**
 * Best-effort capture of debugging artifacts after a crawler failure.
 *
 * Writes, in order, a full-page screenshot (.png), the page HTML (.html) and,
 * if the page exposes `console.history`, those log lines (.log) into
 * `screenshotDir`, then logs a summary of the failure. Each step is
 * independent: a failing step is logged (or ignored, for console logs) and
 * the next one still runs. This function never throws.
 *
 * @param page - Page to capture from
 * @param error - The failure being reported
 * @param crawlerType - Used as the artifact filename prefix
 * @param screenshotDir - Target directory; created if missing
 */
export async function captureErrorArtifacts(
  page: any,
  error: Error,
  crawlerType: string,
  screenshotDir: string
): Promise<void> {
  try {
    await mkdir(screenshotDir, { recursive: true });

    // ':' and '.' are replaced so the timestamp is safe inside a filename.
    const stamp = new Date().toISOString().replace(/[:.]/g, '-');
    const artifactPath = (extension: string): string =>
      join(screenshotDir, `${crawlerType}-${stamp}.${extension}`);

    const screenshotPath = artifactPath('png');
    const htmlPath = artifactPath('html');
    const logsPath = artifactPath('log');

    // 1. Screenshot
    try {
      await page.screenshot({
        path: screenshotPath,
        fullPage: true,
        timeout: 5000,
      });
      sdkLogger.error({ path: screenshotPath }, '[BrowserLauncher] Screenshot saved');
    } catch (screenshotError) {
      sdkLogger.warn({ error: screenshotError }, '[BrowserLauncher] Failed to capture screenshot');
    }

    // 2. HTML snapshot
    try {
      const markup = await page.content();
      await writeFile(htmlPath, markup, 'utf-8');
      sdkLogger.error({ path: htmlPath }, '[BrowserLauncher] HTML saved');
    } catch (htmlError) {
      sdkLogger.warn({ error: htmlError }, '[BrowserLauncher] Failed to save HTML');
    }

    // 3. Console history, only when the page provides one
    try {
      const consoleLines = await page
        .evaluate(() => {
          return (window.console as unknown as { history?: string[] })?.history || [];
        })
        .catch(() => []);

      if (consoleLines.length > 0) {
        await writeFile(logsPath, consoleLines.join('\n'), 'utf-8');
        sdkLogger.error(`[BrowserLauncher] Console logs saved: ${logsPath}`);
      }
    } catch {
      // Console logs are optional
    }

    // 4. Failure summary, pointing at the intended artifact locations
    const summary = {
      crawler_type: crawlerType,
      error: error.message,
      stack: error.stack,
      artifacts: {
        directory: screenshotDir,
        screenshot: screenshotPath,
        html: htmlPath,
      },
      debug_hint: `To debug: BROWSER_DEBUG=1 pnpm crawl ${crawlerType} [options]`,
    };
    sdkLogger.error(summary, '[BrowserLauncher] Crawler failed');
  } catch (captureError) {
    sdkLogger.error({ error: captureError }, '[BrowserLauncher] Failed to capture error artifacts');
  }
}
197
+
198
/**
 * Run `fn` and, if it fails, capture error artifacts from `page` before
 * propagating the original failure.
 *
 * @param fn - The crawler work to execute
 * @param page - Page handed to captureErrorArtifacts on failure
 * @param crawlerType - Artifact filename prefix
 * @param screenshotDir - Directory the artifacts are written to
 * @returns Whatever `fn` resolves to
 * @throws The value `fn` threw or rejected with, unchanged
 */
export async function withErrorCapture<T>(
  fn: () => Promise<T>,
  page: any,
  crawlerType: string,
  screenshotDir: string
): Promise<T> {
  let outcome: T;
  try {
    outcome = await fn();
  } catch (failure: any) {
    await captureErrorArtifacts(page, failure, crawlerType, screenshotDir);
    throw failure;
  }
  return outcome;
}