npm - owletto - Versions diffs - 1.0.0 - Mend

owletto 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

package/package.json +58 -0
package/src/api-paginated.ts +185 -0
package/src/base.ts +173 -0
package/src/browser/launcher.ts +213 -0
package/src/browser/stealth.ts +297 -0
package/src/browser-paginated.ts +425 -0
package/src/cli.ts +438 -0
package/src/connector-runtime.ts +63 -0
package/src/connector-types.ts +299 -0
package/src/http.ts +82 -0
package/src/index.ts +106 -0
package/src/logger.ts +10 -0
package/src/paginated.ts +301 -0
package/src/retry.ts +168 -0
package/src/scoring.ts +57 -0
package/src/types.ts +289 -0

package/package.json ADDED Viewed

@@ -0,0 +1,58 @@
+{
+  "name": "owletto",
+  "version": "1.0.0",
+  "description": "Owletto SDK - build pluggable connectors for the Owletto integration platform",
+  "type": "module",
+  "main": "dist/index.js",
+  "types": "dist/index.d.ts",
+  "exports": {
+    ".": {
+      "import": "./src/index.ts",
+      "types": "./src/index.ts"
+    }
+  },
+  "bin": {
+    "owletto-sdk": "./src/cli.ts"
+  },
+  "files": [
+    "dist",
+    "src"
+  ],
+  "scripts": {
+    "build": "tsc",
+    "typecheck": "tsc --noEmit",
+    "clean": "rm -rf dist"
+  },
+  "dependencies": {
+    "@sinclair/typebox": "^0.34.33",
+    "esbuild": "^0.27.0",
+    "ky": "^1.14.0",
+    "p-retry": "^7.1.0",
+    "pino": "^10.1.0"
+  },
+  "peerDependencies": {
+    "playwright": "^1.40.0"
+  },
+  "peerDependenciesMeta": {
+    "playwright": {
+      "optional": true
+    }
+  },
+  "devDependencies": {
+    "@types/node": "^20.10.0",
+    "typescript": "^5.3.3"
+  },
+  "engines": {
+    "node": ">=18"
+  },
+  "license": "MIT",
+  "publishConfig": {
+    "access": "public"
+  },
+  "homepage": "https://github.com/lobu-ai/owletto",
+  "repository": {
+    "type": "git",
+    "url": "https://github.com/lobu-ai/owletto.git",
+    "directory": "packages/sdk"
+  }
+}

package/src/api-paginated.ts ADDED Viewed

@@ -0,0 +1,185 @@
+/**
+ * API Paginated Crawler Base Class
+ *
+ * Extends PaginatedCrawler for HTTP/REST API-based crawlers.
+ * Provides HTTP client setup and common API patterns.
+ */
+import type { KyInstance } from 'ky';
+import { HTTPError } from 'ky';
+import { RateLimitError } from './base.js';
+import { createAuthenticatedClient, createHttpClient, httpClient } from './http.js';
+import { sdkLogger } from './logger.js';
+import type { PageFetchResult, PaginatedCheckpoint } from './paginated.js';
+import { PaginatedCrawler } from './paginated.js';
+import { withHttpRetry } from './retry.js';
+import type { CrawlerOptions, Env, SessionState } from './types.js';
+/**
+ * API session state for OAuth/token-based crawlers.
+ *
+ * Extends the generic SessionState with optional credential fields. Within this
+ * file, ApiPaginatedCrawler.createClientFromSessionState() reads `access_token`,
+ * `token_type`, `api_key` and `headers`; `refresh_token` and `expires_at` are
+ * declared here but are not read anywhere in this file.
+ */
+export interface ApiSessionState extends SessionState {
+  /** OAuth/API access token; checked before `api_key` when building a client */
+  access_token?: string;
+  /** OAuth refresh token (for token refresh flows) */
+  refresh_token?: string;
+  /** Token type (e.g., 'Bearer'); client creation falls back to 'Bearer' when unset */
+  token_type?: string;
+  /** Token expiration time (ISO string) */
+  expires_at?: string;
+  /** Additional headers to include in requests; call-site headers win on key conflicts */
+  headers?: Record<string, string>;
+  /** API key (alternative to OAuth tokens); passed to the client as-is, without a scheme prefix */
+  api_key?: string;
+}
+/**
+ * Abstract foundation for crawlers that page through an HTTP/REST API.
+ *
+ * Subclasses supply `buildPageUrl` (where to fetch) and `parseResponse`
+ * (how to turn the payload into items); this class wires those into a
+ * retried GET request and translates HTTP failures into crawler errors.
+ */
+export abstract class ApiPaginatedCrawler<
+  TItem,
+  TResponse = unknown,
+  TCheckpoint extends PaginatedCheckpoint = PaginatedCheckpoint,
+> extends PaginatedCrawler<TItem, TCheckpoint> {
+  readonly apiType = 'api' as const;
+  /**
+   * Credentials/headers attached to the current crawl session, or null when none were given.
+   */
+  protected _sessionState: ApiSessionState | null = null;
+  /**
+   * Store the session state for this crawl; any falsy value clears it.
+   * Emits a debug log (flags only, never the secrets themselves) when state is present.
+   */
+  protected setSessionState(sessionState: SessionState | null | undefined): void {
+    const state = sessionState ? (sessionState as ApiSessionState) : null;
+    this._sessionState = state;
+    if (!state) {
+      return;
+    }
+    sdkLogger.debug(
+      { hasToken: !!state.access_token, hasApiKey: !!state.api_key },
+      `[${this.type}] Session state set`
+    );
+  }
+  /**
+   * Read back the session state stored by `setSessionState`.
+   */
+  protected getSessionState(): ApiSessionState | null {
+    return this._sessionState;
+  }
+  /**
+   * ABSTRACT: Produce the URL for the page identified by `cursor` (null for the first page).
+   */
+  protected abstract buildPageUrl(cursor: string | null, options: CrawlerOptions): string;
+  /**
+   * ABSTRACT: Convert a decoded API response into a page result (items plus next token).
+   */
+  protected abstract parseResponse(
+    response: TResponse,
+    options: CrawlerOptions
+  ): PageFetchResult<TItem>;
+  /**
+   * HTTP client used by `fetchPage`. The default is the shared, unauthenticated
+   * client; override to return an authenticated one.
+   */
+  protected getHttpClient(_env: Env): KyInstance {
+    return httpClient;
+  }
+  /**
+   * Build a client that sends `Bearer <token>` as its credential.
+   */
+  protected createBearerClient(
+    token: string,
+    additionalHeaders?: Record<string, string>
+  ): KyInstance {
+    const credential = `Bearer ${token}`;
+    return createAuthenticatedClient(credential, additionalHeaders);
+  }
+  /**
+   * Build a client that sends the given headers on every request.
+   */
+  protected createClientWithHeaders(headers: Record<string, string>): KyInstance {
+    return createHttpClient({ headers });
+  }
+  /**
+   * Build a client from the stored session state.
+   *
+   * Precedence: access token (prefixed with its token type, default 'Bearer'),
+   * then API key (sent as-is), then headers only, then the shared default client.
+   * `additionalHeaders` override session headers on key conflicts.
+   */
+  protected createClientFromSessionState(additionalHeaders?: Record<string, string>): KyInstance {
+    const state = this._sessionState;
+    if (!state) {
+      if (additionalHeaders) {
+        return createHttpClient({ headers: additionalHeaders });
+      }
+      return httpClient;
+    }
+    const mergedHeaders: Record<string, string> = {
+      ...state.headers,
+      ...additionalHeaders,
+    };
+    if (state.access_token) {
+      const scheme = state.token_type || 'Bearer';
+      return createAuthenticatedClient(`${scheme} ${state.access_token}`, mergedHeaders);
+    }
+    if (state.api_key) {
+      return createAuthenticatedClient(state.api_key, mergedHeaders);
+    }
+    if (Object.keys(mergedHeaders).length > 0) {
+      return createHttpClient({ headers: mergedHeaders });
+    }
+    return httpClient;
+  }
+  /**
+   * Translate a ky HTTPError into a crawler error via the base-class status handler.
+   */
+  protected handleHttpError(error: HTTPError, url: string): never {
+    this.handleHTTPError(error.response.status, url);
+  }
+  /**
+   * Default page fetch: GET the page URL with retry, decode JSON, parse it.
+   *
+   * A 429 response becomes a RateLimitError carrying the Retry-After delay
+   * (when the header is present and resolves to a positive duration). Other HTTP
+   * errors go through `handleHttpError`; anything else is logged and rethrown.
+   */
+  protected async fetchPage(
+    cursor: string | null,
+    options: CrawlerOptions,
+    env: Env
+  ): Promise<PageFetchResult<TItem>> {
+    const client = this.getHttpClient(env);
+    const url = this.buildPageUrl(cursor, options);
+    // Retry-After is either delta-seconds or an HTTP date; yields milliseconds or undefined.
+    const parseRetryAfter = (raw: string): number | undefined => {
+      const seconds = Number(raw);
+      if (!Number.isNaN(seconds)) {
+        return seconds * 1000;
+      }
+      const resumeAt = Date.parse(raw);
+      return Number.isNaN(resumeAt) ? undefined : resumeAt - Date.now();
+    };
+    try {
+      const payload = await withHttpRetry(async () => client.get(url).json<TResponse>(), {
+        operation: `${this.type} API fetch`,
+        context: { url, cursor },
+      });
+      return this.parseResponse(payload, options);
+    } catch (error) {
+      if (error instanceof HTTPError) {
+        if (error.response.status === 429) {
+          const headerValue = error.response.headers.get('retry-after');
+          const delayMs = headerValue ? parseRetryAfter(headerValue) : undefined;
+          // Zero or negative delays (e.g. a date already in the past) are reported as "unknown".
+          const usableDelay = delayMs && delayMs > 0 ? delayMs : undefined;
+          throw new RateLimitError(
+            `${this.displayName} rate limit exceeded. Please wait before retrying.`,
+            usableDelay
+          );
+        }
+        this.handleHttpError(error, url);
+      }
+      sdkLogger.error({ error, url }, `[${this.type}] API fetch failed`);
+      throw error;
+    }
+  }
+}

package/src/base.ts ADDED Viewed

@@ -0,0 +1,173 @@
+import type { TObject } from '@sinclair/typebox';
+import { Value } from '@sinclair/typebox/value';
+import { sdkLogger } from './logger.js';
+import type {
+  Checkpoint,
+  Content,
+  CrawlerAuthSchema,
+  CrawlerOptions,
+  CrawlResult,
+  Env,
+  ICrawler,
+  ParentSourceDefinition,
+  ScoringConfig,
+  SessionState,
+} from './types.js';
+/**
+ * Error signalling that a platform rejected a request because of rate limiting.
+ * Carries an optional hint for how long the caller should wait before retrying.
+ */
+export class RateLimitError extends Error {
+  /** Suggested wait before retrying, in milliseconds; undefined when unknown. */
+  readonly retryAfterMs?: number;
+  /**
+   * @param message - Human-readable description of the rate-limit failure
+   * @param retryAfterMs - Optional wait hint in milliseconds
+   */
+  constructor(message: string, retryAfterMs?: number) {
+    super(message);
+    // Set explicitly so the name survives minification and shows up in logs.
+    this.name = 'RateLimitError';
+    this.retryAfterMs = retryAfterMs;
+  }
+}
+/**
+ * Base crawler implementation with common functionality.
+ * All platform-specific crawlers should extend this class.
+ *
+ * Subclasses declare their identity/schema via the abstract members and
+ * implement `pull`; the protected helpers below cover option validation,
+ * checkpoint comparisons, de-duplication and HTTP error translation.
+ */
+export abstract class BaseCrawler implements ICrawler {
+  abstract readonly type: string;
+  abstract readonly displayName: string;
+  abstract readonly apiType: 'api' | 'browser';
+  abstract readonly crawlerType: 'entity' | 'search';
+  abstract readonly optionsSchema: TObject;
+  abstract readonly defaultScoringConfig: ScoringConfig;
+  abstract readonly defaultScoringFormula: string;
+  /** Authentication methods supported by the crawler; defaults to "no auth required". */
+  readonly authSchema: CrawlerAuthSchema = { methods: [{ type: 'none' }] };
+  abstract pull(
+    options: CrawlerOptions,
+    checkpoint: Checkpoint | null,
+    env: Env,
+    sessionState?: SessionState | null,
+    updateCheckpointFn?: (checkpoint: Checkpoint) => Promise<void>
+  ): Promise<CrawlResult>;
+  abstract urlFromOptions(options: CrawlerOptions): string;
+  abstract displayLabelFromOptions(options: CrawlerOptions): string;
+  abstract validateOptions(options: CrawlerOptions): string | null;
+  /**
+   * Parent sources implied by the given options. The default is none.
+   */
+  getParentSourceDefinitions(_options: CrawlerOptions): ParentSourceDefinition[] {
+    return [];
+  }
+  /**
+   * Validate options using the TypeBox schema.
+   * Subclasses can call this for schema validation before adding custom business logic.
+   *
+   * @returns null when the options are valid, otherwise a message describing the
+   *   first schema error (or a generic message if validation itself throws)
+   */
+  protected validateWithSchema(options: CrawlerOptions): string | null {
+    try {
+      const errors = [...Value.Errors(this.optionsSchema, options)];
+      if (errors.length > 0) {
+        // Format first error for user-friendly message
+        const firstError = errors[0];
+        const field = firstError.path.replace(/^\//, '');
+        // A root-level error has an empty path; avoid emitting "Invalid option : ...".
+        return field
+          ? `Invalid option "${field}": ${firstError.message}`
+          : `Invalid options: ${firstError.message}`;
+      }
+      return null;
+    } catch (error) {
+      sdkLogger.error({ error }, '[BaseCrawler] Schema validation error:');
+      return 'Invalid crawler options format';
+    }
+  }
+  /**
+   * Get rate limit information for this platform.
+   * Override this method in platform-specific crawlers.
+   * Default is conservative: 10 requests per minute.
+   */
+  getRateLimit() {
+    return {
+      requests_per_minute: 10,
+      recommended_interval_ms: 6000, // 6 seconds
+    };
+  }
+  /**
+   * Helper to check if content is newer than the checkpoint.
+   * Returns true when there is no checkpoint or it has no `last_timestamp`.
+   */
+  protected isNewerThan(contentDate: Date, checkpoint: Checkpoint | null): boolean {
+    if (!checkpoint || !checkpoint.last_timestamp) return true;
+    // Compare epoch milliseconds rather than relying on `Date > value` coercion.
+    // NOTE(review): this also copes with a `last_timestamp` that arrives as an ISO
+    // string or number after deserialization — confirm against the Checkpoint type.
+    const lastSeenMs = new Date(checkpoint.last_timestamp).getTime();
+    return contentDate.getTime() > lastSeenMs;
+  }
+  /**
+   * Calculate lookback date from options.
+   * @param options - Crawler options with optional lookback_days
+   * @param defaultDays - Default lookback period (default: 365)
+   */
+  protected getLookbackDate(options: CrawlerOptions, defaultDays: number = 365): Date {
+    // `||` is deliberate here: a missing or zero lookback falls back to the default.
+    const lookbackDays = options.lookback_days || defaultDays;
+    return new Date(Date.now() - lookbackDays * 24 * 60 * 60 * 1000);
+  }
+  /**
+   * Sleep for specified milliseconds (for rate limiting).
+   */
+  protected sleep(ms: number): Promise<void> {
+    return new Promise((resolve) => setTimeout(resolve, ms));
+  }
+  /**
+   * Check if crawler is in incremental mode: a checkpoint timestamp exists
+   * and no pagination token is in flight.
+   */
+  protected isIncrementalMode(
+    checkpoint: Checkpoint | null,
+    paginationToken?: string | null
+  ): boolean {
+    return !!checkpoint?.last_timestamp && !paginationToken;
+  }
+  /**
+   * Helper to deduplicate content by external_id.
+   * Items without an external_id are dropped; `seenIds` is mutated so the
+   * same set can be reused across pages.
+   */
+  protected deduplicate(contents: Content[], seenIds: Set<string>): Content[] {
+    return contents.filter((c) => {
+      if (!c.external_id) return false;
+      if (seenIds.has(c.external_id)) return false;
+      seenIds.add(c.external_id);
+      return true;
+    });
+  }
+  /**
+   * Handle HTTP errors with structured logging and platform-specific messages.
+   *
+   * @param status - HTTP status code received
+   * @param context - What was being requested (used in the message and log)
+   * @param platformName - Display name override; defaults to `displayName`
+   * @throws RateLimitError for status 429, Error for every other status
+   */
+  protected handleHTTPError(status: number, context: string, platformName?: string): never {
+    const platform = platformName || this.displayName;
+    sdkLogger.error(
+      {
+        status,
+        context,
+        platform: this.type,
+        timestamp: new Date().toISOString(),
+      },
+      `[${platform}Crawler] HTTP ${status} error:`
+    );
+    const errorMessages: Record<number, string> = {
+      400: `Bad request to ${platform}: ${context}. Check your crawler options.`,
+      401: `Authentication failed for ${platform}. Check your API credentials.`,
+      403: `Access forbidden to ${platform} resource: ${context}. The resource may be private or require authentication.`,
+      404: `Resource not found on ${platform}: ${context}. Verify the resource exists.`,
+      422: `Invalid request to ${platform}: ${context}. Check your parameters.`,
+      429: `${platform} rate limit exceeded. Please wait before retrying.`,
+      500: `${platform} server error (${status}). This is temporary, please retry later.`,
+      502: `${platform} bad gateway (${status}). This is temporary, please retry later.`,
+      503: `${platform} service unavailable (${status}). This is temporary, please retry later.`,
+    };
+    const message = errorMessages[status] || `${platform} API error: ${status}`;
+    if (status === 429) {
+      throw new RateLimitError(message);
+    }
+    throw new Error(message);
+  }
+}

package/src/browser/launcher.ts ADDED Viewed

@@ -0,0 +1,213 @@
+/// <reference lib="dom" />
+/**
+ * Browser Launcher Utility
+ * Provides Playwright-based browser automation
+ */
+import { mkdir, writeFile } from 'node:fs/promises';
+import { join } from 'node:path';
+import { sdkLogger } from '../logger.js';
+import type { Env } from '../types.js';
+import { launchStealthBrowser } from './stealth.js';
+/**
+ * Optional overrides for launchBrowser(). Each field, when omitted, falls back
+ * to an environment variable read inside launchBrowser().
+ */
+export interface BrowserLaunchOptions {
+  /** Run headed with slow motion and devtools; falls back to BROWSER_DEBUG === '1' */
+  debug?: boolean;
+  /** Trace flag; falls back to BROWSER_TRACE === '1' (in this file it only triggers a log message) */
+  trace?: boolean;
+  /** Directory for error artifacts; falls back to BROWSER_SCREENSHOT_DIR, then '/tmp/crawler-screenshots' */
+  screenshotDir?: string;
+  /** Launch through launchStealthBrowser(); falls back to BROWSER_STEALTH === '1' */
+  stealth?: boolean;
+}
+/**
+ * Result of launchBrowser(): the launched browser plus launch metadata.
+ */
+export interface EnhancedBrowser {
+  /** Browser instance (untyped); launchBrowser() patches its newPage() to add compatibility methods */
+  browser: any;
+  /** Always set to true by launchBrowser() in this file */
+  isPlaywright: boolean;
+  /** Resolved directory where error artifacts should be written */
+  screenshotDir: string;
+}
+/**
+ * Give a Playwright Page the Puppeteer-style `setUserAgent` method when it lacks one.
+ * The shim sends the value as a `User-Agent` extra HTTP header. The page is
+ * modified in place and returned for convenience.
+ */
+function addCompatibilityMethods(page: any): any {
+  // Respect an existing implementation (native or previously shimmed).
+  if (page.setUserAgent) {
+    return page;
+  }
+  page.setUserAgent = async (userAgent: string) => {
+    const headers = { 'User-Agent': userAgent };
+    await page.setExtraHTTPHeaders(headers);
+  };
+  return page;
+}
+/**
+ * Launch browser with Playwright.
+ *
+ * Settings come from `options`, falling back to the BROWSER_DEBUG, BROWSER_TRACE,
+ * BROWSER_SCREENSHOT_DIR and BROWSER_STEALTH environment variables. In stealth
+ * mode the browser comes from launchStealthBrowser(); otherwise Playwright's
+ * Chromium is imported lazily and launched directly. In both cases
+ * `browser.newPage` is patched so pages gain the Puppeteer-compatible shims.
+ *
+ * @param _env - Unused; kept for interface compatibility
+ * @param options - Optional launch overrides
+ * @returns The browser together with the resolved screenshot directory
+ * @throws Error with install instructions when Playwright or Chromium is missing,
+ *   or a generic "Playwright launch failed" error for any other failure
+ */
+export async function launchBrowser(
+  _env: Env,
+  options: BrowserLaunchOptions = {}
+): Promise<EnhancedBrowser> {
+  const isDebug = options.debug ?? process.env.BROWSER_DEBUG === '1';
+  const enableTrace = options.trace ?? process.env.BROWSER_TRACE === '1';
+  const screenshotDir =
+    options.screenshotDir ?? process.env.BROWSER_SCREENSHOT_DIR ?? '/tmp/crawler-screenshots';
+  const useStealth = options.stealth ?? process.env.BROWSER_STEALTH === '1';
+  // Patch newPage once per browser; arguments are forwarded so callers can still
+  // pass Playwright page/context options.
+  const patchNewPage = (target: any): void => {
+    const originalNewPage = target.newPage.bind(target);
+    target.newPage = async (...args: unknown[]) => {
+      const page = await originalNewPage(...args);
+      return addCompatibilityMethods(page);
+    };
+  };
+  sdkLogger.info(
+    `[BrowserLauncher] Using Playwright (local) - headless: ${!isDebug}, stealth: ${useStealth}`
+  );
+  try {
+    if (useStealth) {
+      const stealthBrowser = await launchStealthBrowser({
+        headless: !isDebug,
+        debug: isDebug,
+      });
+      const browser = stealthBrowser.browser;
+      patchNewPage(browser);
+      sdkLogger.info('[BrowserLauncher] Stealth mode enabled - using anti-detection measures');
+      return {
+        browser,
+        isPlaywright: true,
+        screenshotDir,
+      };
+    }
+    // Indirect specifier keeps bundlers from resolving the optional peer dependency eagerly.
+    const playwrightModule = 'playwright';
+    const { chromium } = await import(/* @vite-ignore */ playwrightModule);
+    const browser = await chromium.launch({
+      headless: !isDebug,
+      slowMo: isDebug ? 100 : 0,
+      args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage'],
+      devtools: isDebug,
+    });
+    patchNewPage(browser);
+    if (isDebug) {
+      sdkLogger.info('[BrowserLauncher] Debug mode enabled - browser visible, slow motion active');
+    }
+    if (enableTrace) {
+      sdkLogger.info(
+        '[BrowserLauncher] Trace recording enabled - artifacts will be saved on error'
+      );
+    }
+    return {
+      browser,
+      isPlaywright: true,
+      screenshotDir,
+    };
+  } catch (error: unknown) {
+    const message = error instanceof Error ? error.message : String(error);
+    const code =
+      typeof error === 'object' && error !== null ? (error as { code?: unknown }).code : undefined;
+    // A failed dynamic import() reports ERR_MODULE_NOT_FOUND under ESM (this package is
+    // "type": "module"); MODULE_NOT_FOUND is the CommonJS require() equivalent.
+    const moduleMissing = code === 'MODULE_NOT_FOUND' || code === 'ERR_MODULE_NOT_FOUND';
+    if (message.includes("Executable doesn't exist") || moduleMissing) {
+      throw new Error(
+        'Playwright not installed or Chromium browser missing.\n' +
+          'Install with: npm install -D playwright && npx playwright install chromium'
+      );
+    }
+    sdkLogger.error({ error }, '[BrowserLauncher] Failed to launch Playwright browser:');
+    throw new Error(`Playwright launch failed: ${message}`);
+  }
+}
+/**
+ * Capture error artifacts (screenshot, HTML, console logs) when a crawler fails.
+ *
+ * Best-effort: every capture step is individually guarded, and this function
+ * never throws. The final summary log lists only the artifacts that were
+ * actually written (null marks a capture that failed).
+ *
+ * @param page - Page to capture from (untyped; must offer screenshot/content/evaluate)
+ * @param error - The failure that triggered the capture
+ * @param crawlerType - Crawler identifier, used as the artifact filename prefix
+ * @param screenshotDir - Directory to write artifacts into (created if missing)
+ */
+export async function captureErrorArtifacts(
+  page: any,
+  error: Error,
+  crawlerType: string,
+  screenshotDir: string
+): Promise<void> {
+  try {
+    await mkdir(screenshotDir, { recursive: true });
+    // ':' and '.' are replaced so the timestamp is safe to use in a filename.
+    const timestamp = new Date().toISOString().replace(/[:.]/g, '-');
+    const baseFilename = `${crawlerType}-${timestamp}`;
+    // Track what was really written so the summary never points at missing files.
+    let savedScreenshot: string | null = null;
+    let savedHtml: string | null = null;
+    const screenshotPath = join(screenshotDir, `${baseFilename}.png`);
+    try {
+      await page.screenshot({
+        path: screenshotPath,
+        fullPage: true,
+        timeout: 5000,
+      });
+      savedScreenshot = screenshotPath;
+      sdkLogger.error({ path: screenshotPath }, '[BrowserLauncher] Screenshot saved');
+    } catch (screenshotError) {
+      sdkLogger.warn({ error: screenshotError }, '[BrowserLauncher] Failed to capture screenshot');
+    }
+    const htmlPath = join(screenshotDir, `${baseFilename}.html`);
+    try {
+      const html = await page.content();
+      await writeFile(htmlPath, html, 'utf-8');
+      savedHtml = htmlPath;
+      sdkLogger.error({ path: htmlPath }, '[BrowserLauncher] HTML saved');
+    } catch (htmlError) {
+      sdkLogger.warn({ error: htmlError }, '[BrowserLauncher] Failed to save HTML');
+    }
+    const logsPath = join(screenshotDir, `${baseFilename}.log`);
+    try {
+      // Reads a non-standard `console.history` array if the page happens to define one.
+      const logs = await page
+        .evaluate(() => {
+          return (window.console as unknown as { history?: string[] })?.history || [];
+        })
+        .catch(() => []);
+      if (Array.isArray(logs) && logs.length > 0) {
+        await writeFile(logsPath, logs.join('\n'), 'utf-8');
+        sdkLogger.error(`[BrowserLauncher] Console logs saved: ${logsPath}`);
+      }
+    } catch (_logError) {
+      // Console logs are optional
+    }
+    sdkLogger.error(
+      {
+        crawler_type: crawlerType,
+        error: error.message,
+        stack: error.stack,
+        artifacts: {
+          directory: screenshotDir,
+          screenshot: savedScreenshot,
+          html: savedHtml,
+        },
+        debug_hint: `To debug: BROWSER_DEBUG=1 pnpm crawl ${crawlerType} [options]`,
+      },
+      '[BrowserLauncher] Crawler failed'
+    );
+  } catch (captureError) {
+    sdkLogger.error({ error: captureError }, '[BrowserLauncher] Failed to capture error artifacts');
+  }
+}
+/**
+ * Run `fn`; if it fails, capture error artifacts from `page` and then rethrow
+ * the original failure unchanged.
+ *
+ * @param fn - Crawler work to execute
+ * @param page - Page to capture artifacts from on failure
+ * @param crawlerType - Crawler identifier used to name the artifacts
+ * @param screenshotDir - Directory the artifacts are written to
+ * @returns Whatever `fn` resolves to
+ */
+export async function withErrorCapture<T>(
+  fn: () => Promise<T>,
+  page: any,
+  crawlerType: string,
+  screenshotDir: string
+): Promise<T> {
+  let outcome: T;
+  try {
+    outcome = await fn();
+  } catch (failure: any) {
+    // Capture first, then surface the original error to the caller.
+    await captureErrorArtifacts(page, failure, crawlerType, screenshotDir);
+    throw failure;
+  }
+  return outcome;
+}