@dealcrawl/sdk 2.9.0 → 2.11.0

package/README.md CHANGED
@@ -6,14 +6,25 @@ Official TypeScript SDK for the DealCrawl web scraping and crawling API.
  [![TypeScript](https://img.shields.io/badge/TypeScript-5.0+-blue.svg)](https://www.typescriptlang.org/)
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)

- ## What's New in January 2026 🎉
+ ## What's New in v2.11.0 (January 2026) 🎉

- - **📸 Screenshot Storage (Phase 4)** - Automatic screenshot capture and storage via Supabase with public URLs
- - **🎯 Priority Crawl System (Phase 5)** - 3-tier queue system (high/medium/low) based on SmartFrontier deal scores for optimized resource allocation
- - **🤖 AI Deal Extraction** - LLM-powered deal extraction with customizable score thresholds and automatic database storage
- - **💾 Enhanced Data Persistence** - New `crawled_pages` and `crawled_deals` tables for comprehensive deal tracking
+ ### Breaking Changes ⚠️
+
+ - **SearchOptions**: `maxResults` → `limit`, `autoScrape` → `scrapeResults`, `autoScrapeLimit` → `maxScrapeResults`
+ - **BatchScrapeOptions**: `delay` → `delayMs`
+ - **ExtractModel**: Updated to match API (`claude-3-5-haiku-20241022`, `claude-3-5-sonnet-20241022`, etc.)
+ - **ApiKeyScope**: Removed `scrape:batch` and `search` (use `scrape` scope for both)
+
+ ### New Features
+
+ - **📸 Screenshot Storage (SEC-011)** - Private by default with configurable signed URL TTL
+ - **🎯 Priority Crawl System** - 3-tier queue system (high/medium/low) based on SmartFrontier deal scores
+ - **🤖 AI Deal Extraction** - LLM-powered extraction with customizable score thresholds
  - **📝 Markdown Output** - Convert scraped content to clean Markdown with GFM support
- - **🎬 Browser Actions** - Execute preset actions (click, scroll, write, etc.) before scraping for dynamic content
+ - **🎬 Browser Actions** - Execute preset actions (click, scroll, write, etc.) before scraping
+ - **🔴 Real-Time SSE Events** - Track jobs in real-time with Server-Sent Events (browser only)
+ - **🛡️ Batch Scrape** - Added `ignoreInvalidURLs` for Firecrawl-compatible error handling
+ - **🔄 HTML to Markdown** - New `client.convert.htmlToMarkdown()` utility

  ## Features

@@ -63,14 +74,138 @@ console.log(result.data.parsed.markdown); // Markdown content
  console.log(result.data.screenshot); // Public screenshot URL
  ```

+ ## Real-Time Events (SSE) - Browser Only 🔴
+
+ Track jobs in real-time using Server-Sent Events (SSE). **Browser only** - for Node.js, use polling via `client.waitForResult()`.
+
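+ In Node.js, the equivalent pattern is a blocking poll with the documented `waitForResult` helper - a minimal sketch:
+
+ ```typescript
+ // Node.js alternative: no SSE - create the job, then poll until it settles.
+ const job = await client.scrape.create({ url: "https://example.com" });
+ const result = await client.waitForResult(job.jobId); // resolves on completion/failure
+ ```
+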
+ ```typescript
+ // 1. Generate SSE token (required for EventSource)
+ const { token, expiresAt } = await client.auth.generateSSEToken();
+ console.log(`Token expires at: ${expiresAt}`); // 5 minutes
+
+ // 2. Subscribe to all events
+ const eventSource = client.events.subscribe(token, {
+   onEvent: (event) => {
+     console.log('Event:', event.type);
+     const data = JSON.parse(event.data);
+     console.log('Data:', data);
+   },
+   onError: (error) => {
+     console.error('SSE error:', error);
+   }
+ });
+
+ // 3. Listen for specific event types
+ eventSource.addEventListener('job.completed', (event) => {
+   const data = JSON.parse(event.data);
+   console.log('Job completed!', data.summary);
+   eventSource.close(); // Clean up
+ });
+
+ eventSource.addEventListener('job.progress', (event) => {
+   const data = JSON.parse(event.data);
+   console.log(`Progress: ${data.progress}%`);
+ });
+
+ eventSource.addEventListener('deal.found', (event) => {
+   const data = JSON.parse(event.data);
+   console.log('Deal found!', data.title, data.score);
+ });
+
+ // 4. Subscribe to a specific job only
+ const job = await client.scrape.create({ url: "https://example.com" });
+ const jobToken = await client.auth.generateSSEToken({ jobId: job.jobId });
+
+ const jobEvents = client.events.subscribeToJob(job.jobId, jobToken.token, {
+   onEvent: (event) => {
+     const data = JSON.parse(event.data);
+     console.log(`[${event.type}]`, data);
+   }
+ });
+
+ // 5. Check connection limits before subscribing
+ const limits = await client.auth.getLimits();
+ console.log(`Available SSE connections: ${limits.sse.available}/${limits.sse.maxConnections}`);
+ // Free: 2 concurrent, Pro: 10 concurrent, Enterprise: 50 concurrent
+
+ // 6. Helper: Wait for completion via SSE
+ const result = await client.events.waitForCompletion(job.jobId, (progress) => {
+   console.log(`Progress: ${progress}%`);
+ });
+ ```
+
+ **Available Event Types:**
+
+ | Event Type | Description |
+ | ---------- | ----------- |
+ | `job.created` | Job was created |
+ | `job.queued` | Job entered queue |
+ | `job.started` | Worker picked up job |
+ | `job.progress` | Progress update (includes `progress`, `stats`, `eta`) |
+ | `job.status` | Status changed |
+ | `job.completed` | Job finished successfully |
+ | `job.failed` | Job failed (includes error details) |
+ | `job.cancelled` | Job was cancelled |
+ | `job.log` | Important log message |
+ | `job.metric` | Performance/business metric |
+ | `job.alert` | Important alert (quota warning, etc.) |
+ | `job.checkpoint` | Checkpoint saved (for resumable jobs) |
+ | `deal.found` | Deal detected during crawl |
+ | `deal.validated` | Deal scored/validated |
+ | `ping` | Keepalive (every 15 seconds) |
+ | `connection.open` | SSE connection established |
+ | `connection.close` | SSE connection closing |
+ | `error` | Error occurred |
+
+ **TypeScript Support:**
+
+ ```typescript
+ import type {
+   SSEEvent,
+   JobProgressEvent,
+   JobCompletedEvent,
+   DealFoundEvent
+ } from "@dealcrawl/sdk";
+
+ // Type-safe event handling
+ eventSource.addEventListener('job.progress', (event: MessageEvent) => {
+   const data = JSON.parse(event.data) as JobProgressEvent['data'];
+   console.log(`Progress: ${data.progress}%`);
+   console.log(`ETA: ${data.eta?.remainingFormatted}`);
+   console.log(`Deals found: ${data.stats?.dealsFound}`);
+ });
+
+ eventSource.addEventListener('job.completed', (event: MessageEvent) => {
+   const data = JSON.parse(event.data) as JobCompletedEvent['data'];
+   console.log('Completed in:', data.durationMs, 'ms');
+   console.log('Summary:', data.summary);
+ });
+ ```
+
+ **Features:**
+
+ - ✅ Automatic reconnection on disconnect
+ - ✅ Event replay via `Last-Event-ID` (up to 50 missed events)
+ - ✅ Keepalive pings every 15 seconds
+ - ✅ Max connection time: 1 hour (auto-reconnect after)
+ - ✅ Multi-tenant isolation (only see your events)
+ - ✅ Token-based auth (works with EventSource)
+
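+ Because the server replays up to 50 missed events after a reconnect, a handler may see the same event twice. A minimal dedupe sketch using the standard `MessageEvent.lastEventId` property (illustrative, not part of the SDK):
+
+ ```typescript
+ // Illustrative: skip events already handled before a reconnect replay.
+ const seen = new Set<string>();
+ eventSource.addEventListener('job.progress', (event: MessageEvent) => {
+   const id = event.lastEventId;
+   if (id && seen.has(id)) return; // replayed duplicate - already handled
+   if (id) seen.add(id);
+   console.log(`Progress: ${JSON.parse(event.data).progress}%`);
+ });
+ ```
+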
+ **Security:**
+
+ - Tokens expire after 5 minutes
+ - Tokens can be restricted to specific jobs
+ - Tokens stored in Redis (revocable)
+ - Connection limits per tier (Free: 2, Pro: 10, Enterprise: 50)
+
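+ Note that the browser's automatic EventSource reconnect reuses the original token, which stops working once the 5-minute lifetime passes. One way to stay subscribed indefinitely is to rotate the token proactively - a minimal sketch (the `keepSubscribed` helper is illustrative, not an SDK method):
+
+ ```typescript
+ // Illustrative: re-subscribe with a fresh token shortly before the old one expires.
+ async function keepSubscribed(handlers: { onEvent: (e: MessageEvent) => void }) {
+   const { token, expiresAt } = await client.auth.generateSSEToken();
+   const source = client.events.subscribe(token, handlers);
+   const msLeft = new Date(expiresAt).getTime() - Date.now() - 30_000; // 30s safety margin
+   setTimeout(() => {
+     source.close();           // drop the connection with the expiring token...
+     keepSubscribed(handlers); // ...and reconnect with a new one
+   }, Math.max(msLeft, 0));
+ }
+ ```
+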
  ## January 2026 Features in Detail

- ### 📸 Screenshot Storage
+ ### 📸 Screenshot Storage (SEC-011)

- Automatically capture and store screenshots with public URLs:
+ **Private by default** with configurable signed URL expiration:

  ```typescript
- // With screenshot options
+ // Basic screenshot (private with tier-specific TTL)
  const job = await client.scrape.create({
    url: "https://example.com",
    screenshot: {
@@ -78,14 +213,52 @@ const job = await client.scrape.create({
      fullPage: true,
      format: "webp",
      quality: 85,
+     signedUrlTtl: 604800, // 7 days (default for Pro/Enterprise)
    },
  });

  const result = await client.waitForResult(job.jobId);
- console.log(result.data.screenshot);
- // → "https://...supabase.co/storage/v1/object/public/screenshots/..."
+ console.log(result.data.screenshotMetadata);
+ // {
+ //   url: "https://...supabase.co/storage/v1/object/sign/screenshots-private/...",
+ //   isPublic: false,
+ //   expiresAt: "2026-01-25T12:00:00Z",
+ //   width: 1280,
+ //   height: 720,
+ //   format: "webp",
+ //   sizeBytes: 125000
+ // }
+
+ // Refresh signed URL before expiration
+ const refreshed = await client.screenshots.refresh({
+   path: "job_abc123/1234567890_nanoid_example.png",
+   ttl: 604800 // Extend for another 7 days
+ });
+ console.log(refreshed.url); // New signed URL
+ console.log(refreshed.expiresAt); // "2026-02-01T12:00:00Z"
+
+ // Get tier-specific TTL limits
+ const limits = await client.screenshots.getLimits();
+ console.log(limits);
+ // {
+ //   tier: "pro",
+ //   limits: { min: 3600, max: 604800, default: 604800 },
+ //   formattedLimits: { min: "1 hour", max: "7 days", default: "7 days" }
+ // }
+
+ // Enterprise: Public URLs (opt-in)
+ const jobPublic = await client.scrape.create({
+   url: "https://example.com",
+   screenshot: {
+     enabled: true,
+     publicUrl: true, // ⚠️ Enterprise only - exposes data publicly
+   },
+ });
+ // → Public URL without expiration (Enterprise tier only)
  ```

+ **Security Note:** Screenshots are private by default to prevent exposure of personal data, copyrighted content, or sensitive tokens. Public URLs require Enterprise tier + explicit opt-in.
+
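+ Because signed URLs expire, check `screenshotMetadata.expiresAt` before reusing a stored URL - a minimal sketch built only on the fields and methods shown in this README:
+
+ ```typescript
+ // Sketch: is the stored signed URL still valid?
+ const meta = result.data.screenshotMetadata;
+ if (new Date(meta.expiresAt).getTime() <= Date.now()) {
+   // Expired - request a fresh signed URL (see "Screenshots - Signed URL Management")
+   const fresh = await client.screenshots.refresh({
+     path: "job_abc123/1234567890_nanoid_example.png", // storage path from your records
+   });
+   console.log(fresh.url);
+ }
+ ```
+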
  ### 🎯 Priority Crawl System

  3-tier queue system automatically prioritizes high-value pages:
@@ -250,6 +423,8 @@ const batch = await client.scrape.batch({
      detectSignals: true,
      timeout: 30000,
    },
+   delayMs: 500, // ✨ Was: delay
+   ignoreInvalidURLs: true, // ✨ NEW: Skip invalid URLs instead of failing
  });

  // Get batch status
@@ -261,14 +436,14 @@ const results = await client.waitForAll(batch.jobIds);

  **Batch Options:**

- | Option | Type | Default | Description |
- | ------------ | ------ | -------- | ----------------------------------------- |
- | `urls` | array | required | 1-100 URL objects with optional overrides |
- | `defaults` | object | - | Default options applied to all URLs |
- | `priority` | number | 5 | Priority 1-10 (higher = faster) |
- | `delay` | number | 0 | Delay between URLs (0-5000ms) |
- | `webhookUrl` | string | - | Webhook for batch completion |
- | `ref` | string | - | Custom reference ID for tracking |
+ | Option | Type | Default | Description |
+ | ------------------- | ------- | -------- | ------------------------------------------------ |
+ | `urls` | array | required | 1-100 URL objects with optional overrides |
+ | `defaults` | object | - | Default options applied to all URLs |
+ | `priority` | number | 5 | Priority 1-10 (higher = faster) |
+ | `delayMs` | number | 0 | Delay between URLs (0-5000ms) |
+ | `webhookUrl` | string | - | Webhook for batch completion |
+ | `ignoreInvalidURLs` | boolean | false | Continue on invalid URLs (Firecrawl-compatible) |

  ### Search - Web Search with AI

@@ -276,7 +451,7 @@ const results = await client.waitForAll(batch.jobIds);
  // Basic search
  const job = await client.search.create({
    query: "laptop deals black friday",
-   maxResults: 20,
+   limit: 20, // ✨ Was: maxResults
  });

  // AI-optimized search with deal scoring
@@ -291,8 +466,8 @@ const job = await client.search.create({
  // Search with auto-scraping of results
  const job = await client.search.create({
    query: "promo codes electronics",
-   autoScrape: true,
-   autoScrapeLimit: 5,
+   scrapeResults: true, // ✨ Was: autoScrape
+   maxScrapeResults: 5, // ✨ Was: autoScrapeLimit
  });

  // Filtered search
@@ -302,7 +477,7 @@ const job = await client.search.create({
      location: "fr",
      language: "fr",
      dateRange: "month",
-     domains: ["amazon.fr", "cdiscount.com"],
+     domain: "amazon.fr", // Single domain filter
    },
  });

@@ -321,14 +496,14 @@ const result = await client.searchAndWait({
  | Option | Type | Default | Description |
  | ------------------- | ------- | -------- | ----------------------------------------------- |
  | `query` | string | required | Search query |
- | `maxResults` | number | 10 | Results to return (1-100) |
+ | `limit` | number | 10 | Results to return (1-100) |
  | `useAiOptimization` | boolean | false | AI-enhance the query |
  | `aiProvider` | string | "openai" | "openai" or "anthropic" |
  | `aiModel` | string | - | Model ID (gpt-4o-mini, claude-3-5-sonnet, etc.) |
  | `useDealScoring` | boolean | false | Score results for deal relevance |
- | `autoScrape` | boolean | false | Auto-scrape top results |
- | `autoScrapeLimit` | number | 3 | Number of results to scrape |
- | `filters` | object | - | Location, language, date, domains |
+ | `scrapeResults` | boolean | false | Auto-scrape top results |
+ | `maxScrapeResults` | number | 5 | Number of results to scrape (1-10) |
+ | `filters` | object | - | Location, language, date, domain |

  ### Crawl - Website Crawling

@@ -553,16 +728,16 @@ const job = await client.agent.withClaude(

  **Action Types:**

- | Action | Key Parameters | Description |
- |--------------|---------------------------------------------------|---------------------------|
- | `click` | `selector`, `waitAfter?`, `button?`, `force?` | Click an element |
- | `scroll` | `direction`, `amount?`, `smooth?` | Scroll page/to element |
- | `write` | `selector`, `text`, `clearFirst?`, `typeDelay?` | Type text into input |
- | `wait` | `milliseconds?`, `selector?`, `condition?` | Wait for time or element |
- | `press` | `key`, `modifiers?` | Press keyboard key |
- | `screenshot` | `fullPage?`, `selector?`, `name?` | Capture screenshot |
- | `hover` | `selector`, `duration?` | Hover over element |
- | `select` | `selector`, `value`, `byLabel?` | Select dropdown option |
+ | Action | Key Parameters | Description |
+ |--------------|---------------------------------------------------|--------------------------|
+ | `click` | `selector`, `waitAfter?`, `button?`, `force?` | Click an element |
+ | `scroll` | `direction`, `amount?`, `smooth?` | Scroll page/to element |
+ | `write` | `selector`, `text`, `clearFirst?`, `typeDelay?` | Type text into input |
+ | `wait` | `milliseconds?`, `selector?`, `condition?` | Wait for time or element |
+ | `press` | `key`, `modifiers?` | Press keyboard key |
+ | `screenshot` | `fullPage?`, `selector?`, `name?` | Capture screenshot |
+ | `hover` | `selector`, `duration?` | Hover over element |
+ | `select` | `selector`, `value`, `byLabel?` | Select dropdown option |
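+
+ A sketch of how these actions might be composed in a single request - note the `actions` parameter name is an assumption here, not confirmed by this README; consult the full Actions documentation for the exact request shape:
+
+ ```typescript
+ // Hypothetical composition: interact with the page, then capture it.
+ const job = await client.scrape.create({
+   url: "https://example.com/deals",
+   actions: [ // assumed parameter name
+     { type: "write", selector: "#search", text: "laptop" },
+     { type: "press", key: "Enter" },
+     { type: "wait", selector: ".results" },
+     { type: "scroll", direction: "down", amount: 1000 },
+     { type: "screenshot", fullPage: true },
+   ],
+ });
+ ```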

  **Action Resilience (all actions support):**

@@ -687,6 +862,44 @@ await client.webhooks.delete(webhookId);
  - `crawl.completed` - Crawl job finished
  - `crawl.failed` - Crawl job failed

+ ### Screenshots - Signed URL Management
+
+ Manage screenshot signed URLs with configurable TTL and automatic refresh:
+
+ ```typescript
+ // Refresh a signed URL before expiration
+ const refreshed = await client.screenshots.refresh({
+   path: "job_abc123/1234567890_nanoid_example.png",
+   ttl: 604800 // Optional: 7 days (defaults to tier default)
+ });
+ console.log(refreshed.url); // New signed URL
+ console.log(refreshed.expiresAt); // "2026-01-25T12:00:00Z"
+ console.log(refreshed.tierLimits); // { min: 3600, max: 604800, default: 604800 }
+
+ // Get tier-specific TTL limits
+ const limits = await client.screenshots.getLimits();
+ console.log(limits.tier); // "pro"
+ console.log(limits.limits); // { min: 3600, max: 604800, default: 604800 }
+ console.log(limits.formattedLimits); // { min: "1 hour", max: "7 days", default: "7 days" }
+
+ // Specify a custom bucket (defaults to 'screenshots-private')
+ const refreshedCustom = await client.screenshots.refresh({
+   path: "job_xyz/screenshot.png",
+   ttl: 86400, // 1 day
+   bucket: "screenshots-private"
+ });
+ ```
+
+ **TTL Limits by Tier:**
+
+ | Tier | Min TTL | Max TTL | Default TTL |
+ |------------|---------|----------|-------------|
+ | Free | 1 hour | 24 hours | 24 hours |
+ | Pro | 1 hour | 7 days | 7 days |
+ | Enterprise | 1 hour | 30 days | 7 days |
+
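+ When accepting a caller-supplied TTL, it can be clamped to these tier limits before calling `refresh()` - a small sketch (the `clampTtl` helper is illustrative, not an SDK method):
+
+ ```typescript
+ // Illustrative: keep a requested TTL (seconds) within the caller's tier limits.
+ async function clampTtl(requested: number): Promise<number> {
+   const { limits } = await client.screenshots.getLimits(); // { min, max, default }
+   return Math.min(Math.max(requested, limits.min), limits.max);
+ }
+
+ const ttl = await clampTtl(30 * 24 * 3600); // 30 days → capped at 604800s (7 days) on Pro
+ await client.screenshots.refresh({ path: "job_abc123/1234567890_nanoid_example.png", ttl });
+ ```
+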
+ **Security Note:** All screenshots are private by default. Public URLs (Enterprise only) never expire, so they don't require refreshing.
+
  ### Keys - API Key Management

  ```typescript
@@ -717,20 +930,18 @@ const stats = await client.keys.getStats(keyId, { days: 30 });

  **Available Scopes:**

- | Scope | Endpoint | Description |
- | ----------------- | ----------------------- | ------------------------ |
- | `scrape` | `POST /v1/scrape` | Create scrape jobs |
- | `scrape:batch` | `POST /v1/scrape/batch` | Create batch scrape jobs |
- | `search` | `POST /v1/search` | Create search jobs |
- | `crawl` | `POST /v1/crawl` | Create crawl jobs |
- | `dork` | `POST /v1/dork` | Create dork searches |
- | `extract` | `POST /v1/extract` | Create extraction jobs |
- | `agent` | `POST /v1/agent` | Create AI agent jobs |
- | `status` | `GET /v1/status/:id` | Read job status |
- | `data:read` | `GET /v1/data/*` | Read jobs/deals |
- | `data:export` | `GET /v1/data/export` | Export data |
- | `keys:manage` | `/v1/keys` | Manage API keys |
- | `webhooks:manage` | `/v1/webhooks` | Manage webhooks |
+ | Scope | Endpoint | Description |
+ | ----------------- | --------------------------------------------------- | -------------------------------------- |
+ | `scrape` | `POST /v1/scrape`, `/v1/scrape/batch`, `/v1/search` | Create scrape, batch, and search jobs |
+ | `crawl` | `POST /v1/crawl` | Create crawl jobs |
+ | `dork` | `POST /v1/dork` | Create dork searches |
+ | `extract` | `POST /v1/extract` | Create extraction jobs |
+ | `agent` | `POST /v1/agent` | Create AI agent jobs |
+ | `status` | `GET /v1/status/:id` | Read job status |
+ | `data:read` | `GET /v1/data/*` | Read jobs/deals |
+ | `data:export` | `GET /v1/data/export` | Export data |
+ | `keys:manage` | `/v1/keys` | Manage API keys |
+ | `webhooks:manage` | `/v1/webhooks` | Manage webhooks |

  **Scope Examples:**

@@ -756,7 +967,6 @@ await client.keys.create({
    "dork",
    "extract",
    "agent",
-   "search",
    "status",
    "data:read",
    "data:export",
@@ -958,9 +1168,12 @@ import type {
    HoverAction,
    SelectAction,

-   // Screenshot Options
+   // Screenshot Options & Responses
    ScreenshotOptions,
    ScreenshotResult,
+   RefreshScreenshotOptions,
+   ScreenshotRefreshResponse,
+   ScreenshotLimitsResponse,

    // Re-exports from @dealcrawl/shared
    ScrapeResult,
@@ -1073,6 +1286,58 @@ const client = new DealCrawl({

  > **Warning:** Never expose your API key in client-side code. Use a backend proxy or edge function.

+ ## Migration Guide (v2.10.x → v2.11.0)
+
+ ### SearchOptions
+
+ ```diff
+ const result = await client.search.create({
+   query: "laptop deals",
+ -  maxResults: 20,
+ +  limit: 20,
+ -  autoScrape: true,
+ +  scrapeResults: true,
+ -  autoScrapeLimit: 5,
+ +  maxScrapeResults: 5,
+ });
+ ```
+
+ ### BatchScrapeOptions
+
+ ```diff
+ const batch = await client.scrape.batch({
+   urls: [...],
+ -  delay: 500,
+ +  delayMs: 500,
+ +  ignoreInvalidURLs: true, // NEW: Firecrawl-compatible
+ });
+ ```
+
+ ### ExtractModel
+
+ ```diff
+ const job = await client.extract.create({
+   url: "...",
+ -  model: "claude-3-haiku",
+ +  model: "claude-3-5-haiku-20241022",
+ });
+ ```
+
+ ### ApiKeyScope
+
+ ```diff
+ await client.keys.create({
+   name: "My Key",
+   scopes: [
+     "scrape",
+ -    "scrape:batch", // REMOVED - use "scrape" instead
+ -    "search", // REMOVED - use "scrape" instead
+     "crawl",
+     "status",
+   ],
+ });
+ ```
+
  ## Compatibility

  - **Node.js**: 18.0+