@dealcrawl/sdk 2.6.0 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -6,6 +6,15 @@ Official TypeScript SDK for the DealCrawl web scraping and crawling API.
  [![TypeScript](https://img.shields.io/badge/TypeScript-5.0+-blue.svg)](https://www.typescriptlang.org/)
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)

+ ## What's New in January 2026 🎉
+
+ - **📸 Screenshot Storage (Phase 4)** - Automatic screenshot capture and storage via Supabase with public URLs
+ - **🎯 Priority Crawl System (Phase 5)** - 3-tier queue system (high/medium/low) based on SmartFrontier deal scores for optimized resource allocation
+ - **🤖 AI Deal Extraction** - LLM-powered deal extraction with customizable score thresholds and automatic database storage
+ - **💾 Enhanced Data Persistence** - New `crawled_pages` and `crawled_deals` tables for comprehensive deal tracking
+ - **📝 Markdown Output** - Convert scraped content to clean Markdown with GFM support
+ - **🎬 Browser Actions** - Execute preset actions (click, scroll, write, etc.) before scraping for dynamic content
+
  ## Features

  - 🚀 **Full API Coverage** - Access all 50+ DealCrawl API endpoints
@@ -40,15 +49,134 @@ const client = new DealCrawl({
    apiKey: process.env.DEALCRAWL_API_KEY!,
  });

- // Scrape a single page with deal extraction
+ // Scrape a single page with deal extraction and screenshot
  const job = await client.scrape.create({
    url: "https://shop.example.com/product",
    extractDeal: true,
+   screenshot: { enabled: true },
+   outputMarkdown: true, // NEW: Get clean markdown output
  });

  // Wait for result with automatic polling
  const result = await client.waitForResult(job.jobId);
- console.log(result);
+ console.log(result.data.parsed.markdown); // Markdown content
+ console.log(result.data.screenshot); // Public screenshot URL
+ ```
+
+ ## January 2026 Features in Detail
+
+ ### 📸 Screenshot Storage
+
+ Automatically capture and store screenshots with public URLs:
+
+ ```typescript
+ // With screenshot options
+ const job = await client.scrape.create({
+   url: "https://example.com",
+   screenshot: {
+     enabled: true,
+     fullPage: true,
+     format: "webp",
+     quality: 85,
+   },
+ });
+
+ const result = await client.waitForResult(job.jobId);
+ console.log(result.data.screenshot);
+ // → "https://...supabase.co/storage/v1/object/public/screenshots/..."
+ ```
+
+ ### 🎯 Priority Crawl System
+
+ A three-tier queue system automatically prioritizes high-value pages:
+
+ ```typescript
+ // Crawl with automatic prioritization
+ const job = await client.crawl.create({
+   url: "https://shop.example.com",
+   extractDeal: true,
+   minDealScore: 50, // Only extract deals scoring 50+
+ });
+
+ // Behind the scenes:
+ // - Pages scoring 70+ → High priority queue (5 workers, 30/min)
+ // - Pages scoring 40-69 → Medium priority queue (10 workers, 60/min)
+ // - Pages scoring <40 → Low priority queue (20 workers, 120/min)
+ ```
+
+ ### 🤖 AI Deal Extraction
+
+ Extract deals with LLM-powered analysis:
+
+ ```typescript
+ // Extract deals during crawl
+ const job = await client.crawl.create({
+   url: "https://marketplace.example.com",
+   extractDeal: true,
+   minDealScore: 30, // Only extract if score >= 30
+   maxPages: 200,
+ });
+
+ // Get extracted deals
+ const deals = await client.status.getDeals(job.jobId, {
+   minScore: 70, // Filter for high-quality deals
+   limit: 50,
+ });
+
+ console.log(deals.deals); // Array of ExtractedDeal objects
+ ```
+
+ ### 📝 Markdown Output
+
+ Convert HTML to clean, structured markdown:
+
+ ```typescript
+ // Single page markdown
+ const job = await client.scrape.create({
+   url: "https://blog.example.com/article",
+   outputMarkdown: true,
+   markdownBaseUrl: "https://blog.example.com", // Resolve relative URLs
+   onlyMainContent: true,
+ });
+
+ const result = await client.waitForResult(job.jobId);
+ console.log(result.data.parsed.markdown);
+ // Clean markdown with:
+ // - GFM tables, strikethrough, task lists
+ // - Code blocks with syntax detection
+ // - Absolute URLs
+ // - Noise removal (ads, navigation)
+ ```
+
+ ### 🎬 Browser Actions
+
+ Execute actions before scraping to handle dynamic content:
+
+ ```typescript
+ // Handle cookie popups and load more content
+ const job = await client.scrape.create({
+   url: "https://shop.example.com/products",
+   actions: [
+     { type: "click", selector: "#accept-cookies", optional: true },
+     { type: "wait", milliseconds: 500 },
+     { type: "scroll", direction: "down", amount: 500 },
+     { type: "click", selector: ".load-more", retries: 3 },
+     { type: "wait", selector: ".products-loaded" },
+   ],
+   extractMultipleDeals: true,
+ });
+
+ // Search and extract
+ const job2 = await client.scrape.create({
+   url: "https://marketplace.com",
+   actions: [
+     { type: "write", selector: "input[name='search']", text: "laptop deals" },
+     { type: "press", key: "Enter" },
+     { type: "wait", selector: ".results" },
+   ],
+   extractMultipleDeals: true,
+   maxDeals: 30,
+ });
  ```

  ## Configuration
@@ -86,25 +214,29 @@ const job = await client.scrape.withScreenshot("https://example.com", {
  ```

  **Options:**
- | Option | Type | Default | Description |
- |--------|------|---------|-------------|
- | `url` | string | required | URL to scrape |
- | `noStore` | boolean | false | Zero Data Retention - don't save results (Pro/Enterprise) |
- | `detectSignals` | boolean | true | Detect prices, discounts, urgency |
- | `extractDeal` | boolean | false | Extract deal information |
- | `extractMultipleDeals` | boolean | false | Extract multiple deals from list pages |
- | `maxDeals` | number | 20 | Max deals to extract (max: 50) |
- | `extractWithAI` | boolean | false | Use AI for extraction |
- | `useAdvancedModel` | boolean | false | Use GPT-4o (higher cost) |
- | `minDealScore` | number | 0 | Minimum deal score (0-100) |
- | `screenshot` | object | - | Screenshot options |
- | `excludeTags` | string[] | - | HTML tags to exclude |
- | `excludeSelectors` | string[] | - | CSS selectors to exclude |
- | `onlyMainContent` | boolean | true | Extract main content only |
- | `headers` | object | - | Custom HTTP headers |
- | `timeout` | number | 30000 | Request timeout in ms (max: 120000) |
-
- ### Batch Scrape - Bulk URL Scraping (NEW)
+
+ | Option                 | Type     | Default  | Description                                                |
+ | ---------------------- | -------- | -------- | ---------------------------------------------------------- |
+ | `url`                  | string   | required | URL to scrape                                              |
+ | `noStore`              | boolean  | false    | Zero Data Retention - don't save results (Pro/Enterprise)  |
+ | `detectSignals`        | boolean  | true     | Detect prices, discounts, urgency                          |
+ | `extractDeal`          | boolean  | false    | Extract deal information                                   |
+ | `extractMultipleDeals` | boolean  | false    | Extract multiple deals from list pages                     |
+ | `maxDeals`             | number   | 20       | Max deals to extract (max: 50)                             |
+ | `extractWithAI`        | boolean  | false    | Use AI for extraction                                      |
+ | `useAdvancedModel`     | boolean  | false    | Use GPT-4o (higher cost)                                   |
+ | `minDealScore`         | number   | 0        | Minimum deal score (0-100)                                 |
+ | `screenshot`           | object   | -        | Screenshot options                                         |
+ | `excludeTags`          | string[] | -        | HTML tags to exclude                                       |
+ | `excludeSelectors`     | string[] | -        | CSS selectors to exclude                                   |
+ | `onlyMainContent`      | boolean  | true     | Extract main content only                                  |
+ | `headers`              | object   | -        | Custom HTTP headers                                        |
+ | `timeout`              | number   | 30000    | Request timeout in ms (max: 120000)                        |
+ | `outputMarkdown`       | boolean  | false    | Convert content to Markdown (GFM)                          |
+ | `markdownBaseUrl`      | string   | -        | Base URL for resolving relative URLs in markdown           |
+ | `actions`              | array    | -        | Browser actions to execute before scraping                 |
+
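+ For illustration, a minimal sketch combining several of the options above into one request (the URL, selectors, and header values are hypothetical):
+
+ ```typescript
+ // Illustrative only: strip noisy markup and send custom headers
+ const job = await client.scrape.create({
+   url: "https://shop.example.com/product",
+   excludeTags: ["script", "style", "iframe"],
+   excludeSelectors: [".cookie-banner", "#newsletter-popup"],
+   headers: { "Accept-Language": "en-US" },
+   timeout: 60000, // well under the 120000 ms maximum
+ });
+ ```
+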
+ ### Batch Scrape - Bulk URL Scraping

  ```typescript
  // Scrape multiple URLs in one request (1-100 URLs)
@@ -128,16 +260,17 @@ const results = await client.waitForAll(batch.jobIds);
  ```

  **Batch Options:**
- | Option | Type | Default | Description |
- |--------|------|---------|-------------|
- | `urls` | array | required | 1-100 URL objects with optional overrides |
- | `defaults` | object | - | Default options applied to all URLs |
- | `priority` | number | 5 | Priority 1-10 (higher = faster) |
- | `delay` | number | 0 | Delay between URLs (0-5000ms) |
- | `webhookUrl` | string | - | Webhook for batch completion |
- | `ref` | string | - | Custom reference ID for tracking |

- ### Search - Web Search with AI (NEW)
+ | Option       | Type   | Default  | Description                               |
+ | ------------ | ------ | -------- | ----------------------------------------- |
+ | `urls`       | array  | required | 1-100 URL objects with optional overrides |
+ | `defaults`   | object | -        | Default options applied to all URLs       |
+ | `priority`   | number | 5        | Priority 1-10 (higher = faster)           |
+ | `delay`      | number | 0        | Delay between URLs (0-5000ms)             |
+ | `webhookUrl` | string | -        | Webhook for batch completion              |
+ | `ref`        | string | -        | Custom reference ID for tracking          |
+
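+ As a rough illustration, the options above combine into a single payload like the one below (URLs, webhook, and reference values are hypothetical); pass it to the batch scrape call shown in the example above:
+
+ ```typescript
+ // Illustrative payload only - shared defaults plus pacing and tracking
+ const batchOptions = {
+   urls: [{ url: "https://shop.example.com/a" }, { url: "https://shop.example.com/b" }],
+   defaults: { extractDeal: true }, // applied to every URL unless overridden
+   priority: 8, // 1-10, higher = faster
+   delay: 1000, // 0-5000 ms between URLs
+   webhookUrl: "https://example.com/hooks/batch-done",
+   ref: "winter-sale-run",
+ };
+ ```
+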
+ ### Search - Web Search with AI

  ```typescript
  // Basic search
@@ -184,17 +317,18 @@ const result = await client.searchAndWait({
  ```

  **Search Options:**
- | Option | Type | Default | Description |
- |--------|------|---------|-------------|
- | `query` | string | required | Search query |
- | `maxResults` | number | 10 | Results to return (1-100) |
- | `useAiOptimization` | boolean | false | AI-enhance the query |
- | `aiProvider` | string | "openai" | "openai" or "anthropic" |
- | `aiModel` | string | - | Model ID (gpt-4o-mini, claude-3-5-sonnet, etc.) |
- | `useDealScoring` | boolean | false | Score results for deal relevance |
- | `autoScrape` | boolean | false | Auto-scrape top results |
- | `autoScrapeLimit` | number | 3 | Number of results to scrape |
- | `filters` | object | - | Location, language, date, domains |
+
+ | Option              | Type    | Default  | Description                                     |
+ | ------------------- | ------- | -------- | ----------------------------------------------- |
+ | `query`             | string  | required | Search query                                    |
+ | `maxResults`        | number  | 10       | Results to return (1-100)                       |
+ | `useAiOptimization` | boolean | false    | AI-enhance the query                            |
+ | `aiProvider`        | string  | "openai" | "openai" or "anthropic"                         |
+ | `aiModel`           | string  | -        | Model ID (gpt-4o-mini, claude-3-5-sonnet, etc.) |
+ | `useDealScoring`    | boolean | false    | Score results for deal relevance                |
+ | `autoScrape`        | boolean | false    | Auto-scrape top results                         |
+ | `autoScrapeLimit`   | number  | 3        | Number of results to scrape                     |
+ | `filters`           | object  | -        | Location, language, date, domains               |
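+
+ For illustration, a sketch of a filtered, AI-optimized search (the query, domain, and filter field names are hypothetical; the table above only lists the filter dimensions):
+
+ ```typescript
+ // Illustrative only: AI-optimized query with deal scoring and filters
+ const result = await client.searchAndWait({
+   query: "standing desk discount",
+   maxResults: 20,
+   useAiOptimization: true,
+   useDealScoring: true,
+   filters: { language: "en", domains: ["example.com"] },
+ });
+ ```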

  ### Crawl - Website Crawling

@@ -253,26 +387,30 @@ const job = await client.crawl.create({
  - `custom` - No preset, use your own settings

  **Crawl Options:**
- | Option | Type | Default | Description |
- |--------|------|---------|-------------|
- | `url` | string | required | Starting URL |
- | `maxDepth` | number | 3 | Max crawl depth (1-5) |
- | `maxPages` | number | 100 | Max pages to crawl (1-1000) |
- | `detectSignals` | boolean | true | Detect prices, discounts |
- | `extractDeal` | boolean | false | Extract deal info with AI |
- | `minDealScore` | number | 30 | Min deal score threshold (0-100) |
- | `categories` | array | - | Filter: courses, software, physical, services, other |
- | `priceRange` | object | - | Filter: { min, max } price |
- | `onlyHighQuality` | boolean | false | Only deals scoring 70+ |
- | `allowedMerchants` | string[] | - | Only these merchants |
- | `blockedMerchants` | string[] | - | Exclude these merchants |
- | `webhookUrl` | string | - | Real-time notifications URL |
- | `syncToDealup` | boolean | false | Auto-sync to DealUp |
- | `template` | string | - | Job template to use |
- | `useSmartRouting` | boolean | true | Auto-detect best settings |
- | `priority` | string | - | Queue priority (Enterprise only) |
- | `requireJS` | boolean | false | Force JavaScript rendering |
- | `bypassAntiBot` | boolean | false | Advanced anti-bot techniques |
+
+ | Option             | Type     | Default  | Description                                           |
+ | ------------------ | -------- | -------- | ----------------------------------------------------- |
+ | `url`              | string   | required | Starting URL                                          |
+ | `maxDepth`         | number   | 3        | Max crawl depth (1-5)                                 |
+ | `maxPages`         | number   | 100      | Max pages to crawl (1-1000)                           |
+ | `detectSignals`    | boolean  | true     | Detect prices, discounts                              |
+ | `extractDeal`      | boolean  | false    | Extract deal info with AI                             |
+ | `minDealScore`     | number   | 30       | Min deal score threshold (0-100)                      |
+ | `categories`       | array    | -        | Filter: courses, software, physical, services, other  |
+ | `priceRange`       | object   | -        | Filter: { min, max } price                            |
+ | `onlyHighQuality`  | boolean  | false    | Only deals scoring 70+                                |
+ | `allowedMerchants` | string[] | -        | Only these merchants                                  |
+ | `blockedMerchants` | string[] | -        | Exclude these merchants                               |
+ | `webhookUrl`       | string   | -        | Real-time notifications URL                           |
+ | `syncToDealup`     | boolean  | false    | Auto-sync to DealUp                                   |
+ | `template`         | string   | -        | Job template to use                                   |
+ | `useSmartRouting`  | boolean  | true     | Auto-detect best settings                             |
+ | `priority`         | string   | -        | Queue priority (Enterprise only)                      |
+ | `requireJS`        | boolean  | false    | Force JavaScript rendering                            |
+ | `bypassAntiBot`    | boolean  | false    | Advanced anti-bot techniques                          |
+ | `outputMarkdown`   | boolean  | false    | Convert pages to Markdown (GFM)                       |
+ | `markdownBaseUrl`  | string   | -        | Base URL for relative links in markdown               |
+ | `noStore`          | boolean  | false    | Zero Data Retention (Pro/Enterprise only)             |
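+
+ A minimal sketch of the new crawl options in context (the URL and webhook endpoint are hypothetical):
+
+ ```typescript
+ // Illustrative only: Markdown output, Zero Data Retention, and a completion webhook
+ const job = await client.crawl.create({
+   url: "https://shop.example.com",
+   maxDepth: 2,
+   maxPages: 50,
+   extractDeal: true,
+   onlyHighQuality: true, // keep only deals scoring 70+
+   outputMarkdown: true,
+   markdownBaseUrl: "https://shop.example.com",
+   noStore: true, // Pro/Enterprise only
+   webhookUrl: "https://example.com/hooks/crawl-complete",
+ });
+ ```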

  ### Extract - LLM-Based Extraction

@@ -327,7 +465,7 @@ const query = client.dork.buildQuery({
  // Returns: "laptop deals site:amazon.com intitle:discount"
  ```

- ### Agent - AI Autonomous Navigation (NEW)
+ ### Agent - AI Autonomous Navigation

  Create AI agents that can navigate websites, interact with elements, and extract structured data using natural language instructions.

@@ -335,7 +473,8 @@ Create AI agents that can navigate websites, interact with elements, and extract
  // Basic agent - navigate and extract data
  const job = await client.agent.create({
    url: "https://amazon.com",
-   prompt: "Search for wireless headphones under $50 and extract the top 5 results",
+   prompt:
+     "Search for wireless headphones under $50 and extract the top 5 results",
    schema: {
      type: "object",
      properties: {
@@ -399,28 +538,64 @@ const job = await client.agent.withClaude(
  ```

  **Agent Options:**
- | Option | Type | Default | Description |
- |--------|------|---------|-------------|
- | `url` | string | required | Starting URL |
- | `prompt` | string | required | Natural language instructions (10-2000 chars) |
- | `schema` | object | - | JSON Schema for structured output |
- | `maxSteps` | number | 10 | Maximum navigation steps (max: 25) |
- | `actions` | array | - | Preset actions to execute first |
- | `model` | string | "openai" | LLM provider: "openai" or "anthropic" |
- | `timeout` | number | 30000 | Per-step timeout in ms (max: 60000) |
- | `takeScreenshots` | boolean | false | Capture screenshot at each step |
- | `onlyMainContent` | boolean | true | Extract main content only |
+
+ | Option            | Type    | Default  | Description                                    |
+ | ----------------- | ------- | -------- | ---------------------------------------------- |
+ | `url`             | string  | required | Starting URL                                   |
+ | `prompt`          | string  | required | Natural language instructions (10-2000 chars)  |
+ | `schema`          | object  | -        | JSON Schema for structured output              |
+ | `maxSteps`        | number  | 10       | Maximum navigation steps (max: 25)             |
+ | `actions`         | array   | -        | Preset actions to execute first                |
+ | `model`           | string  | "openai" | LLM provider: "openai" or "anthropic"          |
+ | `timeout`         | number  | 30000    | Per-step timeout in ms (max: 60000)            |
+ | `takeScreenshots` | boolean | false    | Capture screenshot at each step                |
+ | `onlyMainContent` | boolean | true     | Extract main content only                      |

  **Action Types:**

- - `click` - Click an element
- - `scroll` - Scroll page or to element
- - `write` - Type text into input
- - `wait` - Wait for time or element
- - `press` - Press keyboard key
- - `screenshot` - Capture screenshot
- - `hover` - Hover over element
- - `select` - Select dropdown option
+ | Action       | Key Parameters                                  | Description              |
+ | ------------ | ----------------------------------------------- | ------------------------ |
+ | `click`      | `selector`, `waitAfter?`, `button?`, `force?`   | Click an element         |
+ | `scroll`     | `direction`, `amount?`, `smooth?`               | Scroll page/to element   |
+ | `write`      | `selector`, `text`, `clearFirst?`, `typeDelay?` | Type text into input     |
+ | `wait`       | `milliseconds?`, `selector?`, `condition?`      | Wait for time or element |
+ | `press`      | `key`, `modifiers?`                             | Press keyboard key       |
+ | `screenshot` | `fullPage?`, `selector?`, `name?`               | Capture screenshot       |
+ | `hover`      | `selector`, `duration?`                         | Hover over element       |
+ | `select`     | `selector`, `value`, `byLabel?`                 | Select dropdown option   |
+
+ **Action Resilience (all actions support):**
+
+ - `optional: boolean` - Don't fail job if action fails
+ - `retries: number` - Retry failed action (1-5 times)
+ - `delayBefore: number` - Delay before executing action (ms)
+
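+ For illustration, a sketch of resilience settings on preset actions passed to an agent (the URL, prompt, and selectors are hypothetical):
+
+ ```typescript
+ // Illustrative only: tolerate a missing cookie banner, retry a flaky button, delay a wait
+ const job = await client.agent.create({
+   url: "https://shop.example.com",
+   prompt: "Collect the product names and prices visible on this page",
+   actions: [
+     { type: "click", selector: "#accept-cookies", optional: true }, // don't fail if absent
+     { type: "click", selector: ".load-more", retries: 3 }, // retry up to 3 times
+     { type: "wait", selector: ".products-loaded", delayBefore: 500 }, // pause 500 ms first
+   ],
+ });
+ ```
+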
+ **Schema Generation:**
+
+ ```typescript
+ // Generate JSON Schema from natural language
+ const schemaResult = await client.agent.generateSchema({
+   prompt: "Find e-commerce product deals with prices and discounts",
+   context: {
+     domains: ["e-commerce", "retail"], // Help AI understand context
+     dataTypes: ["prices", "discounts"], // Expected data types
+     format: "json", // Output format
+     clarifications: ["Include shipping info"], // Additional requirements
+   },
+ });
+
+ // Use the generated schema
+ const job = await client.agent.create({
+   url: "https://shop.example.com",
+   prompt: schemaResult.refinedPrompt, // AI-improved prompt
+   schema: schemaResult.schema, // Generated JSON Schema
+ });
+
+ // Check confidence - if low, ask clarifying questions
+ if (schemaResult.confidence < 0.7) {
+   console.log("Consider clarifying:", schemaResult.suggestedQuestions);
+ }
+ ```

  ### Status - Job Management

@@ -650,7 +825,7 @@ const result = await client.crawlAndWait({
  });
  ```

- ## Field Selection (NEW)
+ ## Field Selection

  Reduce response payload size by selecting only the fields you need:

@@ -670,6 +845,22 @@ const deals = await client.data.listDeals({
  const jobs = await client.data.listJobs({
    fields: ["id", "status", "result.deals.title", "result.deals.price"],
  });
+
+ // Agent job field selection
+ const agentStatus = await client.status.get(agentJobId, {
+   fields: [
+     "id",
+     "status",
+     "data.extractedData", // Final extracted data
+     "data.steps.action", // Just action details (skip observations)
+     "data.totalSteps",
+   ],
+ });
+
+ // Markdown content selection
+ const scrapeResult = await client.status.get(scrapeJobId, {
+   fields: ["id", "status", "result.parsed.markdown", "result.parsed.title"],
+ });
  ```

  **Benefits:**
@@ -757,11 +948,26 @@ import type {
    SearchJobResponse,
    BatchScrapeResponse,

+   // Action Types
+   ActionInput,
+   ClickAction,
+   ScrollAction,
+   WriteAction,
+   WaitAction,
+   PressAction,
+   HoverAction,
+   SelectAction,
+
+   // Screenshot Options
+   ScreenshotOptions,
+   ScreenshotResult,
+
    // Re-exports from @dealcrawl/shared
    ScrapeResult,
    CrawlResult,
    ExtractedDeal,
    Signal,
+   ParsedPage, // Includes markdown field
  } from "@dealcrawl/sdk";
  ```
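+
+ A small sketch of putting the exported types to work (assuming `ActionInput` is the union of the action shapes shown above; the selector is hypothetical):
+
+ ```typescript
+ import type { ActionInput } from "@dealcrawl/sdk";
+
+ // Illustrative only: a reusable, fully typed action list
+ const dismissCookies: ActionInput[] = [
+   { type: "click", selector: "#accept-cookies", optional: true },
+   { type: "wait", milliseconds: 500 },
+ ];
+ ```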
767
973
 
@@ -876,4 +1082,6 @@ const client = new DealCrawl({

  ## License

+ By @Shipfastgo
+
  MIT © [DealUp](https://dealup.cc)