@upcrawl/sdk 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,266 @@
1
+ # Upcrawl
2
+
3
+ Official Node.js/Browser SDK for the [Upcrawl](https://upcrawl.dev) API. Extract data from any website with a single API call.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ npm install upcrawl
9
+ ```
10
+
11
+ Or with yarn:
12
+
13
+ ```bash
14
+ yarn add upcrawl
15
+ ```
16
+
17
+ ## Quick Start
18
+
19
+ ```typescript
20
+ import Upcrawl from 'upcrawl';
21
+
22
+ // Set your API key (get one at https://upcrawl.dev)
23
+ Upcrawl.setApiKey('uc-your-api-key');
24
+
25
+ // Scrape a webpage
26
+ const result = await Upcrawl.scrape({
27
+ url: 'https://example.com',
28
+ type: 'markdown'
29
+ });
30
+
31
+ console.log(result.markdown);
32
+ ```
33
+
34
+ ## Usage
35
+
36
+ ### Setting API Key
37
+
38
+ The API key must be set before making any requests:
39
+
40
+ ```typescript
41
+ import Upcrawl from 'upcrawl';
42
+
43
+ Upcrawl.setApiKey('uc-your-api-key');
44
+ ```
45
+
46
+ Or using named imports:
47
+
48
+ ```typescript
49
+ import { setApiKey } from 'upcrawl';
50
+
51
+ setApiKey('uc-your-api-key');
52
+ ```
53
+
54
+ ### Scraping a Single URL
55
+
56
+ ```typescript
57
+ import Upcrawl from 'upcrawl';
58
+
59
+ Upcrawl.setApiKey('uc-your-api-key');
60
+
61
+ const result = await Upcrawl.scrape({
62
+ url: 'https://example.com',
63
+ type: 'markdown', // 'markdown' or 'html'
64
+ onlyMainContent: true, // Remove nav, ads, footers
65
+ extractMetadata: true // Get title, description, etc.
66
+ });
67
+
68
+ console.log(result.markdown);
69
+ console.log(result.metadata?.title);
70
+ ```
71
+
72
+ ### Batch Scraping
73
+
74
+ Scrape multiple URLs in a single request:
75
+
76
+ ```typescript
77
+ const result = await Upcrawl.batchScrape({
78
+ urls: [
79
+ 'https://example.com/page1',
80
+ 'https://example.com/page2',
81
+ // You can also pass detailed options per URL:
82
+ { url: 'https://example.com/page3', type: 'html' }
83
+ ],
84
+ type: 'markdown'
85
+ });
86
+
87
+ console.log(`Scraped ${result.successful} of ${result.total} pages`);
88
+
89
+ result.results.forEach(page => {
90
+ if (page.success) {
91
+ console.log(`${page.url}: ${page.markdown?.length} chars`);
92
+ } else {
93
+ console.log(`${page.url}: Failed - ${page.error}`);
94
+ }
95
+ });
96
+ ```
97
+
98
+ ### Web Search
99
+
100
+ Search the web and get structured results:
101
+
102
+ ```typescript
103
+ const result = await Upcrawl.search({
104
+ queries: ['latest AI news 2025'],
105
+ limit: 10,
106
+ location: 'US'
107
+ });
108
+
109
+ result.results.forEach(queryResult => {
110
+ console.log(`Query: ${queryResult.query}`);
111
+ queryResult.results.forEach(item => {
112
+ console.log(`- ${item.title}`);
113
+ console.log(` ${item.url}`);
114
+ });
115
+ });
116
+ ```
117
+
118
+ ### Domain Filtering
119
+
120
+ Filter search results by domain:
121
+
122
+ ```typescript
123
+ // Only include specific domains
124
+ const result = await Upcrawl.search({
125
+ queries: ['machine learning tutorials'],
126
+ includeDomains: ['medium.com', 'towardsdatascience.com']
127
+ });
128
+
129
+ // Or exclude domains
130
+ const result2 = await Upcrawl.search({
131
+ queries: ['javascript frameworks'],
132
+ excludeDomains: ['pinterest.com', 'quora.com']
133
+ });
134
+ ```
135
+
136
+ ### LLM Summarization
137
+
138
+ Ask the API to summarize scraped content:
139
+
140
+ ```typescript
141
+ const result = await Upcrawl.scrape({
142
+ url: 'https://example.com/product',
143
+ type: 'markdown',
144
+ summary: {
145
+ query: 'Extract the product name, price, and key features in JSON format'
146
+ }
147
+ });
148
+
149
+ console.log(result.content); // Summarized content
150
+ ```
151
+
152
+ ## Configuration
153
+
154
+ ### Custom Base URL
155
+
156
+ For self-hosted instances or testing:
157
+
158
+ ```typescript
159
+ Upcrawl.setBaseUrl('https://your-instance.com/v1');
160
+ ```
161
+
162
+ ### Request Timeout
163
+
164
+ Set a custom timeout (in milliseconds):
165
+
166
+ ```typescript
167
+ Upcrawl.setTimeout(60000); // 60 seconds
168
+ ```
169
+
170
+ ### Configure Multiple Options
171
+
172
+ ```typescript
173
+ Upcrawl.configure({
174
+ apiKey: 'uc-your-api-key',
175
+ baseUrl: 'https://api.upcrawl.dev/v1',
176
+ timeout: 120000
177
+ });
178
+ ```
179
+
180
+ ## Error Handling
181
+
182
+ The SDK throws `UpcrawlError` for API errors:
183
+
184
+ ```typescript
185
+ import Upcrawl, { UpcrawlError } from 'upcrawl';
186
+
187
+ try {
188
+ const result = await Upcrawl.scrape({ url: 'https://example.com' });
189
+ } catch (error) {
190
+ if (error instanceof UpcrawlError) {
191
+ console.error(`Error ${error.status}: ${error.message}`);
192
+ console.error(`Code: ${error.code}`);
193
+ }
194
+ }
195
+ ```
196
+
197
+ Common error codes:
198
+
199
+ | Status | Code | Description |
200
+ |--------|------|-------------|
201
+ | 401 | `UNAUTHORIZED` | Invalid or missing API key |
202
+ | 403 | `FORBIDDEN` | Access forbidden |
203
+ | 429 | `RATE_LIMIT_EXCEEDED` | Too many requests |
204
+ | 500 | `INTERNAL_ERROR` | Server error |
205
+
206
+ ## TypeScript Support
207
+
208
+ The SDK includes full TypeScript definitions:
209
+
210
+ ```typescript
211
+ import Upcrawl, {
212
+ ScrapeOptions,
213
+ ScrapeResponse,
214
+ SearchOptions,
215
+ SearchResponse,
216
+ BatchScrapeOptions,
217
+ BatchScrapeResponse
218
+ } from 'upcrawl';
219
+
220
+ const options: ScrapeOptions = {
221
+ url: 'https://example.com',
222
+ type: 'markdown'
223
+ };
224
+
225
+ const result: ScrapeResponse = await Upcrawl.scrape(options);
226
+ ```
227
+
228
+ ## API Reference
229
+
230
+ ### Methods
231
+
232
+ | Method | Description |
233
+ |--------|-------------|
234
+ | `Upcrawl.setApiKey(key)` | Set the API key globally |
235
+ | `Upcrawl.setBaseUrl(url)` | Set custom base URL |
236
+ | `Upcrawl.setTimeout(ms)` | Set request timeout |
237
+ | `Upcrawl.configure(config)` | Configure multiple options |
238
+ | `Upcrawl.scrape(options)` | Scrape a single URL |
239
+ | `Upcrawl.batchScrape(options)` | Scrape multiple URLs |
240
+ | `Upcrawl.search(options)` | Search the web |
241
+
242
+ ### Scrape Options
243
+
244
+ | Option | Type | Description |
245
+ |--------|------|-------------|
246
+ | `url` | `string` | URL to scrape (required) |
247
+ | `type` | `'markdown' \| 'html'` | Output format |
248
+ | `onlyMainContent` | `boolean` | Remove nav, ads, footers |
249
+ | `extractMetadata` | `boolean` | Extract page metadata |
250
+ | `timeoutMs` | `number` | Request timeout (1000-120000) |
251
+ | `waitUntil` | `string` | Page load strategy |
252
+ | `summary` | `{ query: string }` | LLM summarization |
253
+
254
+ ### Search Options
255
+
256
+ | Option | Type | Description |
257
+ |--------|------|-------------|
258
+ | `queries` | `string[]` | Search queries (1-20) |
259
+ | `limit` | `number` | Results per query (1-100) |
260
+ | `location` | `string` | Search location (e.g., 'US') |
261
+ | `includeDomains` | `string[]` | Only include these domains |
262
+ | `excludeDomains` | `string[]` | Exclude these domains |
263
+
264
+ ## License
265
+
266
+ MIT
package/dist/index.d.ts ADDED
@@ -0,0 +1,373 @@
1
+ /**
2
+ * Upcrawl SDK Types
3
+ * Type definitions for all API requests and responses
4
+ */
5
+ interface UpcrawlConfig {
6
+ apiKey?: string;
7
+ baseUrl?: string;
8
+ timeout?: number;
9
+ }
10
+ interface SummaryQuery {
11
+ /** Query/instruction for content summarization */
12
+ query: string;
13
+ }
14
+ interface ScrapeOptions {
15
+ /** URL to scrape (required) */
16
+ url: string;
17
+ /** Output format: html or markdown. Defaults to "html" */
18
+ type?: 'html' | 'markdown';
19
+ /** Extract only main content (removes nav, ads, footers). Defaults to true */
20
+ onlyMainContent?: boolean;
21
+ /** Whether to extract page metadata */
22
+ extractMetadata?: boolean;
23
+ /** Summary query for LLM summarization */
24
+ summary?: SummaryQuery;
25
+ /** Custom timeout in milliseconds (1000-120000) */
26
+ timeoutMs?: number;
27
+ /** Wait strategy for page load */
28
+ waitUntil?: 'load' | 'domcontentloaded' | 'networkidle';
29
+ }
30
+ interface ScrapeMetadata {
31
+ title?: string;
32
+ description?: string;
33
+ canonicalUrl?: string;
34
+ finalUrl?: string;
35
+ contentType?: string;
36
+ contentLength?: number;
37
+ }
38
+ interface ScrapeResponse {
39
+ /** Original URL that was scraped */
40
+ url: string;
41
+ /** Rendered HTML content (when type is html) */
42
+ html?: string | null;
43
+ /** Content converted to Markdown (when type is markdown) */
44
+ markdown?: string | null;
45
+ /** HTTP status code */
46
+ statusCode: number | null;
47
+ /** Whether scraping was successful */
48
+ success: boolean;
49
+ /** Error message if scraping failed */
50
+ error?: string;
51
+ /** ISO timestamp when scraping completed */
52
+ timestamp: string;
53
+ /** Time taken to load and render the page in milliseconds */
54
+ loadTimeMs: number;
55
+ /** Additional page metadata */
56
+ metadata?: ScrapeMetadata;
57
+ /** Number of retry attempts made */
58
+ retryCount: number;
59
+ /** Cost in USD for this scrape operation */
60
+ cost?: number;
61
+ /** Content after summarization (when summary query provided) */
62
+ content?: string | null;
63
+ }
64
+ interface BatchScrapeOptions {
65
+ /** Array of URLs to scrape (strings or detailed request objects) */
66
+ urls: (string | ScrapeOptions)[];
67
+ /** Output format: html or markdown */
68
+ type?: 'html' | 'markdown';
69
+ /** Extract only main content (removes nav, ads, footers) */
70
+ onlyMainContent?: boolean;
71
+ /** Summary query for LLM summarization */
72
+ summary?: SummaryQuery;
73
+ /** Global timeout for entire batch operation in milliseconds (10000-600000) */
74
+ batchTimeoutMs?: number;
75
+ /** Whether to stop on first error */
76
+ failFast?: boolean;
77
+ }
78
+ interface BatchScrapeResponse {
79
+ /** Array of scrape results */
80
+ results: ScrapeResponse[];
81
+ /** Total number of URLs processed */
82
+ total: number;
83
+ /** Number of successful scrapes */
84
+ successful: number;
85
+ /** Number of failed scrapes */
86
+ failed: number;
87
+ /** Total time taken for batch operation in milliseconds */
88
+ totalTimeMs: number;
89
+ /** Timestamp when batch operation completed */
90
+ timestamp: string;
91
+ /** Total cost in USD for all scrape operations */
92
+ cost?: number;
93
+ }
94
+ interface SearchOptions {
95
+ /** Array of search queries to execute (1-20) */
96
+ queries: string[];
97
+ /** Number of results per query (1-100). Defaults to 10 */
98
+ limit?: number;
99
+ /** Location for search (e.g., "IN", "US") */
100
+ location?: string;
101
+ /** Domains to include (will add site: to query) */
102
+ includeDomains?: string[];
103
+ /** Domains to exclude (will add -site: to query) */
104
+ excludeDomains?: string[];
105
+ }
106
+ interface SearchResultWeb {
107
+ /** URL of the search result */
108
+ url: string;
109
+ /** Title of the search result */
110
+ title: string;
111
+ /** Description/snippet of the search result */
112
+ description: string;
113
+ }
114
+ interface SearchResultItem {
115
+ /** The search query */
116
+ query: string;
117
+ /** Whether the search was successful */
118
+ success: boolean;
119
+ /** Parsed search result links */
120
+ results: SearchResultWeb[];
121
+ /** Error message if failed */
122
+ error?: string;
123
+ /** Time taken in milliseconds */
124
+ loadTimeMs?: number;
125
+ /** Cost in USD for this query */
126
+ cost?: number;
127
+ }
128
+ interface SearchResponse {
129
+ /** Array of search results per query */
130
+ results: SearchResultItem[];
131
+ /** Total number of queries */
132
+ total: number;
133
+ /** Number of successful searches */
134
+ successful: number;
135
+ /** Number of failed searches */
136
+ failed: number;
137
+ /** Total time in milliseconds */
138
+ totalTimeMs: number;
139
+ /** ISO timestamp */
140
+ timestamp: string;
141
+ /** Total cost in USD */
142
+ cost?: number;
143
+ }
144
+ interface UpcrawlErrorResponse {
145
+ error: {
146
+ code: string;
147
+ message: string;
148
+ };
149
+ statusCode?: number;
150
+ }
151
+ declare class UpcrawlError extends Error {
152
+ readonly status: number;
153
+ readonly code: string;
154
+ constructor(message: string, status: number, code?: string);
155
+ }
156
+
157
+ /**
158
+ * Upcrawl API Client
159
+ * Handles all HTTP communication with the Upcrawl API
160
+ */
161
+
162
+ /**
163
+ * Set the API key globally
164
+ * @param apiKey - Your Upcrawl API key (starts with 'uc-')
165
+ */
166
+ declare function setApiKey(apiKey: string): void;
167
+ /**
168
+ * Set a custom base URL (useful for self-hosted or testing)
169
+ * @param baseUrl - Custom API base URL
170
+ */
171
+ declare function setBaseUrl(baseUrl: string): void;
172
+ /**
173
+ * Set request timeout in milliseconds
174
+ * @param timeout - Timeout in milliseconds
175
+ */
176
+ declare function setTimeout(timeout: number): void;
177
+ /**
178
+ * Configure multiple options at once
179
+ * @param config - Configuration object
180
+ */
181
+ declare function configure(config: UpcrawlConfig): void;
182
+ /**
183
+ * Get current configuration (for debugging)
184
+ */
185
+ declare function getConfig(): Omit<UpcrawlConfig, 'apiKey'> & {
186
+ apiKeySet: boolean;
187
+ };
188
+ /**
189
+ * Reset configuration to defaults
190
+ */
191
+ declare function resetConfig(): void;
192
+ /**
193
+ * Scrape a single URL
194
+ * @param options - Scrape options including the URL to scrape
195
+ * @returns Promise with scrape response
196
+ *
197
+ * @example
198
+ * ```typescript
199
+ * import { scrape, setApiKey } from 'upcrawl';
200
+ *
201
+ * setApiKey('uc-your-api-key');
202
+ *
203
+ * const result = await scrape({
204
+ * url: 'https://example.com',
205
+ * type: 'markdown',
206
+ * onlyMainContent: true
207
+ * });
208
+ *
209
+ * console.log(result.markdown);
210
+ * ```
211
+ */
212
+ declare function scrape(options: ScrapeOptions): Promise<ScrapeResponse>;
213
+ /**
214
+ * Scrape multiple URLs in a batch
215
+ * @param options - Batch scrape options including URLs to scrape
216
+ * @returns Promise with batch scrape response
217
+ *
218
+ * @example
219
+ * ```typescript
220
+ * import { batchScrape, setApiKey } from 'upcrawl';
221
+ *
222
+ * setApiKey('uc-your-api-key');
223
+ *
224
+ * const result = await batchScrape({
225
+ * urls: [
226
+ * 'https://example.com/page1',
227
+ * 'https://example.com/page2',
228
+ * { url: 'https://example.com/page3', type: 'html' }
229
+ * ],
230
+ * type: 'markdown'
231
+ * });
232
+ *
233
+ * console.log(`Scraped ${result.successful} of ${result.total} pages`);
234
+ * ```
235
+ */
236
+ declare function batchScrape(options: BatchScrapeOptions): Promise<BatchScrapeResponse>;
237
+ /**
238
+ * Search the web
239
+ * @param options - Search options including queries
240
+ * @returns Promise with search response
241
+ *
242
+ * @example
243
+ * ```typescript
244
+ * import { search, setApiKey } from 'upcrawl';
245
+ *
246
+ * setApiKey('uc-your-api-key');
247
+ *
248
+ * const result = await search({
249
+ * queries: ['latest AI news 2025'],
250
+ * limit: 10,
251
+ * location: 'US'
252
+ * });
253
+ *
254
+ * result.results.forEach(queryResult => {
255
+ * console.log(`Query: ${queryResult.query}`);
256
+ * queryResult.results.forEach(item => {
257
+ * console.log(`- ${item.title}: ${item.url}`);
258
+ * });
259
+ * });
260
+ * ```
261
+ */
262
+ declare function search(options: SearchOptions): Promise<SearchResponse>;
263
+
264
+ /**
265
+ * Upcrawl SDK
266
+ * Official Node.js/Browser SDK for the Upcrawl API
267
+ *
268
+ * @example
269
+ * ```typescript
270
+ * // Using the Upcrawl namespace (recommended)
271
+ * import Upcrawl from 'upcrawl';
272
+ *
273
+ * Upcrawl.setApiKey('uc-your-api-key');
274
+ *
275
+ * const result = await Upcrawl.scrape({
276
+ * url: 'https://example.com',
277
+ * type: 'markdown'
278
+ * });
279
+ * ```
280
+ *
281
+ * @example
282
+ * ```typescript
283
+ * // Using named imports
284
+ * import { setApiKey, scrape, search } from 'upcrawl';
285
+ *
286
+ * setApiKey('uc-your-api-key');
287
+ *
288
+ * const result = await scrape({ url: 'https://example.com' });
289
+ * ```
290
+ */
291
+
292
+ /**
293
+ * Upcrawl namespace object
294
+ * Provides a convenient way to access all SDK functionality
295
+ *
296
+ * @example
297
+ * ```typescript
298
+ * import Upcrawl from 'upcrawl';
299
+ *
300
+ * // Set API key globally
301
+ * Upcrawl.setApiKey('uc-your-api-key');
302
+ *
303
+ * // Scrape a single URL
304
+ * const page = await Upcrawl.scrape({
305
+ * url: 'https://example.com',
306
+ * type: 'markdown'
307
+ * });
308
+ *
309
+ * // Batch scrape multiple URLs
310
+ * const pages = await Upcrawl.batchScrape({
311
+ * urls: ['https://example.com/1', 'https://example.com/2']
312
+ * });
313
+ *
314
+ * // Search the web
315
+ * const results = await Upcrawl.search({
316
+ * queries: ['AI trends 2025']
317
+ * });
318
+ * ```
319
+ */
320
+ declare const Upcrawl: {
321
+ /**
322
+ * Set the API key globally
323
+ * @param apiKey - Your Upcrawl API key (starts with 'uc-')
324
+ */
325
+ readonly setApiKey: typeof setApiKey;
326
+ /**
327
+ * Set a custom base URL (useful for self-hosted or testing)
328
+ * @param baseUrl - Custom API base URL
329
+ */
330
+ readonly setBaseUrl: typeof setBaseUrl;
331
+ /**
332
+ * Set request timeout in milliseconds
333
+ * @param timeout - Timeout in milliseconds
334
+ */
335
+ readonly setTimeout: typeof setTimeout;
336
+ /**
337
+ * Configure multiple options at once
338
+ * @param config - Configuration object
339
+ */
340
+ readonly configure: typeof configure;
341
+ /**
342
+ * Get current configuration (for debugging)
343
+ */
344
+ readonly getConfig: typeof getConfig;
345
+ /**
346
+ * Reset configuration to defaults
347
+ */
348
+ readonly resetConfig: typeof resetConfig;
349
+ /**
350
+ * Scrape a single URL
351
+ * @param options - Scrape options including the URL to scrape
352
+ * @returns Promise with scrape response
353
+ */
354
+ readonly scrape: typeof scrape;
355
+ /**
356
+ * Scrape multiple URLs in a batch
357
+ * @param options - Batch scrape options including URLs to scrape
358
+ * @returns Promise with batch scrape response
359
+ */
360
+ readonly batchScrape: typeof batchScrape;
361
+ /**
362
+ * Search the web
363
+ * @param options - Search options including queries
364
+ * @returns Promise with search response
365
+ */
366
+ readonly search: typeof search;
367
+ /**
368
+ * Error class for Upcrawl API errors
369
+ */
370
+ readonly UpcrawlError: typeof UpcrawlError;
371
+ };
372
+
373
+ export { type BatchScrapeOptions, type BatchScrapeResponse, type ScrapeMetadata, type ScrapeOptions, type ScrapeResponse, type SearchOptions, type SearchResponse, type SearchResultItem, type SearchResultWeb, type SummaryQuery, type UpcrawlConfig, UpcrawlError, type UpcrawlErrorResponse, batchScrape, configure, Upcrawl as default, getConfig, resetConfig, scrape, search, setApiKey, setBaseUrl, setTimeout };