xscrape 3.0.1 → 3.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +340 -104
- package/dist/index.cjs +4 -8
- package/dist/index.d.cts +6 -5
- package/dist/index.d.ts +6 -5
- package/dist/index.js +4 -8
- package/package.json +17 -12
package/README.md
CHANGED
@@ -1,42 +1,88 @@
-…
+<p align="center">
 
-…
+<h1 align="center">🕷️<br/><code>xscrape</code></h1>
+<p align="center">Extract and transform HTML with your own schema, powered by <code>Standard Schema</code> compatibility.
+<br/>
+by <a href="https://github.com/johnie">@johnie</a>
+</p>
+</p>
+<br/>
 
-…
+<p align="center">
+<a href="https://opensource.org/licenses/MIT" rel="nofollow"><img src="https://img.shields.io/github/license/johnie/xscrape" alt="License"></a>
+<a href="https://www.npmjs.com/package/xscrape" rel="nofollow"><img src="https://img.shields.io/npm/v/xscrape.svg" alt="npm"></a>
+<a href="https://github.com/johnie/xscrape/actions"><img src="https://github.com/johnie/xscrape/actions/workflows/ci.yml/badge.svg" alt="Build Status"></a>
+<a href="https://github.com/johnie/xscrape" rel="nofollow"><img src="https://img.shields.io/github/stars/johnie/xscrape" alt="stars"></a>
+</p>
+
+<br/>
+<br/>
+
+## Overview
 
-…
-* **Default Values**: Define default values for missing data fields through your chosen schema library's features.
-* **Nested Field Support**: Define and extract nested data structures from HTML elements.
+xscrape is a powerful HTML scraping library that combines the flexibility of query selectors with the safety of schema validation. It works with any validation library that implements the [Standard Schema](https://standardschema.dev) specification, including Zod, Valibot, ArkType, and Effect Schema.
+
+## Features
 
-…
+- **HTML Parsing**: Extract data from HTML using query selectors powered by [cheerio](https://github.com/cheeriojs/cheerio)
+- **Universal Schema Support**: Works with any [Standard Schema](https://standardschema.dev) compatible library
+- **Type Safety**: Full TypeScript support with inferred types from your schemas
+- **Flexible Extraction**: Support for nested objects, arrays, and custom transformation functions
+- **Error Handling**: Comprehensive error handling with detailed validation feedback
+- **Custom Transformations**: Apply post-processing transformations to validated data
+- **Default Values**: Handle missing data gracefully through schema defaults
 
 ## Installation
 
-…
+Install xscrape with your preferred package manager:
 
 ```bash
+npm install xscrape
+# or
 pnpm add xscrape
 # or
-…
+bun add xscrape
 ```
 
-…
+## Quick Start
 
-```
-…
+```typescript
+import { defineScraper } from 'xscrape';
+import { z } from 'zod';
+
+// Define your schema
+const schema = z.object({
+  title: z.string(),
+  description: z.string(),
+  keywords: z.array(z.string()),
+  views: z.coerce.number(),
+});
+
+// Create a scraper
+const scraper = defineScraper({
+  schema,
+  extract: {
+    title: { selector: 'title' },
+    description: { selector: 'meta[name="description"]', value: 'content' },
+    keywords: {
+      selector: 'meta[name="keywords"]',
+      value: (el) => el.attribs['content']?.split(',') || [],
+    },
+    views: { selector: 'meta[name="views"]', value: 'content' },
+  },
+});
+
+// Use the scraper
+const { data, error } = await scraper(htmlString);
 ```
 
-…
+## Usage Examples
 
-…
+### Basic Extraction
 
-…
+Extract basic metadata from an HTML page:
 
-```
+```typescript
 import { defineScraper } from 'xscrape';
 import { z } from 'zod';
 
@@ -44,27 +90,12 @@ const scraper = defineScraper({
   schema: z.object({
     title: z.string(),
     description: z.string(),
-    …
-    views: z.coerce.number(),
+    author: z.string(),
   }),
   extract: {
-    title: {
-      …
-    },
-    description: {
-      selector: 'meta[name="description"]',
-      value: 'content',
-    },
-    keywords: {
-      selector: 'meta[name="keywords"]',
-      value(el) {
-        return el.attribs['content']?.split(',');
-      },
-    },
-    views: {
-      selector: 'meta[name="views"]',
-      value: 'content',
-    },
+    title: { selector: 'title' },
+    description: { selector: 'meta[name="description"]', value: 'content' },
+    author: { selector: 'meta[name="author"]', value: 'content' },
   },
 });
 
@@ -72,115 +103,320 @@ const html = `
 <!DOCTYPE html>
 <html>
 <head>
-  <…
-  <meta name="…
-  <meta name="…
-  <title>Example Title</title>
+  <title>My Blog Post</title>
+  <meta name="description" content="An interesting blog post">
+  <meta name="author" content="John Doe">
 </head>
-<body…
+<body>...</body>
 </html>
 `;
 
 const { data, error } = await scraper(html);
-…
-// Outputs:
-// {
-//   title: 'Example Title',
-//   description: 'An example description.',
-//   keywords: ['typescript', 'html', 'parsing'],
-//   views: 1234
-// }
+// data: { title: "My Blog Post", description: "An interesting blog post", author: "John Doe" }
 ```
 
 ### Handling Missing Data
 
-…
-```ts
-import { defineScraper } from 'xscrape';
-import { z } from 'zod';
+Use schema defaults to handle missing data gracefully:
 
+```typescript
 const scraper = defineScraper({
   schema: z.object({
-    title: z.string().default('…
-    description: z.string().default('No description'),
+    title: z.string().default('Untitled'),
+    description: z.string().default('No description available'),
+    publishedAt: z.string().optional(),
    views: z.coerce.number().default(0),
   }),
   extract: {
-    title: {
-      …
-    },
-    description: {
-      selector: 'meta[name="description"]',
-      value: 'content',
-    },
-    views: {
-      selector: 'meta[name="views"]',
-      value: 'content',
-    },
+    title: { selector: 'title' },
+    description: { selector: 'meta[name="description"]', value: 'content' },
+    publishedAt: { selector: 'meta[name="published"]', value: 'content' },
+    views: { selector: 'meta[name="views"]', value: 'content' },
   },
 });
+
+// Even with incomplete HTML, you get sensible defaults
+const { data } = await scraper('<html><head><title>Test</title></head></html>');
+// data: { title: "Test", description: "No description available", views: 0 }
 ```
 
-### …
+### Extracting Arrays
 
-…
+Extract multiple elements as arrays:
 
-```
-…
+```typescript
+const scraper = defineScraper({
+  schema: z.object({
+    links: z.array(z.string()),
+    headings: z.array(z.string()),
+  }),
+  extract: {
+    links: [{ selector: 'a', value: 'href' }],
+    headings: [{ selector: 'h1, h2, h3' }],
+  },
+});
+
+const html = `
+<html>
+<body>
+  <h1>Main Title</h1>
+  <h2>Subtitle</h2>
+  <a href="/page1">Link 1</a>
+  <a href="/page2">Link 2</a>
+</body>
+</html>
+`;
 
+const { data } = await scraper(html);
+// data: {
+//   links: ["/page1", "/page2"],
+//   headings: ["Main Title", "Subtitle"]
+// }
+```
+
+### Nested Objects
+
+Extract complex nested data structures:
+
+```typescript
 const scraper = defineScraper({
   schema: z.object({
     title: z.string(),
-    …
+    socialMedia: z.object({
+      image: z.string().url(),
       width: z.coerce.number(),
       height: z.coerce.number(),
-      …
+      type: z.string(),
+    }),
   }),
   extract: {
-    title: {
-      …
-    },
-    image: {
+    title: { selector: 'title' },
+    socialMedia: {
       selector: 'head',
       value: {
-        …
-      },
-      …
+        image: { selector: 'meta[property="og:image"]', value: 'content' },
+        width: { selector: 'meta[property="og:image:width"]', value: 'content' },
+        height: { selector: 'meta[property="og:image:height"]', value: 'content' },
+        type: { selector: 'meta[property="og:type"]', value: 'content' },
+      },
+    },
+  },
+});
+```
+
+### Custom Value Transformations
+
+Apply custom logic to extracted values:
+
+```typescript
+const scraper = defineScraper({
+  schema: z.object({
+    tags: z.array(z.string()),
+    publishedDate: z.date(),
+    readingTime: z.number(),
+  }),
+  extract: {
+    tags: {
+      selector: 'meta[name="keywords"]',
+      value: (el) => el.attribs['content']?.split(',').map(tag => tag.trim()) || [],
+    },
+    publishedDate: {
+      selector: 'meta[name="published"]',
+      value: (el) => new Date(el.attribs['content']),
+    },
+    readingTime: {
+      selector: 'article',
+      value: (el) => {
+        const text = el.text();
+        const wordsPerMinute = 200;
+        const wordCount = text.split(/\s+/).length;
+        return Math.ceil(wordCount / wordsPerMinute);
       },
     },
   },
 });
 ```
 
-…
+### Post-Processing with Transform
+
+Apply transformations to the validated data:
+
+```typescript
+const scraper = defineScraper({
+  schema: z.object({
+    title: z.string(),
+    description: z.string(),
+    tags: z.array(z.string()),
+  }),
+  extract: {
+    title: { selector: 'title' },
+    description: { selector: 'meta[name="description"]', value: 'content' },
+    tags: {
+      selector: 'meta[name="keywords"]',
+      value: (el) => el.attribs['content']?.split(',') || [],
+    },
+  },
+  transform: (data) => ({
+    ...data,
+    slug: data.title.toLowerCase().replace(/\s+/g, '-'),
+    tagCount: data.tags.length,
+    summary: data.description.substring(0, 100) + '...',
+  }),
+});
+```
+
+## Schema Library Examples
+
+### Zod
+
+```typescript
+import { z } from 'zod';
+
+const schema = z.object({
+  title: z.string(),
+  price: z.coerce.number(),
+  inStock: z.boolean().default(false),
+});
+```
+
+### Valibot
+
+```typescript
+import * as v from 'valibot';
+
+const schema = v.object({
+  title: v.string(),
+  price: v.pipe(v.string(), v.transform(Number)),
+  inStock: v.optional(v.boolean(), false),
+});
+```
+
+### ArkType
+
+```typescript
+import { type } from 'arktype';
+
+const schema = type({
+  title: 'string',
+  price: 'number',
+  inStock: 'boolean = false',
+});
+```
+
+### Effect Schema
+
+```typescript
+import { Schema } from 'effect';
+
+const schema = Schema.Struct({
+  title: Schema.String,
+  price: Schema.NumberFromString,
+  inStock: Schema.optionalWith(Schema.Boolean, { default: () => false }),
+});
+```
+
+## API Reference
+
+### `defineScraper(config)`
+
+Creates a scraper function with the specified configuration.
+
+#### Parameters
+
+- `config.schema`: A Standard Schema compatible schema object
+- `config.extract`: Extraction configuration object
+- `config.transform?`: Optional post-processing function
+
+#### Returns
+
+A scraper function that takes an HTML string and returns `Promise<{ data?: T, error?: unknown }>`.
+
+### Extraction Configuration
+
+The `extract` object defines how to extract data from HTML:
+
+```typescript
+type ExtractConfig = {
+  [key: string]: ExtractDescriptor | [ExtractDescriptor];
+};
+
+type ExtractDescriptor = {
+  selector: string;
+  value?: string | ((el: Element) => any) | ExtractConfig;
+};
+```
+
+#### Properties
+
+- `selector`: CSS selector to find elements
+- `value`: How to extract the value:
+  - `string`: Attribute name (e.g., `'href'`, `'content'`)
+  - `function`: Custom extraction function
+  - `object`: Nested extraction configuration
+  - `undefined`: Extract text content
 
-…
+#### Array Extraction
 
-…
+Wrap the descriptor in an array to extract multiple elements:
+
+```typescript
+{
+  links: [{ selector: 'a', value: 'href' }]
+}
+```
 
-…
-* **`extract`**: An object that determines how fields are extracted from the HTML using CSS selectors.
-* **`transform`** (optional): A function to apply custom transformations to the validated data.
+## Error Handling
 
-…
+xscrape provides comprehensive error handling:
+
+```typescript
+const { data, error } = await scraper(html);
+
+if (error) {
+  // Handle validation errors, extraction errors, or transform errors
+  console.error('Scraping failed:', error);
+} else {
+  // Use the validated data
+  console.log('Extracted data:', data);
+}
+```
+
+## Best Practices
+
+1. **Use Specific Selectors**: Be as specific as possible with CSS selectors to avoid unexpected matches
+2. **Handle Missing Data**: Use schema defaults or optional fields for data that might not be present
+3. **Validate URLs**: Use URL validation in your schema for href attributes
+4. **Transform Data Early**: Use custom value functions rather than post-processing when possible
+5. **Type Safety**: Let TypeScript infer types from your schema for better developer experience
+
+## Common Use Cases
+
+- **Web Scraping**: Extract structured data from websites
+- **Meta Tag Extraction**: Get social media and SEO metadata
+- **Content Migration**: Transform HTML content to structured data
+- **Testing**: Validate HTML structure in tests
+- **RSS/Feed Processing**: Extract article data from HTML feeds
+
+## Performance Considerations
+
+- xscrape uses cheerio for fast HTML parsing
+- Schema validation is performed once after extraction
+- Consider using streaming for large HTML documents
+- Cache scrapers when processing many similar documents
 
 ## Contributing
 
-…
+We welcome contributions! Please see our [Contributing Guide](https://github.com/johnie/xscrape/blob/main/CONTRIBUTING.md) for details.
 
 ## License
 
-…
+MIT License. See the [LICENSE](https://github.com/johnie/xscrape/blob/main/LICENSE) file for details.
+
+## Related Projects
+
+- [cheerio](https://github.com/cheeriojs/cheerio) - jQuery-like server-side HTML parsing
+- [Standard Schema](https://standardschema.dev) - Universal schema specification
+- [Zod](https://zod.dev) - TypeScript-first schema validation
+- [Valibot](https://valibot.dev) - Modular and type-safe schema library
+- [Effect](https://effect.website) - Maximum type-safety (incl. error handling)
+- [ArkType](https://arktype.io) - TypeScript's 1:1 validator, optimized from editor to runtime
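The README's Schema Library Examples show each schema on its own, without wiring any of them into `defineScraper`. A minimal sketch of that wiring with Valibot, assuming a non-Zod Standard Schema is accepted exactly like the Zod examples (which is what the Universal Schema Support claim implies); the `price` field, the `meta[name="price"]` selector, and the sample HTML are illustrative, not from the package:

```typescript
import { defineScraper } from 'xscrape';
import * as v from 'valibot';

// Same shape as the Valibot example in the README.
const schema = v.object({
  title: v.string(),
  price: v.pipe(v.string(), v.transform(Number)),
});

// defineScraper only sees a Standard Schema, so the config is
// identical to the Zod-based examples.
const scraper = defineScraper({
  schema,
  extract: {
    title: { selector: 'title' },
    price: { selector: 'meta[name="price"]', value: 'content' },
  },
});

const { data, error } = await scraper(
  '<html><head><title>Widget</title><meta name="price" content="9.99"></head></html>'
);
// Expected: data = { title: 'Widget', price: 9.99 }
```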
package/dist/index.cjs
CHANGED
@@ -55,14 +55,10 @@ function defineScraper(config) {
       };
     }
     if (config.transform) {
-      …
-        return { data: transformed };
-      } catch (error) {
-        return { error };
-      }
+      const transformed = await Promise.resolve(
+        config.transform(validationResult.value)
+      );
+      return { data: transformed };
     }
     return { data: validationResult.value };
   } catch (error) {
package/dist/index.d.cts
CHANGED
@@ -7,13 +7,14 @@ interface ExtractDescriptor {
     value?: string | ExtractDescriptorFn | ExtractMap;
 }
 type ExtractValue = string | ExtractDescriptor | [string | ExtractDescriptor];
-…
-    [key: string]: ExtractValue;
-}
+type ExtractMap = Record<string, ExtractValue>;
 
+type SchemaAwareExtractMap<T> = {
+    [K in keyof T]: ExtractMap[string];
+};
 type ScraperConfig<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>> = {
     schema: S;
-    extract: …
+    extract: SchemaAwareExtractMap<StandardSchemaV1.InferOutput<S>>;
     transform?: (data: StandardSchemaV1.InferOutput<S>) => Promise<R> | R;
 };
 type ValidationResult<T> = {
@@ -26,6 +27,6 @@ type ScraperResult<T> = {
     error?: unknown;
 };
 
-declare function defineScraper<S extends StandardSchemaV1, …
+declare function defineScraper<S extends StandardSchemaV1, T extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>, R extends T = T>(config: ScraperConfig<S, R>): (html: string) => Promise<ScraperResult<R>>;
 
 export { type ScraperConfig, type ScraperResult, type ValidationResult, defineScraper };
package/dist/index.d.ts
CHANGED
@@ -7,13 +7,14 @@ interface ExtractDescriptor {
     value?: string | ExtractDescriptorFn | ExtractMap;
 }
 type ExtractValue = string | ExtractDescriptor | [string | ExtractDescriptor];
-…
-    [key: string]: ExtractValue;
-}
+type ExtractMap = Record<string, ExtractValue>;
 
+type SchemaAwareExtractMap<T> = {
+    [K in keyof T]: ExtractMap[string];
+};
 type ScraperConfig<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>> = {
     schema: S;
-    extract: …
+    extract: SchemaAwareExtractMap<StandardSchemaV1.InferOutput<S>>;
     transform?: (data: StandardSchemaV1.InferOutput<S>) => Promise<R> | R;
 };
 type ValidationResult<T> = {
@@ -26,6 +27,6 @@ type ScraperResult<T> = {
     error?: unknown;
 };
 
-declare function defineScraper<S extends StandardSchemaV1, …
+declare function defineScraper<S extends StandardSchemaV1, T extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>, R extends T = T>(config: ScraperConfig<S, R>): (html: string) => Promise<ScraperResult<R>>;
 
 export { type ScraperConfig, type ScraperResult, type ValidationResult, defineScraper };
package/dist/index.js
CHANGED
@@ -19,14 +19,10 @@ function defineScraper(config) {
       };
     }
     if (config.transform) {
-      …
-        return { data: transformed };
-      } catch (error) {
-        return { error };
-      }
+      const transformed = await Promise.resolve(
+        config.transform(validationResult.value)
+      );
+      return { data: transformed };
     }
     return { data: validationResult.value };
   } catch (error) {
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "xscrape",
-  "version": "3.0.1",
+  "version": "3.0.3",
   "description": "A flexible and powerful library designed to extract and transform data from HTML documents using user-defined schemas",
   "main": "dist/index.js",
   "exports": {
@@ -26,7 +26,8 @@
     "zod",
     "valibot",
     "arktype",
-    "effect-schema"
+    "effect-schema",
+    "standard-schema"
   ],
   "author": "Johnie Hjelm <johnie@hjelm.im>",
   "license": "MIT",
@@ -40,35 +41,39 @@
   "homepage": "https://github.com/johnie/xscrape#readme",
   "devDependencies": {
     "@arethetypeswrong/cli": "^0.18.2",
-    "@biomejs/biome": "2.1.…
+    "@biomejs/biome": "2.1.2",
     "@changesets/changelog-github": "^0.5.1",
     "@changesets/cli": "^2.29.5",
     "arktype": "^2.1.20",
-    "effect": "^3.…
+    "effect": "^3.17.0",
     "jsdom": "^26.1.0",
     "lefthook": "^1.12.2",
     "tsup": "^8.5.0",
     "typescript": "^5.8.3",
     "valibot": "^1.1.0",
-    "vite": "^7.0.…
+    "vite": "^7.0.5",
+    "vitepress": "^1.6.3",
     "vitest": "^3.2.4",
-    "zod": "^4.0.…
+    "zod": "^4.0.5"
   },
   "dependencies": {
     "@standard-schema/spec": "^1.0.0",
-    "cheerio": "^1.1.…
+    "cheerio": "^1.1.2",
     "domhandler": "^5.0.3"
   },
   "scripts": {
     "build": "tsup",
-    "ci": "…
-    "…
+    "ci": "pnpm run build && pnpm run lint && pnpm run typecheck && pnpm run check-exports && pnpm run test",
+    "typecheck": "tsc",
     "test": "vitest run",
     "test:watch": "vitest",
     "format": "biome format --write ./src",
-    "…
+    "lint": "biome check ./src",
     "check-exports": "attw --pack .",
-    "local-release": "…
-    "release": "…
+    "local-release": "pnpm run ci && changeset version && changeset publish",
+    "release": "pnpm run ci && changeset publish",
+    "docs:dev": "vitepress dev docs",
+    "docs:build": "vitepress build docs",
+    "docs:preview": "vitepress preview docs"
   }
 }