xscrape 3.0.2 → 3.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +323 -102
- package/dist/index.cjs +4 -8
- package/dist/index.js +4 -8
- package/package.json +15 -11
package/README.md
CHANGED
|
@@ -11,47 +11,78 @@
|
|
|
11
11
|
<p align="center">
|
|
12
12
|
<a href="https://opensource.org/licenses/MIT" rel="nofollow"><img src="https://img.shields.io/github/license/johnie/xscrape" alt="License"></a>
|
|
13
13
|
<a href="https://www.npmjs.com/package/xscrape" rel="nofollow"><img src="https://img.shields.io/npm/v/xscrape.svg" alt="npm"></a>
|
|
14
|
+
<a href="https://github.com/johnie/xscrape/actions"><img src="https://github.com/johnie/xscrape/actions/workflows/ci.yml/badge.svg" alt="Build Status"></a>
|
|
14
15
|
<a href="https://github.com/johnie/xscrape" rel="nofollow"><img src="https://img.shields.io/github/stars/johnie/xscrape" alt="stars"></a>
|
|
15
16
|
</p>
|
|
16
17
|
|
|
17
18
|
<br/>
|
|
18
19
|
<br/>
|
|
19
20
|
|
|
20
|
-
##
|
|
21
|
+
## Overview
|
|
22
|
+
|
|
23
|
+
xscrape is a powerful HTML scraping library that combines the flexibility of query selectors with the safety of schema validation. It works with any validation library that implements the [Standard Schema](https://standardschema.dev) specification, including Zod, Valibot, ArkType, and Effect Schema.
|
|
21
24
|
|
|
22
|
-
|
|
23
|
-
* **Flexible Schema Validation**: Validate and transform extracted data with any validation library that implements the [Standard Schema](https://standardschema.dev), such as Zod, Valibot, ArkType, and Effect Schema.
|
|
24
|
-
* **Custom Transformations**: Provide custom transformations for extracted attributes.
|
|
25
|
-
* **Default Values**: Define default values for missing data fields through your chosen schema library's features.
|
|
26
|
-
* **Nested Field Support**: Define and extract nested data structures from HTML elements.
|
|
25
|
+
## Features
|
|
27
26
|
|
|
28
|
-
|
|
27
|
+
- **HTML Parsing**: Extract data from HTML using query selectors powered by [cheerio](https://github.com/cheeriojs/cheerio)
|
|
28
|
+
- **Universal Schema Support**: Works with any [Standard Schema](https://standardschema.dev) compatible library
|
|
29
|
+
- **Type Safety**: Full TypeScript support with inferred types from your schemas
|
|
30
|
+
- **Flexible Extraction**: Support for nested objects, arrays, and custom transformation functions
|
|
31
|
+
- **Error Handling**: Comprehensive error handling with detailed validation feedback
|
|
32
|
+
- **Custom Transformations**: Apply post-processing transformations to validated data
|
|
33
|
+
- **Default Values**: Handle missing data gracefully through schema defaults
|
|
29
34
|
|
|
30
35
|
## Installation
|
|
31
36
|
|
|
32
|
-
|
|
37
|
+
Install xscrape with your preferred package manager:
|
|
33
38
|
|
|
34
39
|
```bash
|
|
40
|
+
npm install xscrape
|
|
41
|
+
# or
|
|
35
42
|
pnpm add xscrape
|
|
36
43
|
# or
|
|
37
|
-
|
|
44
|
+
bun add xscrape
|
|
38
45
|
```
|
|
39
46
|
|
|
40
|
-
|
|
47
|
+
## Quick Start
|
|
41
48
|
|
|
42
|
-
```
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
49
|
+
```typescript
|
|
50
|
+
import { defineScraper } from 'xscrape';
|
|
51
|
+
import { z } from 'zod';
|
|
52
|
+
|
|
53
|
+
// Define your schema
|
|
54
|
+
const schema = z.object({
|
|
55
|
+
title: z.string(),
|
|
56
|
+
description: z.string(),
|
|
57
|
+
keywords: z.array(z.string()),
|
|
58
|
+
views: z.coerce.number(),
|
|
59
|
+
});
|
|
60
|
+
|
|
61
|
+
// Create a scraper
|
|
62
|
+
const scraper = defineScraper({
|
|
63
|
+
schema,
|
|
64
|
+
extract: {
|
|
65
|
+
title: { selector: 'title' },
|
|
66
|
+
description: { selector: 'meta[name="description"]', value: 'content' },
|
|
67
|
+
keywords: {
|
|
68
|
+
selector: 'meta[name="keywords"]',
|
|
69
|
+
value: (el) => el.attribs['content']?.split(',') || [],
|
|
70
|
+
},
|
|
71
|
+
views: { selector: 'meta[name="views"]', value: 'content' },
|
|
72
|
+
},
|
|
73
|
+
});
|
|
74
|
+
|
|
75
|
+
// Use the scraper
|
|
76
|
+
const { data, error } = await scraper(htmlString);
|
|
46
77
|
```
|
|
47
78
|
|
|
48
|
-
|
|
79
|
+
## Usage Examples
|
|
49
80
|
|
|
50
|
-
|
|
81
|
+
### Basic Extraction
|
|
51
82
|
|
|
52
|
-
|
|
83
|
+
Extract basic metadata from an HTML page:
|
|
53
84
|
|
|
54
|
-
```
|
|
85
|
+
```typescript
|
|
55
86
|
import { defineScraper } from 'xscrape';
|
|
56
87
|
import { z } from 'zod';
|
|
57
88
|
|
|
@@ -59,27 +90,12 @@ const scraper = defineScraper({
|
|
|
59
90
|
schema: z.object({
|
|
60
91
|
title: z.string(),
|
|
61
92
|
description: z.string(),
|
|
62
|
-
|
|
63
|
-
views: z.coerce.number(),
|
|
93
|
+
author: z.string(),
|
|
64
94
|
}),
|
|
65
95
|
extract: {
|
|
66
|
-
title: {
|
|
67
|
-
|
|
68
|
-
},
|
|
69
|
-
description: {
|
|
70
|
-
selector: 'meta[name="description"]',
|
|
71
|
-
value: 'content',
|
|
72
|
-
},
|
|
73
|
-
keywords: {
|
|
74
|
-
selector: 'meta[name="keywords"]',
|
|
75
|
-
value(el) {
|
|
76
|
-
return el.attribs['content']?.split(',');
|
|
77
|
-
},
|
|
78
|
-
},
|
|
79
|
-
views: {
|
|
80
|
-
selector: 'meta[name="views"]',
|
|
81
|
-
value: 'content',
|
|
82
|
-
},
|
|
96
|
+
title: { selector: 'title' },
|
|
97
|
+
description: { selector: 'meta[name="description"]', value: 'content' },
|
|
98
|
+
author: { selector: 'meta[name="author"]', value: 'content' },
|
|
83
99
|
},
|
|
84
100
|
});
|
|
85
101
|
|
|
@@ -87,115 +103,320 @@ const html = `
|
|
|
87
103
|
<!DOCTYPE html>
|
|
88
104
|
<html>
|
|
89
105
|
<head>
|
|
90
|
-
<
|
|
91
|
-
<meta name="
|
|
92
|
-
<meta name="
|
|
93
|
-
<title>Example Title</title>
|
|
106
|
+
<title>My Blog Post</title>
|
|
107
|
+
<meta name="description" content="An interesting blog post">
|
|
108
|
+
<meta name="author" content="John Doe">
|
|
94
109
|
</head>
|
|
95
|
-
<body
|
|
110
|
+
<body>...</body>
|
|
96
111
|
</html>
|
|
97
112
|
`;
|
|
98
113
|
|
|
99
114
|
const { data, error } = await scraper(html);
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
// Outputs:
|
|
103
|
-
// {
|
|
104
|
-
// title: 'Example Title',
|
|
105
|
-
// description: 'An example description.',
|
|
106
|
-
// keywords: ['typescript', 'html', 'parsing'],
|
|
107
|
-
// views: 1234
|
|
108
|
-
// }
|
|
115
|
+
// data: { title: "My Blog Post", description: "An interesting blog post", author: "John Doe" }
|
|
109
116
|
```
|
|
110
117
|
|
|
111
118
|
### Handling Missing Data
|
|
112
119
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
```ts
|
|
116
|
-
import { defineScraper } from 'xscrape';
|
|
117
|
-
import { z } from 'zod';
|
|
120
|
+
Use schema defaults to handle missing data gracefully:
|
|
118
121
|
|
|
122
|
+
```typescript
|
|
119
123
|
const scraper = defineScraper({
|
|
120
124
|
schema: z.object({
|
|
121
|
-
title: z.string().default('
|
|
122
|
-
description: z.string().default('No description'),
|
|
125
|
+
title: z.string().default('Untitled'),
|
|
126
|
+
description: z.string().default('No description available'),
|
|
127
|
+
publishedAt: z.string().optional(),
|
|
123
128
|
views: z.coerce.number().default(0),
|
|
124
129
|
}),
|
|
125
130
|
extract: {
|
|
126
|
-
title: {
|
|
127
|
-
|
|
128
|
-
},
|
|
129
|
-
|
|
130
|
-
selector: 'meta[name="description"]',
|
|
131
|
-
value: 'content',
|
|
132
|
-
},
|
|
133
|
-
views: {
|
|
134
|
-
selector: 'meta[name="views"]',
|
|
135
|
-
value: 'content',
|
|
136
|
-
},
|
|
131
|
+
title: { selector: 'title' },
|
|
132
|
+
description: { selector: 'meta[name="description"]', value: 'content' },
|
|
133
|
+
publishedAt: { selector: 'meta[name="published"]', value: 'content' },
|
|
134
|
+
views: { selector: 'meta[name="views"]', value: 'content' },
|
|
137
135
|
},
|
|
138
136
|
});
|
|
137
|
+
|
|
138
|
+
// Even with incomplete HTML, you get sensible defaults
|
|
139
|
+
const { data } = await scraper('<html><head><title>Test</title></head></html>');
|
|
140
|
+
// data: { title: "Test", description: "No description available", views: 0 }
|
|
139
141
|
```
|
|
140
142
|
|
|
141
|
-
###
|
|
143
|
+
### Extracting Arrays
|
|
142
144
|
|
|
143
|
-
|
|
145
|
+
Extract multiple elements as arrays:
|
|
144
146
|
|
|
145
|
-
```
|
|
146
|
-
|
|
147
|
-
|
|
147
|
+
```typescript
|
|
148
|
+
const scraper = defineScraper({
|
|
149
|
+
schema: z.object({
|
|
150
|
+
links: z.array(z.string()),
|
|
151
|
+
headings: z.array(z.string()),
|
|
152
|
+
}),
|
|
153
|
+
extract: {
|
|
154
|
+
links: [{ selector: 'a', value: 'href' }],
|
|
155
|
+
headings: [{ selector: 'h1, h2, h3' }],
|
|
156
|
+
},
|
|
157
|
+
});
|
|
158
|
+
|
|
159
|
+
const html = `
|
|
160
|
+
<html>
|
|
161
|
+
<body>
|
|
162
|
+
<h1>Main Title</h1>
|
|
163
|
+
<h2>Subtitle</h2>
|
|
164
|
+
<a href="/page1">Link 1</a>
|
|
165
|
+
<a href="/page2">Link 2</a>
|
|
166
|
+
</body>
|
|
167
|
+
</html>
|
|
168
|
+
`;
|
|
148
169
|
|
|
170
|
+
const { data } = await scraper(html);
|
|
171
|
+
// data: {
|
|
172
|
+
// links: ["/page1", "/page2"],
|
|
173
|
+
// headings: ["Main Title", "Subtitle"]
|
|
174
|
+
// }
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### Nested Objects
|
|
178
|
+
|
|
179
|
+
Extract complex nested data structures:
|
|
180
|
+
|
|
181
|
+
```typescript
|
|
149
182
|
const scraper = defineScraper({
|
|
150
183
|
schema: z.object({
|
|
151
184
|
title: z.string(),
|
|
152
|
-
|
|
153
|
-
|
|
185
|
+
socialMedia: z.object({
|
|
186
|
+
image: z.string().url(),
|
|
154
187
|
width: z.coerce.number(),
|
|
155
188
|
height: z.coerce.number(),
|
|
156
|
-
|
|
189
|
+
type: z.string(),
|
|
190
|
+
}),
|
|
157
191
|
}),
|
|
158
192
|
extract: {
|
|
159
|
-
title: {
|
|
160
|
-
|
|
161
|
-
},
|
|
162
|
-
image: {
|
|
193
|
+
title: { selector: 'title' },
|
|
194
|
+
socialMedia: {
|
|
163
195
|
selector: 'head',
|
|
164
196
|
value: {
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
},
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
197
|
+
image: { selector: 'meta[property="og:image"]', value: 'content' },
|
|
198
|
+
width: { selector: 'meta[property="og:image:width"]', value: 'content' },
|
|
199
|
+
height: { selector: 'meta[property="og:image:height"]', value: 'content' },
|
|
200
|
+
type: { selector: 'meta[property="og:type"]', value: 'content' },
|
|
201
|
+
},
|
|
202
|
+
},
|
|
203
|
+
},
|
|
204
|
+
});
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
### Custom Value Transformations
|
|
208
|
+
|
|
209
|
+
Apply custom logic to extracted values:
|
|
210
|
+
|
|
211
|
+
```typescript
|
|
212
|
+
const scraper = defineScraper({
|
|
213
|
+
schema: z.object({
|
|
214
|
+
tags: z.array(z.string()),
|
|
215
|
+
publishedDate: z.date(),
|
|
216
|
+
readingTime: z.number(),
|
|
217
|
+
}),
|
|
218
|
+
extract: {
|
|
219
|
+
tags: {
|
|
220
|
+
selector: 'meta[name="keywords"]',
|
|
221
|
+
value: (el) => el.attribs['content']?.split(',').map(tag => tag.trim()) || [],
|
|
222
|
+
},
|
|
223
|
+
publishedDate: {
|
|
224
|
+
selector: 'meta[name="published"]',
|
|
225
|
+
value: (el) => new Date(el.attribs['content']),
|
|
226
|
+
},
|
|
227
|
+
readingTime: {
|
|
228
|
+
selector: 'article',
|
|
229
|
+
value: (el) => {
|
|
230
|
+
const text = el.text();
|
|
231
|
+
const wordsPerMinute = 200;
|
|
232
|
+
const wordCount = text.split(/\s+/).length;
|
|
233
|
+
return Math.ceil(wordCount / wordsPerMinute);
|
|
177
234
|
},
|
|
178
235
|
},
|
|
179
236
|
},
|
|
180
237
|
});
|
|
181
238
|
```
|
|
182
239
|
|
|
183
|
-
|
|
240
|
+
### Post-Processing with Transform
|
|
241
|
+
|
|
242
|
+
Apply transformations to the validated data:
|
|
243
|
+
|
|
244
|
+
```typescript
|
|
245
|
+
const scraper = defineScraper({
|
|
246
|
+
schema: z.object({
|
|
247
|
+
title: z.string(),
|
|
248
|
+
description: z.string(),
|
|
249
|
+
tags: z.array(z.string()),
|
|
250
|
+
}),
|
|
251
|
+
extract: {
|
|
252
|
+
title: { selector: 'title' },
|
|
253
|
+
description: { selector: 'meta[name="description"]', value: 'content' },
|
|
254
|
+
tags: {
|
|
255
|
+
selector: 'meta[name="keywords"]',
|
|
256
|
+
value: (el) => el.attribs['content']?.split(',') || [],
|
|
257
|
+
},
|
|
258
|
+
},
|
|
259
|
+
transform: (data) => ({
|
|
260
|
+
...data,
|
|
261
|
+
slug: data.title.toLowerCase().replace(/\s+/g, '-'),
|
|
262
|
+
tagCount: data.tags.length,
|
|
263
|
+
summary: data.description.substring(0, 100) + '...',
|
|
264
|
+
}),
|
|
265
|
+
});
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
## Schema Library Examples
|
|
269
|
+
|
|
270
|
+
### Zod
|
|
184
271
|
|
|
185
|
-
|
|
272
|
+
```typescript
|
|
273
|
+
import { z } from 'zod';
|
|
186
274
|
|
|
187
|
-
|
|
275
|
+
const schema = z.object({
|
|
276
|
+
title: z.string(),
|
|
277
|
+
price: z.coerce.number(),
|
|
278
|
+
inStock: z.boolean().default(false),
|
|
279
|
+
});
|
|
280
|
+
```
|
|
188
281
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
282
|
+
### Valibot
|
|
283
|
+
|
|
284
|
+
```typescript
|
|
285
|
+
import * as v from 'valibot';
|
|
286
|
+
|
|
287
|
+
const schema = v.object({
|
|
288
|
+
title: v.string(),
|
|
289
|
+
price: v.pipe(v.string(), v.transform(Number)),
|
|
290
|
+
inStock: v.optional(v.boolean(), false),
|
|
291
|
+
});
|
|
292
|
+
```
|
|
192
293
|
|
|
193
|
-
|
|
294
|
+
### ArkType
|
|
295
|
+
|
|
296
|
+
```typescript
|
|
297
|
+
import { type } from 'arktype';
|
|
298
|
+
|
|
299
|
+
const schema = type({
|
|
300
|
+
title: 'string',
|
|
301
|
+
price: 'number',
|
|
302
|
+
inStock: 'boolean = false',
|
|
303
|
+
});
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
### Effect Schema
|
|
307
|
+
|
|
308
|
+
```typescript
|
|
309
|
+
import { Schema } from 'effect';
|
|
310
|
+
|
|
311
|
+
const schema = Schema.Struct({
|
|
312
|
+
title: Schema.String,
|
|
313
|
+
price: Schema.NumberFromString,
|
|
314
|
+
inStock: Schema.optionalWith(Schema.Boolean, { default: () => false }),
|
|
315
|
+
});
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
## API Reference
|
|
319
|
+
|
|
320
|
+
### `defineScraper(config)`
|
|
321
|
+
|
|
322
|
+
Creates a scraper function with the specified configuration.
|
|
323
|
+
|
|
324
|
+
#### Parameters
|
|
325
|
+
|
|
326
|
+
- `config.schema`: A Standard Schema-compatible schema object
|
|
327
|
+
- `config.extract`: Extraction configuration object
|
|
328
|
+
- `config.transform?`: Optional post-processing function
|
|
329
|
+
|
|
330
|
+
#### Returns
|
|
331
|
+
|
|
332
|
+
A scraper function that takes an HTML string and returns `Promise<{ data?: T, error?: unknown }>`.
|
|
333
|
+
|
|
334
|
+
### Extraction Configuration
|
|
335
|
+
|
|
336
|
+
The `extract` object defines how to extract data from HTML:
|
|
337
|
+
|
|
338
|
+
```typescript
|
|
339
|
+
type ExtractConfig = {
|
|
340
|
+
[key: string]: ExtractDescriptor | [ExtractDescriptor];
|
|
341
|
+
};
|
|
342
|
+
|
|
343
|
+
type ExtractDescriptor = {
|
|
344
|
+
selector: string;
|
|
345
|
+
value?: string | ((el: Element) => any) | ExtractConfig;
|
|
346
|
+
};
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
#### Properties
|
|
350
|
+
|
|
351
|
+
- `selector`: CSS selector to find elements
|
|
352
|
+
- `value`: How to extract the value:
|
|
353
|
+
- `string`: Attribute name (e.g., `'href'`, `'content'`)
|
|
354
|
+
- `function`: Custom extraction function
|
|
355
|
+
- `object`: Nested extraction configuration
|
|
356
|
+
- `undefined`: Extract text content
|
|
357
|
+
|
|
358
|
+
#### Array Extraction
|
|
359
|
+
|
|
360
|
+
Wrap the descriptor in an array to extract multiple elements:
|
|
361
|
+
|
|
362
|
+
```typescript
|
|
363
|
+
{
|
|
364
|
+
links: [{ selector: 'a', value: 'href' }]
|
|
365
|
+
}
|
|
366
|
+
```
|
|
367
|
+
|
|
368
|
+
## Error Handling
|
|
369
|
+
|
|
370
|
+
xscrape provides comprehensive error handling:
|
|
371
|
+
|
|
372
|
+
```typescript
|
|
373
|
+
const { data, error } = await scraper(html);
|
|
374
|
+
|
|
375
|
+
if (error) {
|
|
376
|
+
// Handle validation errors, extraction errors, or transform errors
|
|
377
|
+
console.error('Scraping failed:', error);
|
|
378
|
+
} else {
|
|
379
|
+
// Use the validated data
|
|
380
|
+
console.log('Extracted data:', data);
|
|
381
|
+
}
|
|
382
|
+
```
|
|
383
|
+
|
|
384
|
+
## Best Practices
|
|
385
|
+
|
|
386
|
+
1. **Use Specific Selectors**: Be as specific as possible with CSS selectors to avoid unexpected matches
|
|
387
|
+
2. **Handle Missing Data**: Use schema defaults or optional fields for data that might not be present
|
|
388
|
+
3. **Validate URLs**: Use URL validation in your schema for href attributes
|
|
389
|
+
4. **Transform Data Early**: Use custom value functions rather than post-processing when possible
|
|
390
|
+
5. **Type Safety**: Let TypeScript infer types from your schema for better developer experience
|
|
391
|
+
|
|
392
|
+
## Common Use Cases
|
|
393
|
+
|
|
394
|
+
- **Web Scraping**: Extract structured data from websites
|
|
395
|
+
- **Meta Tag Extraction**: Get social media and SEO metadata
|
|
396
|
+
- **Content Migration**: Transform HTML content to structured data
|
|
397
|
+
- **Testing**: Validate HTML structure in tests
|
|
398
|
+
- **RSS/Feed Processing**: Extract article data from HTML feeds
|
|
399
|
+
|
|
400
|
+
## Performance Considerations
|
|
401
|
+
|
|
402
|
+
- xscrape uses cheerio for fast HTML parsing
|
|
403
|
+
- Schema validation is performed once after extraction
|
|
404
|
+
- Consider using streaming for large HTML documents
|
|
405
|
+
- Cache scrapers when processing many similar documents
|
|
194
406
|
|
|
195
407
|
## Contributing
|
|
196
408
|
|
|
197
|
-
|
|
409
|
+
We welcome contributions! Please see our [Contributing Guide](https://github.com/johnie/xscrape/blob/main/CONTRIBUTING.md) for details.
|
|
198
410
|
|
|
199
411
|
## License
|
|
200
412
|
|
|
201
|
-
|
|
413
|
+
MIT License. See the [LICENSE](https://github.com/johnie/xscrape/blob/main/LICENSE) file for details.
|
|
414
|
+
|
|
415
|
+
## Related Projects
|
|
416
|
+
|
|
417
|
+
- [cheerio](https://github.com/cheeriojs/cheerio) - jQuery-like server-side HTML parsing
|
|
418
|
+
- [Standard Schema](https://standardschema.dev) - Universal schema specification
|
|
419
|
+
- [Zod](https://zod.dev) - TypeScript-first schema validation
|
|
420
|
+
- [Valibot](https://valibot.dev) - Modular and type-safe schema library
|
|
421
|
+
- [Effect](https://effect.website) - Maximum Type-safety (incl. error handling)
|
|
422
|
+
- [ArkType](https://arktype.io) - TypeScript's 1:1 validator, optimized from editor to runtime
|
package/dist/index.cjs
CHANGED
|
@@ -55,14 +55,10 @@ function defineScraper(config) {
|
|
|
55
55
|
};
|
|
56
56
|
}
|
|
57
57
|
if (config.transform) {
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
return { data: transformed };
|
|
63
|
-
} catch (error) {
|
|
64
|
-
return { error };
|
|
65
|
-
}
|
|
58
|
+
const transformed = await Promise.resolve(
|
|
59
|
+
config.transform(validationResult.value)
|
|
60
|
+
);
|
|
61
|
+
return { data: transformed };
|
|
66
62
|
}
|
|
67
63
|
return { data: validationResult.value };
|
|
68
64
|
} catch (error) {
|
package/dist/index.js
CHANGED
|
@@ -19,14 +19,10 @@ function defineScraper(config) {
|
|
|
19
19
|
};
|
|
20
20
|
}
|
|
21
21
|
if (config.transform) {
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
return { data: transformed };
|
|
27
|
-
} catch (error) {
|
|
28
|
-
return { error };
|
|
29
|
-
}
|
|
22
|
+
const transformed = await Promise.resolve(
|
|
23
|
+
config.transform(validationResult.value)
|
|
24
|
+
);
|
|
25
|
+
return { data: transformed };
|
|
30
26
|
}
|
|
31
27
|
return { data: validationResult.value };
|
|
32
28
|
} catch (error) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "xscrape",
|
|
3
|
-
"version": "3.0.
|
|
3
|
+
"version": "3.0.4",
|
|
4
4
|
"description": "A flexible and powerful library designed to extract and transform data from HTML documents using user-defined schemas",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"exports": {
|
|
@@ -41,35 +41,39 @@
|
|
|
41
41
|
"homepage": "https://github.com/johnie/xscrape#readme",
|
|
42
42
|
"devDependencies": {
|
|
43
43
|
"@arethetypeswrong/cli": "^0.18.2",
|
|
44
|
-
"@biomejs/biome": "2.1.
|
|
44
|
+
"@biomejs/biome": "2.1.2",
|
|
45
45
|
"@changesets/changelog-github": "^0.5.1",
|
|
46
46
|
"@changesets/cli": "^2.29.5",
|
|
47
47
|
"arktype": "^2.1.20",
|
|
48
|
-
"effect": "^3.
|
|
48
|
+
"effect": "^3.17.0",
|
|
49
49
|
"jsdom": "^26.1.0",
|
|
50
50
|
"lefthook": "^1.12.2",
|
|
51
51
|
"tsup": "^8.5.0",
|
|
52
52
|
"typescript": "^5.8.3",
|
|
53
53
|
"valibot": "^1.1.0",
|
|
54
|
-
"vite": "^7.0.
|
|
54
|
+
"vite": "^7.0.5",
|
|
55
|
+
"vitepress": "^1.6.3",
|
|
55
56
|
"vitest": "^3.2.4",
|
|
56
|
-
"zod": "^4.0.
|
|
57
|
+
"zod": "^4.0.5"
|
|
57
58
|
},
|
|
58
59
|
"dependencies": {
|
|
59
60
|
"@standard-schema/spec": "^1.0.0",
|
|
60
|
-
"cheerio": "^1.1.
|
|
61
|
+
"cheerio": "^1.1.2",
|
|
61
62
|
"domhandler": "^5.0.3"
|
|
62
63
|
},
|
|
63
64
|
"scripts": {
|
|
64
65
|
"build": "tsup",
|
|
65
|
-
"ci": "
|
|
66
|
-
"
|
|
66
|
+
"ci": "pnpm run build && pnpm run lint && pnpm run typecheck && pnpm run check-exports && pnpm run test",
|
|
67
|
+
"typecheck": "tsc",
|
|
67
68
|
"test": "vitest run",
|
|
68
69
|
"test:watch": "vitest",
|
|
69
70
|
"format": "biome format --write ./src",
|
|
70
|
-
"
|
|
71
|
+
"lint": "biome check ./src",
|
|
71
72
|
"check-exports": "attw --pack .",
|
|
72
|
-
"local-release": "
|
|
73
|
-
"release": "
|
|
73
|
+
"local-release": "pnpm run ci && changeset version && changeset publish",
|
|
74
|
+
"release": "pnpm run ci && changeset publish",
|
|
75
|
+
"docs:dev": "vitepress dev docs",
|
|
76
|
+
"docs:build": "vitepress build docs",
|
|
77
|
+
"docs:preview": "vitepress preview docs"
|
|
74
78
|
}
|
|
75
79
|
}
|