xscrape 2.0.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +139 -126
- package/dist/index.cjs +14 -54
- package/dist/index.d.cts +6 -41
- package/dist/index.d.ts +6 -41
- package/dist/index.js +11 -51
- package/package.json +16 -15
package/README.md
CHANGED
|
@@ -1,30 +1,20 @@
|
|
|
1
1
|
# xscrape
|
|
2
2
|
|
|
3
|
-
`xscrape` is a powerful and flexible library designed for extracting and transforming data from HTML documents using user-defined schemas.
|
|
3
|
+
`xscrape` is a powerful and flexible library designed for extracting and transforming data from HTML documents using user-defined schemas. It now supports any validation library that implements the **Standard Schema**, allowing you to bring your own schema for robust, type-safe data validation.
|
|
4
4
|
|
|
5
5
|
## Features
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
[
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
- **Nested Field Support**: Define and extract nested data structures from
|
|
13
|
-
HTML elements.
|
|
7
|
+
* **HTML Parsing**: Extract data from HTML using CSS selectors with the help of [cheerio](https://github.com/cheeriojs/cheerio).
|
|
8
|
+
* **Flexible Schema Validation**: Validate and transform extracted data with any validation library that implements the [Standard Schema](https://standardschema.dev), such as Zod, Valibot, ArkType, and Effect Schema.
|
|
9
|
+
* **Custom Transformations**: Provide custom transformations for extracted attributes.
|
|
10
|
+
* **Default Values**: Define default values for missing data fields through your chosen schema library's features.
|
|
11
|
+
* **Nested Field Support**: Define and extract nested data structures from HTML elements.
|
|
14
12
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
| Schema Library | Status | Notes |
|
|
18
|
-
| ---------------------------------------------------- | ------------------- | ------------------------------------------------------------- |
|
|
19
|
-
| [Zod](https://github.com/colinhacks/zod) | ✅ Supported | Default schema tool for `xscrape` |
|
|
20
|
-
| [Effect/Schema](https://github.com/Effect-TS/effect) | 🔄 In Consideration | Support for Effect/Schema for additional flexibility |
|
|
21
|
-
| [Joi](https://github.com/sideway/joi) | 🔄 In Consideration | Support for Joi for validation |
|
|
22
|
-
| [Yup](https://github.com/jquense/yup) | 🔄 In Consideration | Support for Yup for validation |
|
|
23
|
-
| Others... | 🔄 In Consideration | Potential support for other schema tools as per user feedback |
|
|
13
|
+
-----
|
|
24
14
|
|
|
25
15
|
## Installation
|
|
26
16
|
|
|
27
|
-
To install this library, use
|
|
17
|
+
To install this library, use your preferred package manager:
|
|
28
18
|
|
|
29
19
|
```bash
|
|
30
20
|
pnpm add xscrape
|
|
@@ -32,142 +22,165 @@ pnpm add xscrape
|
|
|
32
22
|
npm install xscrape
|
|
33
23
|
```
|
|
34
24
|
|
|
35
|
-
|
|
25
|
+
You will also need to install your chosen schema validation library, for example, Zod:
|
|
36
26
|
|
|
37
|
-
|
|
38
|
-
|
|
27
|
+
```bash
|
|
28
|
+
pnpm add zod
|
|
29
|
+
# or
|
|
30
|
+
npm install zod
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
-----
|
|
39
34
|
|
|
40
|
-
|
|
35
|
+
## Usage
|
|
36
|
+
|
|
37
|
+
Below is an example of how to use `xscrape` with a Zod schema to extract and transform data from an HTML document.
|
|
41
38
|
|
|
42
39
|
```ts
|
|
40
|
+
import { defineScraper } from 'xscrape';
|
|
43
41
|
import { z } from 'zod';
|
|
44
42
|
|
|
45
|
-
const
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
}
|
|
56
|
-
|
|
57
|
-
|
|
43
|
+
const scraper = defineScraper({
|
|
44
|
+
schema: z.object({
|
|
45
|
+
title: z.string(),
|
|
46
|
+
description: z.string(),
|
|
47
|
+
keywords: z.array(z.string()),
|
|
48
|
+
views: z.coerce.number(),
|
|
49
|
+
}),
|
|
50
|
+
extract: {
|
|
51
|
+
title: {
|
|
52
|
+
selector: 'title',
|
|
53
|
+
},
|
|
54
|
+
description: {
|
|
55
|
+
selector: 'meta[name="description"]',
|
|
56
|
+
value: 'content',
|
|
57
|
+
},
|
|
58
|
+
keywords: {
|
|
59
|
+
selector: 'meta[name="keywords"]',
|
|
60
|
+
value(el) {
|
|
61
|
+
return el.attribs['content']?.split(',');
|
|
62
|
+
},
|
|
63
|
+
},
|
|
64
|
+
views: {
|
|
65
|
+
selector: 'meta[name="views"]',
|
|
66
|
+
value: 'content',
|
|
67
|
+
},
|
|
68
|
+
},
|
|
58
69
|
});
|
|
59
|
-
```
|
|
60
70
|
|
|
61
|
-
|
|
71
|
+
const html = `
|
|
72
|
+
<!DOCTYPE html>
|
|
73
|
+
<html>
|
|
74
|
+
<head>
|
|
75
|
+
<meta name="description" content="An example description.">
|
|
76
|
+
<meta name="keywords" content="typescript,html,parsing">
|
|
77
|
+
<meta name="views" content="1234">
|
|
78
|
+
<title>Example Title</title>
|
|
79
|
+
</head>
|
|
80
|
+
<body></body>
|
|
81
|
+
</html>
|
|
82
|
+
`;
|
|
83
|
+
|
|
84
|
+
const { data, error } = await scraper(html);
|
|
85
|
+
console.log(data);
|
|
62
86
|
|
|
63
|
-
|
|
64
|
-
|
|
87
|
+
// Outputs:
|
|
88
|
+
// {
|
|
89
|
+
// title: 'Example Title',
|
|
90
|
+
// description: 'An example description.',
|
|
91
|
+
// keywords: ['typescript', 'html', 'parsing'],
|
|
92
|
+
// views: 1234
|
|
93
|
+
// }
|
|
94
|
+
```
|
|
65
95
|
|
|
66
|
-
|
|
96
|
+
### Handling Missing Data
|
|
67
97
|
|
|
68
|
-
|
|
69
|
-
title: { selector: 'title' },
|
|
70
|
-
description: {
|
|
71
|
-
selector: 'meta[name="description"]',
|
|
72
|
-
attribute: 'content',
|
|
98
|
+
You can handle missing data by using the features of your chosen schema library, such as default values in Zod.
|
|
73
99
|
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
width: {
|
|
96
|
-
selector: 'meta[property="og:image:width"]',
|
|
97
|
-
attribute: 'content',
|
|
98
|
-
transform: (value) => parseInt(value, 10),
|
|
99
|
-
},
|
|
100
|
-
height: {
|
|
101
|
-
selector: 'meta[property="og:image:height"]',
|
|
102
|
-
attribute: 'content',
|
|
103
|
-
transform: (value) => parseInt(value, 10),
|
|
104
|
-
},
|
|
100
|
+
```ts
|
|
101
|
+
import { defineScraper } from 'xscrape';
|
|
102
|
+
import { z } from 'zod';
|
|
103
|
+
|
|
104
|
+
const scraper = defineScraper({
|
|
105
|
+
schema: z.object({
|
|
106
|
+
title: z.string().default('No title'),
|
|
107
|
+
description: z.string().default('No description'),
|
|
108
|
+
views: z.coerce.number().default(0),
|
|
109
|
+
}),
|
|
110
|
+
extract: {
|
|
111
|
+
title: {
|
|
112
|
+
selector: 'title',
|
|
113
|
+
},
|
|
114
|
+
description: {
|
|
115
|
+
selector: 'meta[name="description"]',
|
|
116
|
+
value: 'content',
|
|
117
|
+
},
|
|
118
|
+
views: {
|
|
119
|
+
selector: 'meta[name="views"]',
|
|
120
|
+
value: 'content',
|
|
105
121
|
},
|
|
106
122
|
},
|
|
107
|
-
};
|
|
123
|
+
});
|
|
108
124
|
```
|
|
109
125
|
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
```ts
|
|
113
|
-
import { createScraper, ZodValidator } from 'xscrape';
|
|
126
|
+
### Nested Fields
|
|
114
127
|
|
|
115
|
-
|
|
116
|
-
const scraper = createScraper({ fields, validator });
|
|
128
|
+
`xscrape` also supports extracting nested data structures.
|
|
117
129
|
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
<head>
|
|
122
|
-
<meta name="description" content="An example description.">
|
|
123
|
-
<meta name="keywords" content="typescript,html,parsing">
|
|
124
|
-
<meta name="views" content="1234">
|
|
125
|
-
<meta property="og:image" content="https://example.se/images/c12ffe73-3227-4a4a-b8ad-a3003cdf1d70?h=708&tight=false&w=1372">
|
|
126
|
-
<meta property="og:image:width" content="1372">
|
|
127
|
-
<meta property="og:image:height" content="708">
|
|
128
|
-
<title>Example Title</title>
|
|
129
|
-
</head>
|
|
130
|
-
<body></body>
|
|
131
|
-
</html>
|
|
132
|
-
`;
|
|
133
|
-
|
|
134
|
-
const data = scraper(html);
|
|
135
|
-
console.log(data);
|
|
130
|
+
```ts
|
|
131
|
+
import { defineScraper } from 'xscrape';
|
|
132
|
+
import { z } from 'zod';
|
|
136
133
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
134
|
+
const scraper = defineScraper({
|
|
135
|
+
schema: z.object({
|
|
136
|
+
title: z.string(),
|
|
137
|
+
image: z.object({
|
|
138
|
+
url: z.string().url(),
|
|
139
|
+
width: z.coerce.number(),
|
|
140
|
+
height: z.coerce.number(),
|
|
141
|
+
}).default({ url: '', width: 0, height: 0 }).optional(),
|
|
142
|
+
}),
|
|
143
|
+
extract: {
|
|
144
|
+
title: {
|
|
145
|
+
selector: 'title',
|
|
146
|
+
},
|
|
147
|
+
image: {
|
|
148
|
+
selector: 'head',
|
|
149
|
+
value: {
|
|
150
|
+
url: {
|
|
151
|
+
selector: 'meta[property="og:image"]',
|
|
152
|
+
value: 'content',
|
|
153
|
+
},
|
|
154
|
+
width: {
|
|
155
|
+
selector: 'meta[property="og:image:width"]',
|
|
156
|
+
value: 'content',
|
|
157
|
+
},
|
|
158
|
+
height: {
|
|
159
|
+
selector: 'meta[property="og:image:height"]',
|
|
160
|
+
value: 'content',
|
|
161
|
+
},
|
|
162
|
+
},
|
|
163
|
+
},
|
|
164
|
+
},
|
|
165
|
+
});
|
|
149
166
|
```
|
|
150
167
|
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
xscrape offers a range of configuration options through the types provided,
|
|
154
|
-
allowing for detailed customization and robust data extraction and validation:
|
|
168
|
+
-----
|
|
155
169
|
|
|
156
|
-
|
|
157
|
-
- `SchemaValidator`: Validates the extracted data according to defined schemas.
|
|
170
|
+
## Configuration
|
|
158
171
|
|
|
159
|
-
|
|
172
|
+
The `defineScraper` function accepts a configuration object with the following properties:
|
|
160
173
|
|
|
161
|
-
|
|
162
|
-
|
|
174
|
+
* **`schema`**: A schema object from any library that implements the [Standard Schema](https://standardschema.dev) interface. This schema defines the shape and validation rules for the extracted data.
|
|
175
|
+
* **`extract`**: An object that determines how fields are extracted from the HTML using CSS selectors.
|
|
176
|
+
* **`transform`** (optional): A function to apply custom transformations to the validated data.
|
|
163
177
|
|
|
164
|
-
|
|
178
|
+
-----
|
|
165
179
|
|
|
166
180
|
## Contributing
|
|
167
181
|
|
|
168
|
-
Contributions are welcome
|
|
182
|
+
Contributions are welcome! Please see the [Contributing Guide](https://github.com/johnie/xscrape/blob/main/CONTRIBUTING.md) for more information.
|
|
169
183
|
|
|
170
184
|
## License
|
|
171
185
|
|
|
172
|
-
This project is licensed under the MIT License. See the LICENSE
|
|
173
|
-
https://github.com/johnie/xscrape/blob/main/LICENSE file for details.
|
|
186
|
+
This project is licensed under the MIT License. See the [LICENSE](https://github.com/johnie/xscrape/blob/main/LICENSE) file for details.
|
package/dist/index.cjs
CHANGED
|
@@ -28,88 +28,48 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
28
28
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
29
29
|
|
|
30
30
|
// src/index.ts
|
|
31
|
-
var
|
|
32
|
-
__export(
|
|
31
|
+
var index_exports = {};
|
|
32
|
+
__export(index_exports, {
|
|
33
33
|
defineScraper: () => defineScraper
|
|
34
34
|
});
|
|
35
|
-
module.exports = __toCommonJS(
|
|
35
|
+
module.exports = __toCommonJS(index_exports);
|
|
36
36
|
|
|
37
37
|
// src/defineScraper.ts
|
|
38
38
|
var cheerio = __toESM(require("cheerio"), 1);
|
|
39
|
-
|
|
40
|
-
// src/validators.ts
|
|
41
|
-
var import_zod = require("zod");
|
|
42
|
-
var Validator = class {
|
|
43
|
-
constructor(schema, validateFunction) {
|
|
44
|
-
this.schema = schema;
|
|
45
|
-
this.validateFunction = validateFunction;
|
|
46
|
-
}
|
|
47
|
-
validate(data) {
|
|
48
|
-
try {
|
|
49
|
-
const result = this.validateFunction(this.schema, data);
|
|
50
|
-
return { success: true, data: result };
|
|
51
|
-
} catch (error) {
|
|
52
|
-
return { success: false, error };
|
|
53
|
-
}
|
|
54
|
-
}
|
|
55
|
-
};
|
|
56
|
-
function getSchemaBuilder(type) {
|
|
57
|
-
switch (type) {
|
|
58
|
-
case "zod":
|
|
59
|
-
return import_zod.z;
|
|
60
|
-
default:
|
|
61
|
-
throw new Error(`Unsupported validator type: ${type}`);
|
|
62
|
-
}
|
|
63
|
-
}
|
|
64
|
-
function createValidator(type, schemaFn) {
|
|
65
|
-
const builder = getSchemaBuilder(type);
|
|
66
|
-
const schema = schemaFn(builder);
|
|
67
|
-
switch (type) {
|
|
68
|
-
case "zod":
|
|
69
|
-
return new Validator(
|
|
70
|
-
schema,
|
|
71
|
-
(schema2, data) => schema2.parse(data)
|
|
72
|
-
);
|
|
73
|
-
default:
|
|
74
|
-
throw new Error(`Unsupported validator type: ${type}`);
|
|
75
|
-
}
|
|
76
|
-
}
|
|
77
|
-
|
|
78
|
-
// src/defineScraper.ts
|
|
79
39
|
function defineScraper(config) {
|
|
80
|
-
const validator = createValidator(config.validator, config.schema);
|
|
81
40
|
return async (html) => {
|
|
82
41
|
try {
|
|
83
42
|
const $ = cheerio.load(html);
|
|
84
43
|
const extractedData = $.extract(config.extract);
|
|
85
|
-
const validationResult =
|
|
86
|
-
|
|
87
|
-
|
|
44
|
+
const validationResult = await Promise.resolve(
|
|
45
|
+
config.schema["~standard"].validate(extractedData)
|
|
46
|
+
);
|
|
47
|
+
if (validationResult.issues) {
|
|
48
|
+
return { error: validationResult.issues };
|
|
88
49
|
}
|
|
89
|
-
if (!validationResult
|
|
50
|
+
if (!("value" in validationResult)) {
|
|
90
51
|
return {
|
|
91
|
-
error: new Error(
|
|
52
|
+
error: new Error(
|
|
53
|
+
"xscrape: Validation succeeded but no data was returned"
|
|
54
|
+
)
|
|
92
55
|
};
|
|
93
56
|
}
|
|
94
57
|
if (config.transform) {
|
|
95
58
|
try {
|
|
96
59
|
const transformed = await Promise.resolve(
|
|
97
|
-
config.transform(validationResult.
|
|
60
|
+
config.transform(validationResult.value)
|
|
98
61
|
);
|
|
99
62
|
return { data: transformed };
|
|
100
63
|
} catch (error) {
|
|
101
64
|
return { error };
|
|
102
65
|
}
|
|
103
66
|
}
|
|
104
|
-
return { data: validationResult.
|
|
67
|
+
return { data: validationResult.value };
|
|
105
68
|
} catch (error) {
|
|
106
69
|
return { error };
|
|
107
70
|
}
|
|
108
71
|
};
|
|
109
72
|
}
|
|
110
|
-
|
|
111
|
-
// src/types/main.ts
|
|
112
|
-
var import_zod2 = require("zod");
|
|
113
73
|
// Annotate the CommonJS export names for ESM import in node:
|
|
114
74
|
0 && (module.exports = {
|
|
115
75
|
defineScraper
|
package/dist/index.d.cts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { StandardSchemaV1 } from '@standard-schema/spec';
|
|
2
2
|
import { Element } from 'domhandler';
|
|
3
3
|
|
|
4
4
|
type ExtractDescriptorFn = (el: Element, key: string, obj: Record<string, unknown>) => unknown;
|
|
@@ -11,34 +11,10 @@ interface ExtractMap {
|
|
|
11
11
|
[key: string]: ExtractValue;
|
|
12
12
|
}
|
|
13
13
|
|
|
14
|
-
type
|
|
15
|
-
|
|
16
|
-
type SchemaBuilder<V extends ValidatorType> = V extends 'zod' ? ZodBuilder : never;
|
|
17
|
-
type SchemaFunction<V extends ValidatorType, T> = (builder: SchemaBuilder<V>) => V extends 'zod' ? z.ZodSchema<T> : never;
|
|
18
|
-
type ScraperConfig<T extends Record<string, unknown>, V extends ValidatorType, R extends T = T> = {
|
|
19
|
-
validator: V;
|
|
20
|
-
schema: SchemaFunction<V, T>;
|
|
14
|
+
type ScraperConfig<S extends StandardSchemaV1<any, any>, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>> = {
|
|
15
|
+
schema: S;
|
|
21
16
|
extract: ExtractMap;
|
|
22
|
-
transform?: (data:
|
|
23
|
-
};
|
|
24
|
-
type BaseFieldOptions = {
|
|
25
|
-
attribute?: string;
|
|
26
|
-
};
|
|
27
|
-
type LeafFieldConfig = BaseFieldOptions & {
|
|
28
|
-
selector?: string;
|
|
29
|
-
selectorAll?: string;
|
|
30
|
-
} & ({
|
|
31
|
-
selector: string;
|
|
32
|
-
selectorAll?: never;
|
|
33
|
-
} | {
|
|
34
|
-
selector?: never;
|
|
35
|
-
selectorAll: string;
|
|
36
|
-
});
|
|
37
|
-
type FieldConfig<T> = T extends object ? T extends Array<infer U> ? LeafFieldConfig : {
|
|
38
|
-
fields: Fields<T>;
|
|
39
|
-
} : LeafFieldConfig;
|
|
40
|
-
type Fields<T> = {
|
|
41
|
-
[K in keyof T]: FieldConfig<T[K]>;
|
|
17
|
+
transform?: (data: StandardSchemaV1.InferOutput<S>) => Promise<R> | R;
|
|
42
18
|
};
|
|
43
19
|
type ValidationResult<T> = {
|
|
44
20
|
success: boolean;
|
|
@@ -50,17 +26,6 @@ type ScraperResult<T> = {
|
|
|
50
26
|
error?: unknown;
|
|
51
27
|
};
|
|
52
28
|
|
|
53
|
-
|
|
54
|
-
* Defines a scraper with the provided configuration.
|
|
55
|
-
*
|
|
56
|
-
* @template T - The shape of the extracted data.
|
|
57
|
-
* @template V - The type of the validator used for validation.
|
|
58
|
-
* @template R - The type of the result after optional transformation, defaults to T.
|
|
59
|
-
*
|
|
60
|
-
* @param config - The configuration object for the scraper.
|
|
61
|
-
* @returns A function that takes an HTML string and returns the scraping result, which could be
|
|
62
|
-
* a scraper result or a promise of a scraper result.
|
|
63
|
-
*/
|
|
64
|
-
declare function defineScraper<T extends Record<string, unknown>, V extends ValidatorType, R extends T = T>(config: ScraperConfig<T, V, R>): (html: string) => Promise<ScraperResult<R>>;
|
|
29
|
+
declare function defineScraper<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>>(config: ScraperConfig<S, R>): (html: string) => Promise<ScraperResult<R>>;
|
|
65
30
|
|
|
66
|
-
export { type
|
|
31
|
+
export { type ScraperConfig, type ScraperResult, type ValidationResult, defineScraper };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { StandardSchemaV1 } from '@standard-schema/spec';
|
|
2
2
|
import { Element } from 'domhandler';
|
|
3
3
|
|
|
4
4
|
type ExtractDescriptorFn = (el: Element, key: string, obj: Record<string, unknown>) => unknown;
|
|
@@ -11,34 +11,10 @@ interface ExtractMap {
|
|
|
11
11
|
[key: string]: ExtractValue;
|
|
12
12
|
}
|
|
13
13
|
|
|
14
|
-
type
|
|
15
|
-
|
|
16
|
-
type SchemaBuilder<V extends ValidatorType> = V extends 'zod' ? ZodBuilder : never;
|
|
17
|
-
type SchemaFunction<V extends ValidatorType, T> = (builder: SchemaBuilder<V>) => V extends 'zod' ? z.ZodSchema<T> : never;
|
|
18
|
-
type ScraperConfig<T extends Record<string, unknown>, V extends ValidatorType, R extends T = T> = {
|
|
19
|
-
validator: V;
|
|
20
|
-
schema: SchemaFunction<V, T>;
|
|
14
|
+
type ScraperConfig<S extends StandardSchemaV1<any, any>, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>> = {
|
|
15
|
+
schema: S;
|
|
21
16
|
extract: ExtractMap;
|
|
22
|
-
transform?: (data:
|
|
23
|
-
};
|
|
24
|
-
type BaseFieldOptions = {
|
|
25
|
-
attribute?: string;
|
|
26
|
-
};
|
|
27
|
-
type LeafFieldConfig = BaseFieldOptions & {
|
|
28
|
-
selector?: string;
|
|
29
|
-
selectorAll?: string;
|
|
30
|
-
} & ({
|
|
31
|
-
selector: string;
|
|
32
|
-
selectorAll?: never;
|
|
33
|
-
} | {
|
|
34
|
-
selector?: never;
|
|
35
|
-
selectorAll: string;
|
|
36
|
-
});
|
|
37
|
-
type FieldConfig<T> = T extends object ? T extends Array<infer U> ? LeafFieldConfig : {
|
|
38
|
-
fields: Fields<T>;
|
|
39
|
-
} : LeafFieldConfig;
|
|
40
|
-
type Fields<T> = {
|
|
41
|
-
[K in keyof T]: FieldConfig<T[K]>;
|
|
17
|
+
transform?: (data: StandardSchemaV1.InferOutput<S>) => Promise<R> | R;
|
|
42
18
|
};
|
|
43
19
|
type ValidationResult<T> = {
|
|
44
20
|
success: boolean;
|
|
@@ -50,17 +26,6 @@ type ScraperResult<T> = {
|
|
|
50
26
|
error?: unknown;
|
|
51
27
|
};
|
|
52
28
|
|
|
53
|
-
|
|
54
|
-
* Defines a scraper with the provided configuration.
|
|
55
|
-
*
|
|
56
|
-
* @template T - The shape of the extracted data.
|
|
57
|
-
* @template V - The type of the validator used for validation.
|
|
58
|
-
* @template R - The type of the result after optional transformation, defaults to T.
|
|
59
|
-
*
|
|
60
|
-
* @param config - The configuration object for the scraper.
|
|
61
|
-
* @returns A function that takes an HTML string and returns the scraping result, which could be
|
|
62
|
-
* a scraper result or a promise of a scraper result.
|
|
63
|
-
*/
|
|
64
|
-
declare function defineScraper<T extends Record<string, unknown>, V extends ValidatorType, R extends T = T>(config: ScraperConfig<T, V, R>): (html: string) => Promise<ScraperResult<R>>;
|
|
29
|
+
declare function defineScraper<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>>(config: ScraperConfig<S, R>): (html: string) => Promise<ScraperResult<R>>;
|
|
65
30
|
|
|
66
|
-
export { type
|
|
31
|
+
export { type ScraperConfig, type ScraperResult, type ValidationResult, defineScraper };
|
package/dist/index.js
CHANGED
|
@@ -1,79 +1,39 @@
|
|
|
1
1
|
// src/defineScraper.ts
|
|
2
2
|
import * as cheerio from "cheerio";
|
|
3
|
-
|
|
4
|
-
// src/validators.ts
|
|
5
|
-
import { z } from "zod";
|
|
6
|
-
var Validator = class {
|
|
7
|
-
constructor(schema, validateFunction) {
|
|
8
|
-
this.schema = schema;
|
|
9
|
-
this.validateFunction = validateFunction;
|
|
10
|
-
}
|
|
11
|
-
validate(data) {
|
|
12
|
-
try {
|
|
13
|
-
const result = this.validateFunction(this.schema, data);
|
|
14
|
-
return { success: true, data: result };
|
|
15
|
-
} catch (error) {
|
|
16
|
-
return { success: false, error };
|
|
17
|
-
}
|
|
18
|
-
}
|
|
19
|
-
};
|
|
20
|
-
function getSchemaBuilder(type) {
|
|
21
|
-
switch (type) {
|
|
22
|
-
case "zod":
|
|
23
|
-
return z;
|
|
24
|
-
default:
|
|
25
|
-
throw new Error(`Unsupported validator type: ${type}`);
|
|
26
|
-
}
|
|
27
|
-
}
|
|
28
|
-
function createValidator(type, schemaFn) {
|
|
29
|
-
const builder = getSchemaBuilder(type);
|
|
30
|
-
const schema = schemaFn(builder);
|
|
31
|
-
switch (type) {
|
|
32
|
-
case "zod":
|
|
33
|
-
return new Validator(
|
|
34
|
-
schema,
|
|
35
|
-
(schema2, data) => schema2.parse(data)
|
|
36
|
-
);
|
|
37
|
-
default:
|
|
38
|
-
throw new Error(`Unsupported validator type: ${type}`);
|
|
39
|
-
}
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
// src/defineScraper.ts
|
|
43
3
|
function defineScraper(config) {
|
|
44
|
-
const validator = createValidator(config.validator, config.schema);
|
|
45
4
|
return async (html) => {
|
|
46
5
|
try {
|
|
47
6
|
const $ = cheerio.load(html);
|
|
48
7
|
const extractedData = $.extract(config.extract);
|
|
49
|
-
const validationResult =
|
|
50
|
-
|
|
51
|
-
|
|
8
|
+
const validationResult = await Promise.resolve(
|
|
9
|
+
config.schema["~standard"].validate(extractedData)
|
|
10
|
+
);
|
|
11
|
+
if (validationResult.issues) {
|
|
12
|
+
return { error: validationResult.issues };
|
|
52
13
|
}
|
|
53
|
-
if (!validationResult
|
|
14
|
+
if (!("value" in validationResult)) {
|
|
54
15
|
return {
|
|
55
|
-
error: new Error(
|
|
16
|
+
error: new Error(
|
|
17
|
+
"xscrape: Validation succeeded but no data was returned"
|
|
18
|
+
)
|
|
56
19
|
};
|
|
57
20
|
}
|
|
58
21
|
if (config.transform) {
|
|
59
22
|
try {
|
|
60
23
|
const transformed = await Promise.resolve(
|
|
61
|
-
config.transform(validationResult.
|
|
24
|
+
config.transform(validationResult.value)
|
|
62
25
|
);
|
|
63
26
|
return { data: transformed };
|
|
64
27
|
} catch (error) {
|
|
65
28
|
return { error };
|
|
66
29
|
}
|
|
67
30
|
}
|
|
68
|
-
return { data: validationResult.
|
|
31
|
+
return { data: validationResult.value };
|
|
69
32
|
} catch (error) {
|
|
70
33
|
return { error };
|
|
71
34
|
}
|
|
72
35
|
};
|
|
73
36
|
}
|
|
74
|
-
|
|
75
|
-
// src/types/main.ts
|
|
76
|
-
import "zod";
|
|
77
37
|
export {
|
|
78
38
|
defineScraper
|
|
79
39
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "xscrape",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "3.0.0",
|
|
4
4
|
"description": "A flexible and powerful library designed to extract and transform data from HTML documents using user-defined schemas",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"exports": {
|
|
@@ -39,23 +39,24 @@
|
|
|
39
39
|
},
|
|
40
40
|
"homepage": "https://github.com/johnie/xscrape#readme",
|
|
41
41
|
"devDependencies": {
|
|
42
|
-
"@arethetypeswrong/cli": "^0.
|
|
42
|
+
"@arethetypeswrong/cli": "^0.18.2",
|
|
43
43
|
"@changesets/changelog-github": "^0.5.1",
|
|
44
|
-
"@changesets/cli": "^2.
|
|
45
|
-
"
|
|
46
|
-
"
|
|
47
|
-
"
|
|
48
|
-
"
|
|
49
|
-
"
|
|
50
|
-
"
|
|
51
|
-
"
|
|
44
|
+
"@changesets/cli": "^2.29.5",
|
|
45
|
+
"arktype": "^2.1.20",
|
|
46
|
+
"effect": "^3.16.12",
|
|
47
|
+
"jsdom": "^26.1.0",
|
|
48
|
+
"prettier": "^3.6.2",
|
|
49
|
+
"tsup": "^8.5.0",
|
|
50
|
+
"typescript": "^5.8.3",
|
|
51
|
+
"valibot": "^1.1.0",
|
|
52
|
+
"vite": "^7.0.4",
|
|
53
|
+
"vitest": "^3.2.4",
|
|
54
|
+
"zod": "^4.0.2"
|
|
52
55
|
},
|
|
53
56
|
"dependencies": {
|
|
54
|
-
"
|
|
55
|
-
"
|
|
56
|
-
"
|
|
57
|
-
"yup": "^1.6.1",
|
|
58
|
-
"zod": "^3.24.2"
|
|
57
|
+
"@standard-schema/spec": "^1.0.0",
|
|
58
|
+
"cheerio": "^1.1.0",
|
|
59
|
+
"domhandler": "^5.0.3"
|
|
59
60
|
},
|
|
60
61
|
"scripts": {
|
|
61
62
|
"build": "tsup",
|