xscrape 1.3.1 → 3.0.0
- package/README.md +139 -129
- package/dist/index.cjs +35 -96
- package/dist/index.d.cts +24 -41
- package/dist/index.d.ts +24 -41
- package/dist/index.js +31 -89
- package/package.json +18 -13
package/README.md
CHANGED
@@ -1,33 +1,20 @@
 # xscrape
 
-`xscrape` is a powerful and flexible library designed for extracting and
-transforming data from HTML documents using user-defined schemas. It integrates
-seamlessly with various schema validation libraries such as Zod, Yup, Joi, and
-Effect Schema, allowing you to use your preferred validation tool.
+`xscrape` is a powerful and flexible library designed for extracting and transforming data from HTML documents using user-defined schemas. It now supports any validation library that implements the **Standard Schema**, allowing you to bring your own schema for robust, type-safe data validation.
 
 ## Features
 
-- **Nested Field Support**: Define and extract nested data structures from
-  HTML elements.
-
-| Schema Library                                       | Status              | Notes                                                               |
-| ---------------------------------------------------- | ------------------- | ------------------------------------------------------------------ |
-| [Zod](https://github.com/colinhacks/zod)             | ✅ Supported        | Default schema tool for `xscrape`                                   |
-| [Effect/Schema](https://github.com/Effect-TS/effect) | ✅ Supported        | Support for Effect/Schema for additional flexibility                |
-| [Joi](https://github.com/sideway/joi)                | ✅ Supported        | Support for Joi for those familiar with server-side validation      |
-| [Yup](https://github.com/jquense/yup)                | 🚧 Planned          | Adding Yup support for schema validation in front-end applications  |
-| Others...                                            | 🔄 In Consideration | Potential support for other schema tools as per user feedback       |
+* **HTML Parsing**: Extract data from HTML using CSS selectors with the help of [cheerio](https://github.com/cheeriojs/cheerio).
+* **Flexible Schema Validation**: Validate and transform extracted data with any validation library that implements the [Standard Schema](https://standardschema.dev), such as Zod, Valibot, ArkType, and Effect Schema.
+* **Custom Transformations**: Provide custom transformations for extracted attributes.
+* **Default Values**: Define default values for missing data fields through your chosen schema library's features.
+* **Nested Field Support**: Define and extract nested data structures from HTML elements.
+
+-----
 
 ## Installation
 
-To install this library, use
+To install this library, use your preferred package manager:
 
 ```bash
 pnpm add xscrape
@@ -35,142 +22,165 @@ pnpm add xscrape
 npm install xscrape
 ```
 
+You will also need to install your chosen schema validation library, for example, Zod:
+
+```bash
+pnpm add zod
+# or
+npm install zod
+```
+
+-----
+
+## Usage
+
+Below is an example of how to use `xscrape` with a Zod schema to extract and transform data from an HTML document.
 
 ```ts
+import { defineScraper } from 'xscrape';
 import { z } from 'zod';
 
-const
-}
+const scraper = defineScraper({
+  schema: z.object({
+    title: z.string(),
+    description: z.string(),
+    keywords: z.array(z.string()),
+    views: z.coerce.number(),
+  }),
+  extract: {
+    title: {
+      selector: 'title',
+    },
+    description: {
+      selector: 'meta[name="description"]',
+      value: 'content',
+    },
+    keywords: {
+      selector: 'meta[name="keywords"]',
+      value(el) {
+        return el.attribs['content']?.split(',');
+      },
+    },
+    views: {
+      selector: 'meta[name="views"]',
+      value: 'content',
+    },
+  },
 });
-```
 
+const html = `
+<!DOCTYPE html>
+<html>
+<head>
+  <meta name="description" content="An example description.">
+  <meta name="keywords" content="typescript,html,parsing">
+  <meta name="views" content="1234">
+  <title>Example Title</title>
+</head>
+<body></body>
+</html>
+`;
+
+const { data, error } = await scraper(html);
+console.log(data);
+
+// Outputs:
+// {
+//   title: 'Example Title',
+//   description: 'An example description.',
+//   keywords: ['typescript', 'html', 'parsing'],
+//   views: 1234
+// }
+```
 
+### Handling Missing Data
 
-  title: { selector: 'title' },
-  description: {
-    selector: 'meta[name="description"]',
-    attribute: 'content',
-  width: {
-    selector: 'meta[property="og:image:width"]',
-    attribute: 'content',
-    transform: (value) => parseInt(value, 10),
-  },
-  height: {
-    selector: 'meta[property="og:image:height"]',
-    attribute: 'content',
-    transform: (value) => parseInt(value, 10),
-  },
+You can handle missing data by using the features of your chosen schema library, such as default values in Zod.
+
+```ts
+import { defineScraper } from 'xscrape';
+import { z } from 'zod';
+
+const scraper = defineScraper({
+  schema: z.object({
+    title: z.string().default('No title'),
+    description: z.string().default('No description'),
+    views: z.coerce.number().default(0),
+  }),
+  extract: {
+    title: {
+      selector: 'title',
+    },
+    description: {
+      selector: 'meta[name="description"]',
+      value: 'content',
+    },
+    views: {
+      selector: 'meta[name="views"]',
+      value: 'content',
     },
   },
-};
+});
 ```
 
-```ts
-import { createScraper, ZodValidator } from 'xscrape';
-
-const scraper = createScraper({ fields, validator });
+### Nested Fields
 
-<head>
-  <meta name="description" content="An example description.">
-  <meta name="keywords" content="typescript,html,parsing">
-  <meta name="views" content="1234">
-  <meta property="og:image" content="https://example.se/images/c12ffe73-3227-4a4a-b8ad-a3003cdf1d70?h=708&tight=false&w=1372">
-  <meta property="og:image:width" content="1372">
-  <meta property="og:image:height" content="708">
-  <title>Example Title</title>
-</head>
-<body></body>
-</html>
-`;
-
-const data = scraper(html);
-console.log(data);
+`xscrape` also supports extracting nested data structures.
 
+```ts
+import { defineScraper } from 'xscrape';
+import { z } from 'zod';
+
+const scraper = defineScraper({
+  schema: z.object({
+    title: z.string(),
+    image: z.object({
+      url: z.string().url(),
+      width: z.coerce.number(),
+      height: z.coerce.number(),
+    }).default({ url: '', width: 0, height: 0 }).optional(),
+  }),
+  extract: {
+    title: {
+      selector: 'title',
+    },
+    image: {
+      selector: 'head',
+      value: {
+        url: {
+          selector: 'meta[property="og:image"]',
+          value: 'content',
+        },
+        width: {
+          selector: 'meta[property="og:image:width"]',
+          value: 'content',
+        },
+        height: {
+          selector: 'meta[property="og:image:height"]',
+          value: 'content',
+        },
+      },
+    },
+  },
+});
 ```
 
-xscrape offers a range of configuration options through the types provided,
-allowing for detailed customization and robust data extraction and validation:
+-----
 
-- `SchemaValidator`: Validates the extracted data according to defined schemas.
+## Configuration
 
+The `defineScraper` function accepts a configuration object with the following properties:
+
+* **`schema`**: A schema object from any library that implements the [Standard Schema](https://standardschema.dev) interface. This schema defines the shape and validation rules for the extracted data.
+* **`extract`**: An object that determines how fields are extracted from the HTML using CSS selectors.
+* **`transform`** (optional): A function to apply custom transformations to the validated data.
+
+-----
 
 ## Contributing
 
-Contributions are welcome
+Contributions are welcome! Please see the [Contributing Guide](https://github.com/johnie/xscrape/blob/main/CONTRIBUTING.md) for more information.
 
 ## License
 
-This project is licensed under the MIT License. See the LICENSE
-https://github.com/johnie/xscrape/blob/main/LICENSE file for details.
+This project is licensed under the MIT License. See the [LICENSE](https://github.com/johnie/xscrape/blob/main/LICENSE) file for details.
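The new README documents the optional `transform` hook under Configuration but never shows it in action. Below is a minimal sketch built from the typings in `dist/index.d.ts`; the derived `slug` field is purely illustrative and not from the package:

```ts
import { defineScraper } from 'xscrape';
import { z } from 'zod';

const scraper = defineScraper({
  schema: z.object({
    title: z.string(),
    views: z.coerce.number(),
  }),
  extract: {
    title: { selector: 'title' },
    views: { selector: 'meta[name="views"]', value: 'content' },
  },
  // Runs only after validation succeeds; may be sync or async,
  // matching `transform?: (data) => Promise<R> | R` in the typings.
  transform: (data) => ({
    ...data,
    slug: data.title.toLowerCase().replace(/\s+/g, '-'), // illustrative derived field
  }),
});

const html =
  '<head><title>Example Title</title><meta name="views" content="1234"></head>';
const { data, error } = await scraper(html);
// data → { title: 'Example Title', views: 1234, slug: 'example-title' }
```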
package/dist/index.cjs
CHANGED
@@ -28,110 +28,49 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
 var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
 
 // src/index.ts
-var
-__export(
-  JoiValidator: () => JoiValidator,
-  ZodValidator: () => ZodValidator,
-  createScraper: () => createScraper
+var index_exports = {};
+__export(index_exports, {
+  defineScraper: () => defineScraper
 });
-module.exports = __toCommonJS(
+module.exports = __toCommonJS(index_exports);
 
-// src/
+// src/defineScraper.ts
 var cheerio = __toESM(require("cheerio"), 1);
-const
-$context
+function defineScraper(config) {
+  return async (html) => {
+    try {
+      const $ = cheerio.load(html);
+      const extractedData = $.extract(config.extract);
+      const validationResult = await Promise.resolve(
+        config.schema["~standard"].validate(extractedData)
       );
+      if (validationResult.issues) {
+        return { error: validationResult.issues };
+      }
+      if (!("value" in validationResult)) {
+        return {
+          error: new Error(
+            "xscrape: Validation succeeded but no data was returned"
+          )
+        };
+      }
+      if (config.transform) {
+        try {
+          const transformed = await Promise.resolve(
+            config.transform(validationResult.value)
+          );
+          return { data: transformed };
+        } catch (error) {
+          return { error };
         }
-      });
-      if (values.length === 0 && fieldDef.defaultValue !== void 0) {
-        data[key] = fieldDef.defaultValue;
-      } else if (fieldDef.multiple) {
-        data[key] = values.map(
-          (value) => fieldDef.transform ? fieldDef.transform(value) : value
-        );
-      } else {
-        const value = values[0];
-        data[key] = fieldDef.transform && value ? fieldDef.transform(value) : value;
       }
+      return { data: validationResult.value };
+    } catch (error) {
+      return { error };
     }
-  }
-  return data;
-};
-var createScraper = ({
-  fields,
-  validator
-}) => {
-  return (html) => {
-    const $ = typeof html === "string" ? cheerio.load(html) : html;
-    const data = extractData(fields, $);
-    return validator.validate(data);
   };
-}
-
-// src/validators/effect.ts
-var Schema = __toESM(require("effect/Schema"), 1);
-var import_effect = require("effect");
-var EffectValidator = class {
-  constructor(schema) {
-    this.schema = schema;
-  }
-  validate(data) {
-    const result = Schema.decodeUnknown(this.schema)(data);
-    return import_effect.Effect.runSync(result);
-  }
-};
-
-// src/validators/zod.ts
-var import_zod = require("zod");
-var ZodValidator = class {
-  constructor(schema) {
-    this.schema = schema;
-  }
-  validate(data) {
-    return this.schema.parse(data);
-  }
-};
-
-// src/validators/joi.ts
-var JoiValidator = class {
-  constructor(schema) {
-    this.schema = schema;
-  }
-  validate(data) {
-    const { error, value } = this.schema.validate(data, {
-      convert: true,
-      stripUnknown: true,
-      presence: "optional",
-      abortEarly: false
-    });
-    if (error) {
-      throw new Error(this.formatError(error));
-    }
-    return value;
-  }
-  formatError(error) {
-    return error.details.map((detail) => detail.message).join("\n");
-  }
-};
+}
 // Annotate the CommonJS export names for ESM import in node:
 0 && (module.exports = {
-  JoiValidator,
-  ZodValidator,
-  createScraper
+  defineScraper
 });
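The rewritten runtime drops every validator class and instead calls `schema["~standard"].validate()` directly, branching on whether the result carries `issues` or `value`. For reference, a hand-rolled schema satisfying that contract looks roughly like this (a sketch of the Standard Schema v1 result shape from the spec, not code shipped in this package):

```ts
import type { StandardSchemaV1 } from '@standard-schema/spec';

// A minimal schema accepting non-empty strings. `validate` returns
// { value } on success or { issues } on failure, which is exactly
// what defineScraper branches on above.
const nonEmptyString: StandardSchemaV1<unknown, string> = {
  '~standard': {
    version: 1,
    vendor: 'example', // illustrative vendor name
    validate: (input) =>
      typeof input === 'string' && input.length > 0
        ? { value: input }
        : { issues: [{ message: 'expected a non-empty string' }] },
  },
};

const result = await Promise.resolve(
  nonEmptyString['~standard'].validate('hello'),
);
if (result.issues) console.error(result.issues);
else console.log(result.value); // 'hello'
```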
package/dist/index.d.cts
CHANGED
@@ -1,48 +1,31 @@
-import
-import
-import { ZodSchema } from 'zod';
-import { Schema as Schema$1 } from 'joi';
+import { StandardSchemaV1 } from '@standard-schema/spec';
+import { Element } from 'domhandler';
 
-type
-  validator: SchemaValidator<T>;
-};
-type FieldDefinition<T> = {
+type ExtractDescriptorFn = (el: Element, key: string, obj: Record<string, unknown>) => unknown;
+interface ExtractDescriptor {
   selector: string;
-  transform?: (value: string) => T;
-  defaultValue?: T;
-  multiple?: boolean;
-} | NestedFieldDefinition<T>;
-type NestedFieldDefinition<T> = {
-  fields: SchemaFieldDefinitions<T>;
-};
-type SchemaFieldDefinitions<T> = {
-  [K in keyof T]: FieldDefinition<T[K]>;
-};
-interface SchemaValidator<T> {
-  validate(data: unknown): T;
+  value?: string | ExtractDescriptorFn | ExtractMap;
 }
-declare class EffectValidator<A, I = A> implements SchemaValidator<A> {
-  private schema;
-  constructor(schema: Schema.Schema<A, I>);
-  validate(data: unknown): A;
+type ExtractValue = string | ExtractDescriptor | [string | ExtractDescriptor];
+interface ExtractMap {
+  [key: string]: ExtractValue;
 }
 
-}
+type ScraperConfig<S extends StandardSchemaV1<any, any>, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>> = {
+  schema: S;
+  extract: ExtractMap;
+  transform?: (data: StandardSchemaV1.InferOutput<S>) => Promise<R> | R;
+};
+type ValidationResult<T> = {
+  success: boolean;
+  data?: T;
+  error?: unknown;
+};
+type ScraperResult<T> = {
+  data?: T;
+  error?: unknown;
+};
 
-declare
-  private schema;
-  constructor(schema: Schema$1<T>);
-  validate(data: unknown): T;
-  private formatError;
-}
+declare function defineScraper<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>>(config: ScraperConfig<S, R>): (html: string) => Promise<ScraperResult<R>>;
 
-export {
+export { type ScraperConfig, type ScraperResult, type ValidationResult, defineScraper };
package/dist/index.d.ts
CHANGED
@@ -1,48 +1,31 @@
-import
-import
-import { ZodSchema } from 'zod';
-import { Schema as Schema$1 } from 'joi';
+import { StandardSchemaV1 } from '@standard-schema/spec';
+import { Element } from 'domhandler';
 
-type
-  validator: SchemaValidator<T>;
-};
-type FieldDefinition<T> = {
+type ExtractDescriptorFn = (el: Element, key: string, obj: Record<string, unknown>) => unknown;
+interface ExtractDescriptor {
   selector: string;
-  transform?: (value: string) => T;
-  defaultValue?: T;
-  multiple?: boolean;
-} | NestedFieldDefinition<T>;
-type NestedFieldDefinition<T> = {
-  fields: SchemaFieldDefinitions<T>;
-};
-type SchemaFieldDefinitions<T> = {
-  [K in keyof T]: FieldDefinition<T[K]>;
-};
-interface SchemaValidator<T> {
-  validate(data: unknown): T;
+  value?: string | ExtractDescriptorFn | ExtractMap;
 }
-declare class EffectValidator<A, I = A> implements SchemaValidator<A> {
-  private schema;
-  constructor(schema: Schema.Schema<A, I>);
-  validate(data: unknown): A;
+type ExtractValue = string | ExtractDescriptor | [string | ExtractDescriptor];
+interface ExtractMap {
+  [key: string]: ExtractValue;
 }
 
-}
+type ScraperConfig<S extends StandardSchemaV1<any, any>, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>> = {
+  schema: S;
+  extract: ExtractMap;
+  transform?: (data: StandardSchemaV1.InferOutput<S>) => Promise<R> | R;
+};
+type ValidationResult<T> = {
+  success: boolean;
+  data?: T;
+  error?: unknown;
+};
+type ScraperResult<T> = {
+  data?: T;
+  error?: unknown;
+};
 
-declare
-  private schema;
-  constructor(schema: Schema$1<T>);
-  validate(data: unknown): T;
-  private formatError;
-}
+declare function defineScraper<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>>(config: ScraperConfig<S, R>): (html: string) => Promise<ScraperResult<R>>;
 
-export {
+export { type ScraperConfig, type ScraperResult, type ValidationResult, defineScraper };
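In these typings, `ExtractDescriptor.value` accepts an attribute name, a nested `ExtractMap`, or an `ExtractDescriptorFn` receiving the raw domhandler `Element`. A short sketch of the function form, mirroring the README's keywords example (unused trailing parameters may be omitted):

```ts
import type { Element } from 'domhandler';

// Conforms to ExtractDescriptorFn:
// (el: Element, key: string, obj: Record<string, unknown>) => unknown
const splitContent = (el: Element): unknown =>
  el.attribs['content']?.split(',');

const extract = {
  keywords: {
    selector: 'meta[name="keywords"]',
    value: splitContent,
  },
};
```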
package/dist/index.js
CHANGED
@@ -1,97 +1,39 @@
-// src/
+// src/defineScraper.ts
 import * as cheerio from "cheerio";
-const
-$context
+function defineScraper(config) {
+  return async (html) => {
+    try {
+      const $ = cheerio.load(html);
+      const extractedData = $.extract(config.extract);
+      const validationResult = await Promise.resolve(
+        config.schema["~standard"].validate(extractedData)
      );
+      if (validationResult.issues) {
+        return { error: validationResult.issues };
+      }
+      if (!("value" in validationResult)) {
+        return {
+          error: new Error(
+            "xscrape: Validation succeeded but no data was returned"
+          )
+        };
+      }
+      if (config.transform) {
+        try {
+          const transformed = await Promise.resolve(
+            config.transform(validationResult.value)
+          );
+          return { data: transformed };
+        } catch (error) {
+          return { error };
         }
-      });
-      if (values.length === 0 && fieldDef.defaultValue !== void 0) {
-        data[key] = fieldDef.defaultValue;
-      } else if (fieldDef.multiple) {
-        data[key] = values.map(
-          (value) => fieldDef.transform ? fieldDef.transform(value) : value
-        );
-      } else {
-        const value = values[0];
-        data[key] = fieldDef.transform && value ? fieldDef.transform(value) : value;
       }
+      return { data: validationResult.value };
+    } catch (error) {
+      return { error };
     }
-  }
-  return data;
-};
-var createScraper = ({
-  fields,
-  validator
-}) => {
-  return (html) => {
-    const $ = typeof html === "string" ? cheerio.load(html) : html;
-    const data = extractData(fields, $);
-    return validator.validate(data);
   };
-}
-
-// src/validators/effect.ts
-import * as Schema from "effect/Schema";
-import { Effect } from "effect";
-var EffectValidator = class {
-  constructor(schema) {
-    this.schema = schema;
-  }
-  validate(data) {
-    const result = Schema.decodeUnknown(this.schema)(data);
-    return Effect.runSync(result);
-  }
-};
-
-// src/validators/zod.ts
-import "zod";
-var ZodValidator = class {
-  constructor(schema) {
-    this.schema = schema;
-  }
-  validate(data) {
-    return this.schema.parse(data);
-  }
-};
-
-// src/validators/joi.ts
-var JoiValidator = class {
-  constructor(schema) {
-    this.schema = schema;
-  }
-  validate(data) {
-    const { error, value } = this.schema.validate(data, {
-      convert: true,
-      stripUnknown: true,
-      presence: "optional",
-      abortEarly: false
-    });
-    if (error) {
-      throw new Error(this.formatError(error));
-    }
-    return value;
-  }
-  formatError(error) {
-    return error.details.map((detail) => detail.message).join("\n");
-  }
-};
+}
 export {
-  JoiValidator,
-  ZodValidator,
-  createScraper
+  defineScraper
 };
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "xscrape",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "3.0.0",
|
|
4
4
|
"description": "A flexible and powerful library designed to extract and transform data from HTML documents using user-defined schemas",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"exports": {
|
|
@@ -39,26 +39,31 @@
|
|
|
39
39
|
},
|
|
40
40
|
"homepage": "https://github.com/johnie/xscrape#readme",
|
|
41
41
|
"devDependencies": {
|
|
42
|
-
"@arethetypeswrong/cli": "^0.
|
|
43
|
-
"@changesets/changelog-github": "^0.5.
|
|
44
|
-
"@changesets/cli": "^2.
|
|
45
|
-
"
|
|
46
|
-
"
|
|
47
|
-
"
|
|
48
|
-
"
|
|
49
|
-
"
|
|
42
|
+
"@arethetypeswrong/cli": "^0.18.2",
|
|
43
|
+
"@changesets/changelog-github": "^0.5.1",
|
|
44
|
+
"@changesets/cli": "^2.29.5",
|
|
45
|
+
"arktype": "^2.1.20",
|
|
46
|
+
"effect": "^3.16.12",
|
|
47
|
+
"jsdom": "^26.1.0",
|
|
48
|
+
"prettier": "^3.6.2",
|
|
49
|
+
"tsup": "^8.5.0",
|
|
50
|
+
"typescript": "^5.8.3",
|
|
51
|
+
"valibot": "^1.1.0",
|
|
52
|
+
"vite": "^7.0.4",
|
|
53
|
+
"vitest": "^3.2.4",
|
|
54
|
+
"zod": "^4.0.2"
|
|
50
55
|
},
|
|
51
56
|
"dependencies": {
|
|
52
|
-
"
|
|
53
|
-
"
|
|
54
|
-
"
|
|
55
|
-
"zod": "^3.23.8"
|
|
57
|
+
"@standard-schema/spec": "^1.0.0",
|
|
58
|
+
"cheerio": "^1.1.0",
|
|
59
|
+
"domhandler": "^5.0.3"
|
|
56
60
|
},
|
|
57
61
|
"scripts": {
|
|
58
62
|
"build": "tsup",
|
|
59
63
|
"ci": "npm run build && npm run check-format && npm run check-exports && npm run lint && npm run test",
|
|
60
64
|
"lint": "tsc",
|
|
61
65
|
"test": "vitest run",
|
|
66
|
+
"test:watch": "vitest",
|
|
62
67
|
"format": "prettier --write ./src",
|
|
63
68
|
"check-format": "prettier --check ./src",
|
|
64
69
|
"check-exports": "attw --pack .",
|