xscrape 1.3.1 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,33 +1,20 @@
1
1
  # xscrape
2
2
 
3
- `xscrape` is a powerful and flexible library designed for extracting and
4
- transforming data from HTML documents using user-defined schemas. It integrates
5
- seamlessly with various schema validation libraries such as Zod, Yup, Joi, and
6
- Effect Schema, allowing you to use your preferred validation tool.
3
+ `xscrape` is a powerful and flexible library designed for extracting and transforming data from HTML documents using user-defined schemas. It now supports any validation library that implements the **Standard Schema** specification, allowing you to bring your own schema for robust, type-safe data validation.
7
4
 
8
5
  ## Features
9
6
 
10
- - **HTML Parsing**: Extract data from HTML using CSS selectors with the help of
11
- [cheerio](https://github.com/cheeriojs/cheerio).
12
- - **Schema Validation**: Validate and transform extracted data with schema validation libraries like [Zod](https://github.com/colinhacks/zod).
13
- - **Custom Transformations**: Provide custom transformations for extractedattributes.
14
- - **Default Values**: Define default values for missing data fields.
15
- - **Nested Field Support**: Define and extract nested data structures from
16
- HTML elements.
7
+ * **HTML Parsing**: Extract data from HTML using CSS selectors with the help of [cheerio](https://github.com/cheeriojs/cheerio).
8
+ * **Flexible Schema Validation**: Validate and transform extracted data with any validation library that implements the [Standard Schema](https://standardschema.dev), such as Zod, Valibot, ArkType, and Effect Schema.
9
+ * **Custom Transformations**: Provide custom transformations for extracted attributes.
10
+ * **Default Values**: Define default values for missing data fields through your chosen schema library's features.
11
+ * **Nested Field Support**: Define and extract nested data structures from HTML elements.
17
12
 
18
- ### Schema Support
19
-
20
- | Schema Library | Status | Notes |
21
- | ---------------------------------------------------- | ------------------- | ------------------------------------------------------------------ |
22
- | [Zod](https://github.com/colinhacks/zod) | ✅ Supported | Default schema tool for `xscrape` |
23
- | [Effect/Schema](https://github.com/Effect-TS/effect) | ✅ Supported | Support for Effect/Schema for additional flexibility |
24
- | [Joi](https://github.com/sideway/joi) | ✅ Supported | Support for Joi for those familiar with server-side validation |
25
- | [Yup](https://github.com/jquense/yup) | 🚧 Planned | Adding Yup support for schema validation in front-end applications |
26
- | Others... | 🔄 In Consideration | Potential support for other schema tools as per user feedback |
13
+ -----
27
14
 
28
15
  ## Installation
29
16
 
30
- To install this library, use npm or yarn:
17
+ To install this library, use your preferred package manager:
31
18
 
32
19
  ```bash
33
20
  pnpm add xscrape
@@ -35,142 +22,165 @@ pnpm add xscrape
35
22
  npm install xscrape
36
23
  ```
37
24
 
38
- ## Usage
25
+ You will also need to install your chosen schema validation library, for example, Zod:
39
26
 
40
- Below is an example of how to use xscrape for extracting and transforming data
41
- from an HTML document:
27
+ ```bash
28
+ pnpm add zod
29
+ # or
30
+ npm install zod
31
+ ```
32
+
33
+ -----
42
34
 
43
- 1. Define Your Schema
35
+ ## Usage
36
+
37
+ Below is an example of how to use `xscrape` with a Zod schema to extract and transform data from an HTML document.
44
38
 
45
39
  ```ts
40
+ import { defineScraper } from 'xscrape';
46
41
  import { z } from 'zod';
47
42
 
48
- const schema = z.object({
49
- title: z.string().default('No title'),
50
- description: z.string(),
51
- keywords: z.array(z.string()),
52
- views: z.number(),
53
- image: z
54
- .object({
55
- url: z.string(),
56
- width: z.number(),
57
- height: z.number(),
58
- })
59
- .default({ url: '', width: 0, height: 0 })
60
- .optional(),
43
+ const scraper = defineScraper({
44
+ schema: z.object({
45
+ title: z.string(),
46
+ description: z.string(),
47
+ keywords: z.array(z.string()),
48
+ views: z.coerce.number(),
49
+ }),
50
+ extract: {
51
+ title: {
52
+ selector: 'title',
53
+ },
54
+ description: {
55
+ selector: 'meta[name="description"]',
56
+ value: 'content',
57
+ },
58
+ keywords: {
59
+ selector: 'meta[name="keywords"]',
60
+ value(el) {
61
+ return el.attribs['content']?.split(',');
62
+ },
63
+ },
64
+ views: {
65
+ selector: 'meta[name="views"]',
66
+ value: 'content',
67
+ },
68
+ },
61
69
  });
62
- ```
63
70
 
64
- 2. Define Field Definitions
71
+ const html = `
72
+ <!DOCTYPE html>
73
+ <html>
74
+ <head>
75
+ <meta name="description" content="An example description.">
76
+ <meta name="keywords" content="typescript,html,parsing">
77
+ <meta name="views" content="1234">
78
+ <title>Example Title</title>
79
+ </head>
80
+ <body></body>
81
+ </html>
82
+ `;
83
+
84
+ const { data, error } = await scraper(html);
85
+ console.log(data);
65
86
 
66
- ```ts
67
- import { type SchemaFieldDefinitions } from 'xscrape';
87
+ // Outputs:
88
+ // {
89
+ // title: 'Example Title',
90
+ // description: 'An example description.',
91
+ // keywords: ['typescript', 'html', 'parsing'],
92
+ // views: 1234
93
+ // }
94
+ ```
68
95
 
69
- type FieldDefinitions = SchemaFieldDefinitions<z.infer<typeof schema>>;
96
+ ### Handling Missing Data
70
97
 
71
- const fields: FieldDefinitions = {
72
- title: { selector: 'title' },
73
- description: {
74
- selector: 'meta[name="description"]',
75
- attribute: 'content',
98
+ You can handle missing data by using the features of your chosen schema library, such as default values in Zod.
76
99
 
77
- defaultValue: 'No description',
78
- },
79
- keywords: {
80
- selector: 'meta[name="keywords"]',
81
- attribute: 'content',
82
- transform: (value) => value.split(','),
83
- defaultValue: [],
84
- },
85
- views: {
86
- selector: 'meta[name="views"]',
87
- attribute: 'content',
88
- transform: (value) => parseInt(value, 10),
89
- defaultValue: 0,
90
- },
91
- // Example of a nested field
92
- image: {
93
- fields: {
94
- url: {
95
- selector: 'meta[property="og:image"]',
96
- attribute: 'content',
97
- },
98
- width: {
99
- selector: 'meta[property="og:image:width"]',
100
- attribute: 'content',
101
- transform: (value) => parseInt(value, 10),
102
- },
103
- height: {
104
- selector: 'meta[property="og:image:height"]',
105
- attribute: 'content',
106
- transform: (value) => parseInt(value, 10),
107
- },
100
+ ```ts
101
+ import { defineScraper } from 'xscrape';
102
+ import { z } from 'zod';
103
+
104
+ const scraper = defineScraper({
105
+ schema: z.object({
106
+ title: z.string().default('No title'),
107
+ description: z.string().default('No description'),
108
+ views: z.coerce.number().default(0),
109
+ }),
110
+ extract: {
111
+ title: {
112
+ selector: 'title',
113
+ },
114
+ description: {
115
+ selector: 'meta[name="description"]',
116
+ value: 'content',
117
+ },
118
+ views: {
119
+ selector: 'meta[name="views"]',
120
+ value: 'content',
108
121
  },
109
122
  },
110
- };
123
+ });
111
124
  ```
112
125
 
113
- 3. Create a Scraper and Extract Data
114
-
115
- ```ts
116
- import { createScraper, ZodValidator } from 'xscrape';
126
+ ### Nested Fields
117
127
 
118
- const validator = new ZodValidator(schema);
119
- const scraper = createScraper({ fields, validator });
128
+ `xscrape` also supports extracting nested data structures.
120
129
 
121
- const html = `
122
- <!DOCTYPE html>
123
- <html>
124
- <head>
125
- <meta name="description" content="An example description.">
126
- <meta name="keywords" content="typescript,html,parsing">
127
- <meta name="views" content="1234">
128
- <meta property="og:image" content="https://example.se/images/c12ffe73-3227-4a4a-b8ad-a3003cdf1d70?h=708&amp;tight=false&amp;w=1372">
129
- <meta property="og:image:width" content="1372">
130
- <meta property="og:image:height" content="708">
131
- <title>Example Title</title>
132
- </head>
133
- <body></body>
134
- </html>
135
- `;
136
-
137
- const data = scraper(html);
138
- console.log(data);
130
+ ```ts
131
+ import { defineScraper } from 'xscrape';
132
+ import { z } from 'zod';
139
133
 
140
- // Outputs:
141
- // {
142
- // title: 'Example Title',
143
- // description: 'An example description.',
144
- // keywords: ['typescript', 'html', 'parsing'],
145
- // views: 1234
146
- // image: {
147
- // url: 'https://example.se/images/c12ffe73-3227-4a4a-b8ad-a3003cdf1d70?h=708&amp;tight=false&amp;w=1372',
148
- // width: 1372,
149
- // height: 708
150
- // }
151
- // }
134
+ const scraper = defineScraper({
135
+ schema: z.object({
136
+ title: z.string(),
137
+ image: z.object({
138
+ url: z.string().url(),
139
+ width: z.coerce.number(),
140
+ height: z.coerce.number(),
141
+ }).default({ url: '', width: 0, height: 0 }).optional(),
142
+ }),
143
+ extract: {
144
+ title: {
145
+ selector: 'title',
146
+ },
147
+ image: {
148
+ selector: 'head',
149
+ value: {
150
+ url: {
151
+ selector: 'meta[property="og:image"]',
152
+ value: 'content',
153
+ },
154
+ width: {
155
+ selector: 'meta[property="og:image:width"]',
156
+ value: 'content',
157
+ },
158
+ height: {
159
+ selector: 'meta[property="og:image:height"]',
160
+ value: 'content',
161
+ },
162
+ },
163
+ },
164
+ },
165
+ });
152
166
  ```
153
167
 
154
- ## Configuration
155
-
156
- xscrape offers a range of configuration options through the types provided,
157
- allowing for detailed customization and robust data extraction and validation:
168
+ -----
158
169
 
159
- - `SchemaFieldDefinitions`: Determines how fields are extracted from the HTML.
160
- - `SchemaValidator`: Validates the extracted data according to defined schemas.
170
+ ## Configuration
161
171
 
162
- ## API Reference
172
+ The `defineScraper` function accepts a configuration object with the following properties:
163
173
 
164
- - `createScraper(config: ScrapeConfig): (html: string) => T` Creates a scraping function based on the specified fields and validator.
165
- - `ZodValidator` A built-in validator using Zod, allowing you to define schemas andvalidate data effortlessly.
174
+ * **`schema`**: A schema object from any library that implements the [Standard Schema](https://standardschema.dev) interface. This schema defines the shape and validation rules for the extracted data.
175
+ * **`extract`**: An object that determines how fields are extracted from the HTML using CSS selectors.
176
+ * **`transform`** (optional): A function to apply custom transformations to the validated data.
166
177
 
167
- For a complete list of API methods and more advanced configuration options,refer to the documentation on the project homepage https://github.com/johnie/xscrape.
178
+ -----
168
179
 
169
180
  ## Contributing
170
181
 
171
- Contributions are welcome! Please see the Contributing Guide https://github.com/johnie/xscrape/blob/main/CONTRIBUTING.md for more information.
182
+ Contributions are welcome! Please see the [Contributing Guide](https://github.com/johnie/xscrape/blob/main/CONTRIBUTING.md) for more information.
172
183
 
173
184
  ## License
174
185
 
175
- This project is licensed under the MIT License. See the LICENSE
176
- https://github.com/johnie/xscrape/blob/main/LICENSE file for details.
186
+ This project is licensed under the MIT License. See the [LICENSE](https://github.com/johnie/xscrape/blob/main/LICENSE) file for details.
package/dist/index.cjs CHANGED
@@ -28,110 +28,49 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
28
28
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
29
 
30
30
  // src/index.ts
31
- var src_exports = {};
32
- __export(src_exports, {
33
- EffectValidator: () => EffectValidator,
34
- JoiValidator: () => JoiValidator,
35
- ZodValidator: () => ZodValidator,
36
- createScraper: () => createScraper
31
+ var index_exports = {};
32
+ __export(index_exports, {
33
+ defineScraper: () => defineScraper
37
34
  });
38
- module.exports = __toCommonJS(src_exports);
35
+ module.exports = __toCommonJS(index_exports);
39
36
 
40
- // src/createScraper.ts
37
+ // src/defineScraper.ts
41
38
  var cheerio = __toESM(require("cheerio"), 1);
42
- var extractData = (fields, $context) => {
43
- const data = {};
44
- for (const key in fields) {
45
- const fieldDef = fields[key];
46
- if ("fields" in fieldDef) {
47
- const nestedData = extractData(
48
- fieldDef.fields,
49
- $context
39
+ function defineScraper(config) {
40
+ return async (html) => {
41
+ try {
42
+ const $ = cheerio.load(html);
43
+ const extractedData = $.extract(config.extract);
44
+ const validationResult = await Promise.resolve(
45
+ config.schema["~standard"].validate(extractedData)
50
46
  );
51
- data[key] = nestedData;
52
- } else {
53
- const elements = $context(fieldDef.selector);
54
- let values = [];
55
- elements.each((_, element) => {
56
- const value = fieldDef.attribute ? $context(element).attr(fieldDef.attribute) : $context(element).text().trim();
57
- if (value !== void 0) {
58
- values.push(value);
47
+ if (validationResult.issues) {
48
+ return { error: validationResult.issues };
49
+ }
50
+ if (!("value" in validationResult)) {
51
+ return {
52
+ error: new Error(
53
+ "xscrape: Validation succeeded but no data was returned"
54
+ )
55
+ };
56
+ }
57
+ if (config.transform) {
58
+ try {
59
+ const transformed = await Promise.resolve(
60
+ config.transform(validationResult.value)
61
+ );
62
+ return { data: transformed };
63
+ } catch (error) {
64
+ return { error };
59
65
  }
60
- });
61
- if (values.length === 0 && fieldDef.defaultValue !== void 0) {
62
- data[key] = fieldDef.defaultValue;
63
- } else if (fieldDef.multiple) {
64
- data[key] = values.map(
65
- (value) => fieldDef.transform ? fieldDef.transform(value) : value
66
- );
67
- } else {
68
- const value = values[0];
69
- data[key] = fieldDef.transform && value ? fieldDef.transform(value) : value;
70
66
  }
67
+ return { data: validationResult.value };
68
+ } catch (error) {
69
+ return { error };
71
70
  }
72
- }
73
- return data;
74
- };
75
- var createScraper = ({
76
- fields,
77
- validator
78
- }) => {
79
- return (html) => {
80
- const $ = typeof html === "string" ? cheerio.load(html) : html;
81
- const data = extractData(fields, $);
82
- return validator.validate(data);
83
71
  };
84
- };
85
-
86
- // src/validators/effect.ts
87
- var Schema = __toESM(require("effect/Schema"), 1);
88
- var import_effect = require("effect");
89
- var EffectValidator = class {
90
- constructor(schema) {
91
- this.schema = schema;
92
- }
93
- validate(data) {
94
- const result = Schema.decodeUnknown(this.schema)(data);
95
- return import_effect.Effect.runSync(result);
96
- }
97
- };
98
-
99
- // src/validators/zod.ts
100
- var import_zod = require("zod");
101
- var ZodValidator = class {
102
- constructor(schema) {
103
- this.schema = schema;
104
- }
105
- validate(data) {
106
- return this.schema.parse(data);
107
- }
108
- };
109
-
110
- // src/validators/joi.ts
111
- var JoiValidator = class {
112
- constructor(schema) {
113
- this.schema = schema;
114
- }
115
- validate(data) {
116
- const { error, value } = this.schema.validate(data, {
117
- convert: true,
118
- stripUnknown: true,
119
- presence: "optional",
120
- abortEarly: false
121
- });
122
- if (error) {
123
- throw new Error(this.formatError(error));
124
- }
125
- return value;
126
- }
127
- formatError(error) {
128
- return error.details.map((detail) => detail.message).join("\n");
129
- }
130
- };
72
+ }
131
73
  // Annotate the CommonJS export names for ESM import in node:
132
74
  0 && (module.exports = {
133
- EffectValidator,
134
- JoiValidator,
135
- ZodValidator,
136
- createScraper
75
+ defineScraper
137
76
  });
package/dist/index.d.cts CHANGED
@@ -1,48 +1,31 @@
1
- import * as cheerio from 'cheerio';
2
- import * as Schema from 'effect/Schema';
3
- import { ZodSchema } from 'zod';
4
- import { Schema as Schema$1 } from 'joi';
1
+ import { StandardSchemaV1 } from '@standard-schema/spec';
2
+ import { Element } from 'domhandler';
5
3
 
6
- type ScrapeConfig<T> = {
7
- fields: SchemaFieldDefinitions<T>;
8
- validator: SchemaValidator<T>;
9
- };
10
- type FieldDefinition<T> = {
4
+ type ExtractDescriptorFn = (el: Element, key: string, obj: Record<string, unknown>) => unknown;
5
+ interface ExtractDescriptor {
11
6
  selector: string;
12
- attribute?: string;
13
- transform?: (value: string) => T;
14
- defaultValue?: T;
15
- multiple?: boolean;
16
- } | NestedFieldDefinition<T>;
17
- type NestedFieldDefinition<T> = {
18
- fields: SchemaFieldDefinitions<T>;
19
- };
20
- type SchemaFieldDefinitions<T> = {
21
- [K in keyof T]: FieldDefinition<T[K]>;
22
- };
23
- interface SchemaValidator<T> {
24
- validate(data: unknown): T;
7
+ value?: string | ExtractDescriptorFn | ExtractMap;
25
8
  }
26
-
27
- declare const createScraper: <T>({ fields, validator, }: ScrapeConfig<T>) => ((html: cheerio.CheerioAPI | string) => T);
28
-
29
- declare class EffectValidator<A, I = A> implements SchemaValidator<A> {
30
- private schema;
31
- constructor(schema: Schema.Schema<A, I>);
32
- validate(data: unknown): A;
9
+ type ExtractValue = string | ExtractDescriptor | [string | ExtractDescriptor];
10
+ interface ExtractMap {
11
+ [key: string]: ExtractValue;
33
12
  }
34
13
 
35
- declare class ZodValidator<T> implements SchemaValidator<T> {
36
- private schema;
37
- constructor(schema: ZodSchema<T>);
38
- validate(data: unknown): T;
39
- }
14
+ type ScraperConfig<S extends StandardSchemaV1<any, any>, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>> = {
15
+ schema: S;
16
+ extract: ExtractMap;
17
+ transform?: (data: StandardSchemaV1.InferOutput<S>) => Promise<R> | R;
18
+ };
19
+ type ValidationResult<T> = {
20
+ success: boolean;
21
+ data?: T;
22
+ error?: unknown;
23
+ };
24
+ type ScraperResult<T> = {
25
+ data?: T;
26
+ error?: unknown;
27
+ };
40
28
 
41
- declare class JoiValidator<T> implements SchemaValidator<T> {
42
- private schema;
43
- constructor(schema: Schema$1<T>);
44
- validate(data: unknown): T;
45
- private formatError;
46
- }
29
+ declare function defineScraper<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>>(config: ScraperConfig<S, R>): (html: string) => Promise<ScraperResult<R>>;
47
30
 
48
- export { EffectValidator, type FieldDefinition, JoiValidator, type SchemaFieldDefinitions, type SchemaValidator, type ScrapeConfig, ZodValidator, createScraper };
31
+ export { type ScraperConfig, type ScraperResult, type ValidationResult, defineScraper };
package/dist/index.d.ts CHANGED
@@ -1,48 +1,31 @@
1
- import * as cheerio from 'cheerio';
2
- import * as Schema from 'effect/Schema';
3
- import { ZodSchema } from 'zod';
4
- import { Schema as Schema$1 } from 'joi';
1
+ import { StandardSchemaV1 } from '@standard-schema/spec';
2
+ import { Element } from 'domhandler';
5
3
 
6
- type ScrapeConfig<T> = {
7
- fields: SchemaFieldDefinitions<T>;
8
- validator: SchemaValidator<T>;
9
- };
10
- type FieldDefinition<T> = {
4
+ type ExtractDescriptorFn = (el: Element, key: string, obj: Record<string, unknown>) => unknown;
5
+ interface ExtractDescriptor {
11
6
  selector: string;
12
- attribute?: string;
13
- transform?: (value: string) => T;
14
- defaultValue?: T;
15
- multiple?: boolean;
16
- } | NestedFieldDefinition<T>;
17
- type NestedFieldDefinition<T> = {
18
- fields: SchemaFieldDefinitions<T>;
19
- };
20
- type SchemaFieldDefinitions<T> = {
21
- [K in keyof T]: FieldDefinition<T[K]>;
22
- };
23
- interface SchemaValidator<T> {
24
- validate(data: unknown): T;
7
+ value?: string | ExtractDescriptorFn | ExtractMap;
25
8
  }
26
-
27
- declare const createScraper: <T>({ fields, validator, }: ScrapeConfig<T>) => ((html: cheerio.CheerioAPI | string) => T);
28
-
29
- declare class EffectValidator<A, I = A> implements SchemaValidator<A> {
30
- private schema;
31
- constructor(schema: Schema.Schema<A, I>);
32
- validate(data: unknown): A;
9
+ type ExtractValue = string | ExtractDescriptor | [string | ExtractDescriptor];
10
+ interface ExtractMap {
11
+ [key: string]: ExtractValue;
33
12
  }
34
13
 
35
- declare class ZodValidator<T> implements SchemaValidator<T> {
36
- private schema;
37
- constructor(schema: ZodSchema<T>);
38
- validate(data: unknown): T;
39
- }
14
+ type ScraperConfig<S extends StandardSchemaV1<any, any>, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>> = {
15
+ schema: S;
16
+ extract: ExtractMap;
17
+ transform?: (data: StandardSchemaV1.InferOutput<S>) => Promise<R> | R;
18
+ };
19
+ type ValidationResult<T> = {
20
+ success: boolean;
21
+ data?: T;
22
+ error?: unknown;
23
+ };
24
+ type ScraperResult<T> = {
25
+ data?: T;
26
+ error?: unknown;
27
+ };
40
28
 
41
- declare class JoiValidator<T> implements SchemaValidator<T> {
42
- private schema;
43
- constructor(schema: Schema$1<T>);
44
- validate(data: unknown): T;
45
- private formatError;
46
- }
29
+ declare function defineScraper<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>>(config: ScraperConfig<S, R>): (html: string) => Promise<ScraperResult<R>>;
47
30
 
48
- export { EffectValidator, type FieldDefinition, JoiValidator, type SchemaFieldDefinitions, type SchemaValidator, type ScrapeConfig, ZodValidator, createScraper };
31
+ export { type ScraperConfig, type ScraperResult, type ValidationResult, defineScraper };
package/dist/index.js CHANGED
@@ -1,97 +1,39 @@
1
- // src/createScraper.ts
1
+ // src/defineScraper.ts
2
2
  import * as cheerio from "cheerio";
3
- var extractData = (fields, $context) => {
4
- const data = {};
5
- for (const key in fields) {
6
- const fieldDef = fields[key];
7
- if ("fields" in fieldDef) {
8
- const nestedData = extractData(
9
- fieldDef.fields,
10
- $context
3
+ function defineScraper(config) {
4
+ return async (html) => {
5
+ try {
6
+ const $ = cheerio.load(html);
7
+ const extractedData = $.extract(config.extract);
8
+ const validationResult = await Promise.resolve(
9
+ config.schema["~standard"].validate(extractedData)
11
10
  );
12
- data[key] = nestedData;
13
- } else {
14
- const elements = $context(fieldDef.selector);
15
- let values = [];
16
- elements.each((_, element) => {
17
- const value = fieldDef.attribute ? $context(element).attr(fieldDef.attribute) : $context(element).text().trim();
18
- if (value !== void 0) {
19
- values.push(value);
11
+ if (validationResult.issues) {
12
+ return { error: validationResult.issues };
13
+ }
14
+ if (!("value" in validationResult)) {
15
+ return {
16
+ error: new Error(
17
+ "xscrape: Validation succeeded but no data was returned"
18
+ )
19
+ };
20
+ }
21
+ if (config.transform) {
22
+ try {
23
+ const transformed = await Promise.resolve(
24
+ config.transform(validationResult.value)
25
+ );
26
+ return { data: transformed };
27
+ } catch (error) {
28
+ return { error };
20
29
  }
21
- });
22
- if (values.length === 0 && fieldDef.defaultValue !== void 0) {
23
- data[key] = fieldDef.defaultValue;
24
- } else if (fieldDef.multiple) {
25
- data[key] = values.map(
26
- (value) => fieldDef.transform ? fieldDef.transform(value) : value
27
- );
28
- } else {
29
- const value = values[0];
30
- data[key] = fieldDef.transform && value ? fieldDef.transform(value) : value;
31
30
  }
31
+ return { data: validationResult.value };
32
+ } catch (error) {
33
+ return { error };
32
34
  }
33
- }
34
- return data;
35
- };
36
- var createScraper = ({
37
- fields,
38
- validator
39
- }) => {
40
- return (html) => {
41
- const $ = typeof html === "string" ? cheerio.load(html) : html;
42
- const data = extractData(fields, $);
43
- return validator.validate(data);
44
35
  };
45
- };
46
-
47
- // src/validators/effect.ts
48
- import * as Schema from "effect/Schema";
49
- import { Effect } from "effect";
50
- var EffectValidator = class {
51
- constructor(schema) {
52
- this.schema = schema;
53
- }
54
- validate(data) {
55
- const result = Schema.decodeUnknown(this.schema)(data);
56
- return Effect.runSync(result);
57
- }
58
- };
59
-
60
- // src/validators/zod.ts
61
- import "zod";
62
- var ZodValidator = class {
63
- constructor(schema) {
64
- this.schema = schema;
65
- }
66
- validate(data) {
67
- return this.schema.parse(data);
68
- }
69
- };
70
-
71
- // src/validators/joi.ts
72
- var JoiValidator = class {
73
- constructor(schema) {
74
- this.schema = schema;
75
- }
76
- validate(data) {
77
- const { error, value } = this.schema.validate(data, {
78
- convert: true,
79
- stripUnknown: true,
80
- presence: "optional",
81
- abortEarly: false
82
- });
83
- if (error) {
84
- throw new Error(this.formatError(error));
85
- }
86
- return value;
87
- }
88
- formatError(error) {
89
- return error.details.map((detail) => detail.message).join("\n");
90
- }
91
- };
36
+ }
92
37
  export {
93
- EffectValidator,
94
- JoiValidator,
95
- ZodValidator,
96
- createScraper
38
+ defineScraper
97
39
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "xscrape",
3
- "version": "1.3.1",
3
+ "version": "3.0.0",
4
4
  "description": "A flexible and powerful library designed to extract and transform data from HTML documents using user-defined schemas",
5
5
  "main": "dist/index.js",
6
6
  "exports": {
@@ -39,26 +39,31 @@
39
39
  },
40
40
  "homepage": "https://github.com/johnie/xscrape#readme",
41
41
  "devDependencies": {
42
- "@arethetypeswrong/cli": "^0.16.4",
43
- "@changesets/changelog-github": "^0.5.0",
44
- "@changesets/cli": "^2.27.9",
45
- "prettier": "^3.3.3",
46
- "tsup": "^8.3.5",
47
- "typescript": "^5.6.3",
48
- "vite": "^5.4.10",
49
- "vitest": "^2.1.3"
42
+ "@arethetypeswrong/cli": "^0.18.2",
43
+ "@changesets/changelog-github": "^0.5.1",
44
+ "@changesets/cli": "^2.29.5",
45
+ "arktype": "^2.1.20",
46
+ "effect": "^3.16.12",
47
+ "jsdom": "^26.1.0",
48
+ "prettier": "^3.6.2",
49
+ "tsup": "^8.5.0",
50
+ "typescript": "^5.8.3",
51
+ "valibot": "^1.1.0",
52
+ "vite": "^7.0.4",
53
+ "vitest": "^3.2.4",
54
+ "zod": "^4.0.2"
50
55
  },
51
56
  "dependencies": {
52
- "cheerio": "^1.0.0",
53
- "effect": "^3.10.4",
54
- "joi": "^17.13.3",
55
- "zod": "^3.23.8"
57
+ "@standard-schema/spec": "^1.0.0",
58
+ "cheerio": "^1.1.0",
59
+ "domhandler": "^5.0.3"
56
60
  },
57
61
  "scripts": {
58
62
  "build": "tsup",
59
63
  "ci": "npm run build && npm run check-format && npm run check-exports && npm run lint && npm run test",
60
64
  "lint": "tsc",
61
65
  "test": "vitest run",
66
+ "test:watch": "vitest",
62
67
  "format": "prettier --write ./src",
63
68
  "check-format": "prettier --check ./src",
64
69
  "check-exports": "attw --pack .",