xscrape 1.3.1 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -11
- package/dist/index.cjs +67 -88
- package/dist/index.d.cts +59 -41
- package/dist/index.d.ts +59 -41
- package/dist/index.js +66 -84
- package/package.json +15 -11
package/README.md
CHANGED
|
@@ -1,9 +1,6 @@
|
|
|
1
1
|
# xscrape
|
|
2
2
|
|
|
3
|
-
`xscrape` is a powerful and flexible library designed for extracting and
|
|
4
|
-
transforming data from HTML documents using user-defined schemas. It integrates
|
|
5
|
-
seamlessly with various schema validation libraries such as Zod, Yup, Joi, and
|
|
6
|
-
Effect Schema, allowing you to use your preferred validation tool.
|
|
3
|
+
`xscrape` is a powerful and flexible library designed for extracting and transforming data from HTML documents using user-defined schemas.
|
|
7
4
|
|
|
8
5
|
## Features
|
|
9
6
|
|
|
@@ -17,13 +14,13 @@ Effect Schema, allowing you to use your preferred validation tool.
|
|
|
17
14
|
|
|
18
15
|
### Schema Support
|
|
19
16
|
|
|
20
|
-
| Schema Library | Status | Notes |
|
|
21
|
-
| ---------------------------------------------------- | ------------------- |
|
|
22
|
-
| [Zod](https://github.com/colinhacks/zod) | ✅ Supported | Default schema tool for `xscrape` |
|
|
23
|
-
| [Effect/Schema](https://github.com/Effect-TS/effect) |
|
|
24
|
-
| [Joi](https://github.com/sideway/joi) |
|
|
25
|
-
| [Yup](https://github.com/jquense/yup) |
|
|
26
|
-
| Others... | 🔄 In Consideration | Potential support for other schema tools as per user feedback |
|
|
17
|
+
| Schema Library | Status | Notes |
|
|
18
|
+
| ---------------------------------------------------- | ------------------- | ------------------------------------------------------------- |
|
|
19
|
+
| [Zod](https://github.com/colinhacks/zod) | ✅ Supported | Default schema tool for `xscrape` |
|
|
20
|
+
| [Effect/Schema](https://github.com/Effect-TS/effect) | 🔄 In Consideration | Support for Effect/Schema for additional flexibility |
|
|
21
|
+
| [Joi](https://github.com/sideway/joi) | 🔄 In Consideration | Support for Joi for validation |
|
|
22
|
+
| [Yup](https://github.com/jquense/yup) | 🔄 In Consideration | Support for Yup for validation |
|
|
23
|
+
| Others... | 🔄 In Consideration | Potential support for other schema tools as per user feedback |
|
|
27
24
|
|
|
28
25
|
## Installation
|
|
29
26
|
|
package/dist/index.cjs
CHANGED
|
@@ -30,108 +30,87 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
30
30
|
// src/index.ts
|
|
31
31
|
var src_exports = {};
|
|
32
32
|
__export(src_exports, {
|
|
33
|
-
|
|
34
|
-
JoiValidator: () => JoiValidator,
|
|
35
|
-
ZodValidator: () => ZodValidator,
|
|
36
|
-
createScraper: () => createScraper
|
|
33
|
+
defineScraper: () => defineScraper
|
|
37
34
|
});
|
|
38
35
|
module.exports = __toCommonJS(src_exports);
|
|
39
36
|
|
|
40
|
-
// src/
|
|
37
|
+
// src/defineScraper.ts
|
|
41
38
|
var cheerio = __toESM(require("cheerio"), 1);
|
|
42
|
-
var extractData = (fields, $context) => {
|
|
43
|
-
const data = {};
|
|
44
|
-
for (const key in fields) {
|
|
45
|
-
const fieldDef = fields[key];
|
|
46
|
-
if ("fields" in fieldDef) {
|
|
47
|
-
const nestedData = extractData(
|
|
48
|
-
fieldDef.fields,
|
|
49
|
-
$context
|
|
50
|
-
);
|
|
51
|
-
data[key] = nestedData;
|
|
52
|
-
} else {
|
|
53
|
-
const elements = $context(fieldDef.selector);
|
|
54
|
-
let values = [];
|
|
55
|
-
elements.each((_, element) => {
|
|
56
|
-
const value = fieldDef.attribute ? $context(element).attr(fieldDef.attribute) : $context(element).text().trim();
|
|
57
|
-
if (value !== void 0) {
|
|
58
|
-
values.push(value);
|
|
59
|
-
}
|
|
60
|
-
});
|
|
61
|
-
if (values.length === 0 && fieldDef.defaultValue !== void 0) {
|
|
62
|
-
data[key] = fieldDef.defaultValue;
|
|
63
|
-
} else if (fieldDef.multiple) {
|
|
64
|
-
data[key] = values.map(
|
|
65
|
-
(value) => fieldDef.transform ? fieldDef.transform(value) : value
|
|
66
|
-
);
|
|
67
|
-
} else {
|
|
68
|
-
const value = values[0];
|
|
69
|
-
data[key] = fieldDef.transform && value ? fieldDef.transform(value) : value;
|
|
70
|
-
}
|
|
71
|
-
}
|
|
72
|
-
}
|
|
73
|
-
return data;
|
|
74
|
-
};
|
|
75
|
-
var createScraper = ({
|
|
76
|
-
fields,
|
|
77
|
-
validator
|
|
78
|
-
}) => {
|
|
79
|
-
return (html) => {
|
|
80
|
-
const $ = typeof html === "string" ? cheerio.load(html) : html;
|
|
81
|
-
const data = extractData(fields, $);
|
|
82
|
-
return validator.validate(data);
|
|
83
|
-
};
|
|
84
|
-
};
|
|
85
39
|
|
|
86
|
-
// src/validators
|
|
87
|
-
var
|
|
88
|
-
var
|
|
89
|
-
|
|
90
|
-
constructor(schema) {
|
|
40
|
+
// src/validators.ts
|
|
41
|
+
var import_zod = require("zod");
|
|
42
|
+
var Validator = class {
|
|
43
|
+
constructor(schema, validateFunction) {
|
|
91
44
|
this.schema = schema;
|
|
45
|
+
this.validateFunction = validateFunction;
|
|
92
46
|
}
|
|
93
47
|
validate(data) {
|
|
94
|
-
|
|
95
|
-
|
|
48
|
+
try {
|
|
49
|
+
const result = this.validateFunction(this.schema, data);
|
|
50
|
+
return { success: true, data: result };
|
|
51
|
+
} catch (error) {
|
|
52
|
+
return { success: false, error };
|
|
53
|
+
}
|
|
96
54
|
}
|
|
97
55
|
};
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
56
|
+
function getSchemaBuilder(type) {
|
|
57
|
+
switch (type) {
|
|
58
|
+
case "zod":
|
|
59
|
+
return import_zod.z;
|
|
60
|
+
default:
|
|
61
|
+
throw new Error(`Unsupported validator type: ${type}`);
|
|
104
62
|
}
|
|
105
|
-
|
|
106
|
-
|
|
63
|
+
}
|
|
64
|
+
function createValidator(type, schemaFn) {
|
|
65
|
+
const builder = getSchemaBuilder(type);
|
|
66
|
+
const schema = schemaFn(builder);
|
|
67
|
+
switch (type) {
|
|
68
|
+
case "zod":
|
|
69
|
+
return new Validator(
|
|
70
|
+
schema,
|
|
71
|
+
(schema2, data) => schema2.parse(data)
|
|
72
|
+
);
|
|
73
|
+
default:
|
|
74
|
+
throw new Error(`Unsupported validator type: ${type}`);
|
|
107
75
|
}
|
|
108
|
-
}
|
|
76
|
+
}
|
|
109
77
|
|
|
110
|
-
// src/
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
78
|
+
// src/defineScraper.ts
|
|
79
|
+
function defineScraper(config) {
|
|
80
|
+
const validator = createValidator(config.validator, config.schema);
|
|
81
|
+
return async (html) => {
|
|
82
|
+
try {
|
|
83
|
+
const $ = cheerio.load(html);
|
|
84
|
+
const extractedData = $.extract(config.extract);
|
|
85
|
+
const validationResult = validator.validate(extractedData);
|
|
86
|
+
if (!validationResult.success) {
|
|
87
|
+
return { error: validationResult.error };
|
|
88
|
+
}
|
|
89
|
+
if (!validationResult.data) {
|
|
90
|
+
return {
|
|
91
|
+
error: new Error("Validation succeeded but no data was returned")
|
|
92
|
+
};
|
|
93
|
+
}
|
|
94
|
+
if (config.transform) {
|
|
95
|
+
try {
|
|
96
|
+
const transformed = await Promise.resolve(
|
|
97
|
+
config.transform(validationResult.data)
|
|
98
|
+
);
|
|
99
|
+
return { data: transformed };
|
|
100
|
+
} catch (error) {
|
|
101
|
+
return { error };
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
return { data: validationResult.data };
|
|
105
|
+
} catch (error) {
|
|
106
|
+
return { error };
|
|
124
107
|
}
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
};
|
|
108
|
+
};
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
// src/types/main.ts
|
|
112
|
+
var import_zod2 = require("zod");
|
|
131
113
|
// Annotate the CommonJS export names for ESM import in node:
|
|
132
114
|
0 && (module.exports = {
|
|
133
|
-
|
|
134
|
-
JoiValidator,
|
|
135
|
-
ZodValidator,
|
|
136
|
-
createScraper
|
|
115
|
+
defineScraper
|
|
137
116
|
});
|
package/dist/index.d.cts
CHANGED
|
@@ -1,48 +1,66 @@
|
|
|
1
|
-
import
|
|
2
|
-
import
|
|
3
|
-
import { ZodSchema } from 'zod';
|
|
4
|
-
import { Schema as Schema$1 } from 'joi';
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { Element } from 'domhandler';
|
|
5
3
|
|
|
6
|
-
type
|
|
7
|
-
|
|
8
|
-
validator: SchemaValidator<T>;
|
|
9
|
-
};
|
|
10
|
-
type FieldDefinition<T> = {
|
|
4
|
+
type ExtractDescriptorFn = (el: Element, key: string, obj: Record<string, unknown>) => unknown;
|
|
5
|
+
interface ExtractDescriptor {
|
|
11
6
|
selector: string;
|
|
12
|
-
|
|
13
|
-
transform?: (value: string) => T;
|
|
14
|
-
defaultValue?: T;
|
|
15
|
-
multiple?: boolean;
|
|
16
|
-
} | NestedFieldDefinition<T>;
|
|
17
|
-
type NestedFieldDefinition<T> = {
|
|
18
|
-
fields: SchemaFieldDefinitions<T>;
|
|
19
|
-
};
|
|
20
|
-
type SchemaFieldDefinitions<T> = {
|
|
21
|
-
[K in keyof T]: FieldDefinition<T[K]>;
|
|
22
|
-
};
|
|
23
|
-
interface SchemaValidator<T> {
|
|
24
|
-
validate(data: unknown): T;
|
|
7
|
+
value?: string | ExtractDescriptorFn | ExtractMap;
|
|
25
8
|
}
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
declare class EffectValidator<A, I = A> implements SchemaValidator<A> {
|
|
30
|
-
private schema;
|
|
31
|
-
constructor(schema: Schema.Schema<A, I>);
|
|
32
|
-
validate(data: unknown): A;
|
|
9
|
+
type ExtractValue = string | ExtractDescriptor | [string | ExtractDescriptor];
|
|
10
|
+
interface ExtractMap {
|
|
11
|
+
[key: string]: ExtractValue;
|
|
33
12
|
}
|
|
34
13
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
14
|
+
type ValidatorType = 'zod';
|
|
15
|
+
type ZodBuilder = typeof z;
|
|
16
|
+
type SchemaBuilder<V extends ValidatorType> = V extends 'zod' ? ZodBuilder : never;
|
|
17
|
+
type SchemaFunction<V extends ValidatorType, T> = (builder: SchemaBuilder<V>) => V extends 'zod' ? z.ZodSchema<T> : never;
|
|
18
|
+
type ScraperConfig<T extends Record<string, unknown>, V extends ValidatorType, R extends T = T> = {
|
|
19
|
+
validator: V;
|
|
20
|
+
schema: SchemaFunction<V, T>;
|
|
21
|
+
extract: ExtractMap;
|
|
22
|
+
transform?: (data: T) => Promise<R> | R;
|
|
23
|
+
};
|
|
24
|
+
type BaseFieldOptions = {
|
|
25
|
+
attribute?: string;
|
|
26
|
+
};
|
|
27
|
+
type LeafFieldConfig = BaseFieldOptions & {
|
|
28
|
+
selector?: string;
|
|
29
|
+
selectorAll?: string;
|
|
30
|
+
} & ({
|
|
31
|
+
selector: string;
|
|
32
|
+
selectorAll?: never;
|
|
33
|
+
} | {
|
|
34
|
+
selector?: never;
|
|
35
|
+
selectorAll: string;
|
|
36
|
+
});
|
|
37
|
+
type FieldConfig<T> = T extends object ? T extends Array<infer U> ? LeafFieldConfig : {
|
|
38
|
+
fields: Fields<T>;
|
|
39
|
+
} : LeafFieldConfig;
|
|
40
|
+
type Fields<T> = {
|
|
41
|
+
[K in keyof T]: FieldConfig<T[K]>;
|
|
42
|
+
};
|
|
43
|
+
type ValidationResult<T> = {
|
|
44
|
+
success: boolean;
|
|
45
|
+
data?: T;
|
|
46
|
+
error?: unknown;
|
|
47
|
+
};
|
|
48
|
+
type ScraperResult<T> = {
|
|
49
|
+
data?: T;
|
|
50
|
+
error?: unknown;
|
|
51
|
+
};
|
|
40
52
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
53
|
+
/**
|
|
54
|
+
* Defines a scraper with the provided configuration.
|
|
55
|
+
*
|
|
56
|
+
* @template T - The shape of the extracted data.
|
|
57
|
+
* @template V - The type of the validator used for validation.
|
|
58
|
+
* @template R - The type of the result after optional transformation, defaults to T.
|
|
59
|
+
*
|
|
60
|
+
* @param config - The configuration object for the scraper.
|
|
61
|
+
* @returns A function that takes an HTML string and returns the scraping result, which could be
|
|
62
|
+
* a scraper result or a promise of a scraper result.
|
|
63
|
+
*/
|
|
64
|
+
declare function defineScraper<T extends Record<string, unknown>, V extends ValidatorType, R extends T = T>(config: ScraperConfig<T, V, R>): (html: string) => Promise<ScraperResult<R>>;
|
|
47
65
|
|
|
48
|
-
export {
|
|
66
|
+
export { type FieldConfig, type Fields, type LeafFieldConfig, type SchemaBuilder, type SchemaFunction, type ScraperConfig, type ScraperResult, type ValidationResult, type ValidatorType, defineScraper };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,48 +1,66 @@
|
|
|
1
|
-
import
|
|
2
|
-
import
|
|
3
|
-
import { ZodSchema } from 'zod';
|
|
4
|
-
import { Schema as Schema$1 } from 'joi';
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { Element } from 'domhandler';
|
|
5
3
|
|
|
6
|
-
type
|
|
7
|
-
|
|
8
|
-
validator: SchemaValidator<T>;
|
|
9
|
-
};
|
|
10
|
-
type FieldDefinition<T> = {
|
|
4
|
+
type ExtractDescriptorFn = (el: Element, key: string, obj: Record<string, unknown>) => unknown;
|
|
5
|
+
interface ExtractDescriptor {
|
|
11
6
|
selector: string;
|
|
12
|
-
|
|
13
|
-
transform?: (value: string) => T;
|
|
14
|
-
defaultValue?: T;
|
|
15
|
-
multiple?: boolean;
|
|
16
|
-
} | NestedFieldDefinition<T>;
|
|
17
|
-
type NestedFieldDefinition<T> = {
|
|
18
|
-
fields: SchemaFieldDefinitions<T>;
|
|
19
|
-
};
|
|
20
|
-
type SchemaFieldDefinitions<T> = {
|
|
21
|
-
[K in keyof T]: FieldDefinition<T[K]>;
|
|
22
|
-
};
|
|
23
|
-
interface SchemaValidator<T> {
|
|
24
|
-
validate(data: unknown): T;
|
|
7
|
+
value?: string | ExtractDescriptorFn | ExtractMap;
|
|
25
8
|
}
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
declare class EffectValidator<A, I = A> implements SchemaValidator<A> {
|
|
30
|
-
private schema;
|
|
31
|
-
constructor(schema: Schema.Schema<A, I>);
|
|
32
|
-
validate(data: unknown): A;
|
|
9
|
+
type ExtractValue = string | ExtractDescriptor | [string | ExtractDescriptor];
|
|
10
|
+
interface ExtractMap {
|
|
11
|
+
[key: string]: ExtractValue;
|
|
33
12
|
}
|
|
34
13
|
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
14
|
+
type ValidatorType = 'zod';
|
|
15
|
+
type ZodBuilder = typeof z;
|
|
16
|
+
type SchemaBuilder<V extends ValidatorType> = V extends 'zod' ? ZodBuilder : never;
|
|
17
|
+
type SchemaFunction<V extends ValidatorType, T> = (builder: SchemaBuilder<V>) => V extends 'zod' ? z.ZodSchema<T> : never;
|
|
18
|
+
type ScraperConfig<T extends Record<string, unknown>, V extends ValidatorType, R extends T = T> = {
|
|
19
|
+
validator: V;
|
|
20
|
+
schema: SchemaFunction<V, T>;
|
|
21
|
+
extract: ExtractMap;
|
|
22
|
+
transform?: (data: T) => Promise<R> | R;
|
|
23
|
+
};
|
|
24
|
+
type BaseFieldOptions = {
|
|
25
|
+
attribute?: string;
|
|
26
|
+
};
|
|
27
|
+
type LeafFieldConfig = BaseFieldOptions & {
|
|
28
|
+
selector?: string;
|
|
29
|
+
selectorAll?: string;
|
|
30
|
+
} & ({
|
|
31
|
+
selector: string;
|
|
32
|
+
selectorAll?: never;
|
|
33
|
+
} | {
|
|
34
|
+
selector?: never;
|
|
35
|
+
selectorAll: string;
|
|
36
|
+
});
|
|
37
|
+
type FieldConfig<T> = T extends object ? T extends Array<infer U> ? LeafFieldConfig : {
|
|
38
|
+
fields: Fields<T>;
|
|
39
|
+
} : LeafFieldConfig;
|
|
40
|
+
type Fields<T> = {
|
|
41
|
+
[K in keyof T]: FieldConfig<T[K]>;
|
|
42
|
+
};
|
|
43
|
+
type ValidationResult<T> = {
|
|
44
|
+
success: boolean;
|
|
45
|
+
data?: T;
|
|
46
|
+
error?: unknown;
|
|
47
|
+
};
|
|
48
|
+
type ScraperResult<T> = {
|
|
49
|
+
data?: T;
|
|
50
|
+
error?: unknown;
|
|
51
|
+
};
|
|
40
52
|
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
53
|
+
/**
|
|
54
|
+
* Defines a scraper with the provided configuration.
|
|
55
|
+
*
|
|
56
|
+
* @template T - The shape of the extracted data.
|
|
57
|
+
* @template V - The type of the validator used for validation.
|
|
58
|
+
* @template R - The type of the result after optional transformation, defaults to T.
|
|
59
|
+
*
|
|
60
|
+
* @param config - The configuration object for the scraper.
|
|
61
|
+
* @returns A function that takes an HTML string and returns the scraping result, which could be
|
|
62
|
+
* a scraper result or a promise of a scraper result.
|
|
63
|
+
*/
|
|
64
|
+
declare function defineScraper<T extends Record<string, unknown>, V extends ValidatorType, R extends T = T>(config: ScraperConfig<T, V, R>): (html: string) => Promise<ScraperResult<R>>;
|
|
47
65
|
|
|
48
|
-
export {
|
|
66
|
+
export { type FieldConfig, type Fields, type LeafFieldConfig, type SchemaBuilder, type SchemaFunction, type ScraperConfig, type ScraperResult, type ValidationResult, type ValidatorType, defineScraper };
|
package/dist/index.js
CHANGED
|
@@ -1,97 +1,79 @@
|
|
|
1
|
-
// src/
|
|
1
|
+
// src/defineScraper.ts
|
|
2
2
|
import * as cheerio from "cheerio";
|
|
3
|
-
var extractData = (fields, $context) => {
|
|
4
|
-
const data = {};
|
|
5
|
-
for (const key in fields) {
|
|
6
|
-
const fieldDef = fields[key];
|
|
7
|
-
if ("fields" in fieldDef) {
|
|
8
|
-
const nestedData = extractData(
|
|
9
|
-
fieldDef.fields,
|
|
10
|
-
$context
|
|
11
|
-
);
|
|
12
|
-
data[key] = nestedData;
|
|
13
|
-
} else {
|
|
14
|
-
const elements = $context(fieldDef.selector);
|
|
15
|
-
let values = [];
|
|
16
|
-
elements.each((_, element) => {
|
|
17
|
-
const value = fieldDef.attribute ? $context(element).attr(fieldDef.attribute) : $context(element).text().trim();
|
|
18
|
-
if (value !== void 0) {
|
|
19
|
-
values.push(value);
|
|
20
|
-
}
|
|
21
|
-
});
|
|
22
|
-
if (values.length === 0 && fieldDef.defaultValue !== void 0) {
|
|
23
|
-
data[key] = fieldDef.defaultValue;
|
|
24
|
-
} else if (fieldDef.multiple) {
|
|
25
|
-
data[key] = values.map(
|
|
26
|
-
(value) => fieldDef.transform ? fieldDef.transform(value) : value
|
|
27
|
-
);
|
|
28
|
-
} else {
|
|
29
|
-
const value = values[0];
|
|
30
|
-
data[key] = fieldDef.transform && value ? fieldDef.transform(value) : value;
|
|
31
|
-
}
|
|
32
|
-
}
|
|
33
|
-
}
|
|
34
|
-
return data;
|
|
35
|
-
};
|
|
36
|
-
var createScraper = ({
|
|
37
|
-
fields,
|
|
38
|
-
validator
|
|
39
|
-
}) => {
|
|
40
|
-
return (html) => {
|
|
41
|
-
const $ = typeof html === "string" ? cheerio.load(html) : html;
|
|
42
|
-
const data = extractData(fields, $);
|
|
43
|
-
return validator.validate(data);
|
|
44
|
-
};
|
|
45
|
-
};
|
|
46
3
|
|
|
47
|
-
// src/validators
|
|
48
|
-
import
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
constructor(schema) {
|
|
4
|
+
// src/validators.ts
|
|
5
|
+
import { z } from "zod";
|
|
6
|
+
var Validator = class {
|
|
7
|
+
constructor(schema, validateFunction) {
|
|
52
8
|
this.schema = schema;
|
|
9
|
+
this.validateFunction = validateFunction;
|
|
53
10
|
}
|
|
54
11
|
validate(data) {
|
|
55
|
-
|
|
56
|
-
|
|
12
|
+
try {
|
|
13
|
+
const result = this.validateFunction(this.schema, data);
|
|
14
|
+
return { success: true, data: result };
|
|
15
|
+
} catch (error) {
|
|
16
|
+
return { success: false, error };
|
|
17
|
+
}
|
|
57
18
|
}
|
|
58
19
|
};
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
20
|
+
function getSchemaBuilder(type) {
|
|
21
|
+
switch (type) {
|
|
22
|
+
case "zod":
|
|
23
|
+
return z;
|
|
24
|
+
default:
|
|
25
|
+
throw new Error(`Unsupported validator type: ${type}`);
|
|
65
26
|
}
|
|
66
|
-
|
|
67
|
-
|
|
27
|
+
}
|
|
28
|
+
function createValidator(type, schemaFn) {
|
|
29
|
+
const builder = getSchemaBuilder(type);
|
|
30
|
+
const schema = schemaFn(builder);
|
|
31
|
+
switch (type) {
|
|
32
|
+
case "zod":
|
|
33
|
+
return new Validator(
|
|
34
|
+
schema,
|
|
35
|
+
(schema2, data) => schema2.parse(data)
|
|
36
|
+
);
|
|
37
|
+
default:
|
|
38
|
+
throw new Error(`Unsupported validator type: ${type}`);
|
|
68
39
|
}
|
|
69
|
-
}
|
|
40
|
+
}
|
|
70
41
|
|
|
71
|
-
// src/
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
42
|
+
// src/defineScraper.ts
|
|
43
|
+
function defineScraper(config) {
|
|
44
|
+
const validator = createValidator(config.validator, config.schema);
|
|
45
|
+
return async (html) => {
|
|
46
|
+
try {
|
|
47
|
+
const $ = cheerio.load(html);
|
|
48
|
+
const extractedData = $.extract(config.extract);
|
|
49
|
+
const validationResult = validator.validate(extractedData);
|
|
50
|
+
if (!validationResult.success) {
|
|
51
|
+
return { error: validationResult.error };
|
|
52
|
+
}
|
|
53
|
+
if (!validationResult.data) {
|
|
54
|
+
return {
|
|
55
|
+
error: new Error("Validation succeeded but no data was returned")
|
|
56
|
+
};
|
|
57
|
+
}
|
|
58
|
+
if (config.transform) {
|
|
59
|
+
try {
|
|
60
|
+
const transformed = await Promise.resolve(
|
|
61
|
+
config.transform(validationResult.data)
|
|
62
|
+
);
|
|
63
|
+
return { data: transformed };
|
|
64
|
+
} catch (error) {
|
|
65
|
+
return { error };
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
return { data: validationResult.data };
|
|
69
|
+
} catch (error) {
|
|
70
|
+
return { error };
|
|
85
71
|
}
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
};
|
|
72
|
+
};
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
// src/types/main.ts
|
|
76
|
+
import "zod";
|
|
92
77
|
export {
|
|
93
|
-
|
|
94
|
-
JoiValidator,
|
|
95
|
-
ZodValidator,
|
|
96
|
-
createScraper
|
|
78
|
+
defineScraper
|
|
97
79
|
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "xscrape",
|
|
3
|
-
"version": "1.3.1",
|
|
3
|
+
"version": "2.0.0",
|
|
4
4
|
"description": "A flexible and powerful library designed to extract and transform data from HTML documents using user-defined schemas",
|
|
5
5
|
"main": "dist/index.js",
|
|
6
6
|
"exports": {
|
|
@@ -39,26 +39,30 @@
|
|
|
39
39
|
},
|
|
40
40
|
"homepage": "https://github.com/johnie/xscrape#readme",
|
|
41
41
|
"devDependencies": {
|
|
42
|
-
"@arethetypeswrong/cli": "^0.
|
|
43
|
-
"@changesets/changelog-github": "^0.5.
|
|
44
|
-
"@changesets/cli": "^2.
|
|
45
|
-
"
|
|
46
|
-
"
|
|
47
|
-
"
|
|
48
|
-
"
|
|
49
|
-
"
|
|
42
|
+
"@arethetypeswrong/cli": "^0.17.3",
|
|
43
|
+
"@changesets/changelog-github": "^0.5.1",
|
|
44
|
+
"@changesets/cli": "^2.28.1",
|
|
45
|
+
"domhandler": "^5.0.3",
|
|
46
|
+
"jsdom": "^26.0.0",
|
|
47
|
+
"prettier": "^3.5.1",
|
|
48
|
+
"tsup": "^8.3.6",
|
|
49
|
+
"typescript": "^5.7.3",
|
|
50
|
+
"vite": "^6.1.1",
|
|
51
|
+
"vitest": "^3.0.6"
|
|
50
52
|
},
|
|
51
53
|
"dependencies": {
|
|
52
54
|
"cheerio": "^1.0.0",
|
|
53
|
-
"effect": "^3.
|
|
55
|
+
"effect": "^3.13.2",
|
|
54
56
|
"joi": "^17.13.3",
|
|
55
|
-
"
|
|
57
|
+
"yup": "^1.6.1",
|
|
58
|
+
"zod": "^3.24.2"
|
|
56
59
|
},
|
|
57
60
|
"scripts": {
|
|
58
61
|
"build": "tsup",
|
|
59
62
|
"ci": "npm run build && npm run check-format && npm run check-exports && npm run lint && npm run test",
|
|
60
63
|
"lint": "tsc",
|
|
61
64
|
"test": "vitest run",
|
|
65
|
+
"test:watch": "vitest",
|
|
62
66
|
"format": "prettier --write ./src",
|
|
63
67
|
"check-format": "prettier --check ./src",
|
|
64
68
|
"check-exports": "attw --pack .",
|