xscrape 1.3.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,9 +1,6 @@
1
1
  # xscrape
2
2
 
3
- `xscrape` is a powerful and flexible library designed for extracting and
4
- transforming data from HTML documents using user-defined schemas. It integrates
5
- seamlessly with various schema validation libraries such as Zod, Yup, Joi, and
6
- Effect Schema, allowing you to use your preferred validation tool.
3
+ `xscrape` is a powerful and flexible library designed for extracting and transforming data from HTML documents using user-defined schemas.
7
4
 
8
5
  ## Features
9
6
 
@@ -17,13 +14,13 @@ Effect Schema, allowing you to use your preferred validation tool.
17
14
 
18
15
  ### Schema Support
19
16
 
20
- | Schema Library | Status | Notes |
21
- | ---------------------------------------------------- | ------------------- | ------------------------------------------------------------------ |
22
- | [Zod](https://github.com/colinhacks/zod) | ✅ Supported | Default schema tool for `xscrape` |
23
- | [Effect/Schema](https://github.com/Effect-TS/effect) | Supported | Support for Effect/Schema for additional flexibility |
24
- | [Joi](https://github.com/sideway/joi) | 🚧 Planned | Support for Joi for those familiar with server-side validation |
25
- | [Yup](https://github.com/jquense/yup) | 🚧 Planned | Adding Yup support for schema validation in front-end applications |
26
- | Others... | 🔄 In Consideration | Potential support for other schema tools as per user feedback |
17
+ | Schema Library | Status | Notes |
18
+ | ---------------------------------------------------- | ------------------- | ------------------------------------------------------------- |
19
+ | [Zod](https://github.com/colinhacks/zod) | ✅ Supported | Default schema tool for `xscrape` |
20
+ | [Effect/Schema](https://github.com/Effect-TS/effect) | 🔄 In Consideration | Support for Effect/Schema for additional flexibility |
21
+ | [Joi](https://github.com/sideway/joi) | 🔄 In Consideration | Support for Joi for validation |
22
+ | [Yup](https://github.com/jquense/yup) | 🔄 In Consideration | Support for Yup for validation |
23
+ | Others... | 🔄 In Consideration | Potential support for other schema tools as per user feedback |
27
24
 
28
25
  ## Installation
29
26
 
package/dist/index.cjs CHANGED
@@ -30,84 +30,87 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
30
30
  // src/index.ts
31
31
  var src_exports = {};
32
32
  __export(src_exports, {
33
- EffectValidator: () => EffectValidator,
34
- ZodValidator: () => ZodValidator,
35
- createScraper: () => createScraper
33
+ defineScraper: () => defineScraper
36
34
  });
37
35
  module.exports = __toCommonJS(src_exports);
38
36
 
39
- // src/createScraper.ts
37
+ // src/defineScraper.ts
40
38
  var cheerio = __toESM(require("cheerio"), 1);
41
- var extractData = (fields, $context) => {
42
- const data = {};
43
- for (const key in fields) {
44
- const fieldDef = fields[key];
45
- if ("fields" in fieldDef) {
46
- const nestedData = extractData(
47
- fieldDef.fields,
48
- $context
49
- );
50
- data[key] = nestedData;
51
- } else {
52
- const elements = $context(fieldDef.selector);
53
- let values = [];
54
- elements.each((_, element) => {
55
- const value = fieldDef.attribute ? $context(element).attr(fieldDef.attribute) : $context(element).text().trim();
56
- if (value !== void 0) {
57
- values.push(value);
58
- }
59
- });
60
- if (values.length === 0 && fieldDef.defaultValue !== void 0) {
61
- data[key] = fieldDef.defaultValue;
62
- } else if (fieldDef.multiple) {
63
- data[key] = values.map(
64
- (value) => fieldDef.transform ? fieldDef.transform(value) : value
65
- );
66
- } else {
67
- const value = values[0];
68
- data[key] = fieldDef.transform && value ? fieldDef.transform(value) : value;
69
- }
70
- }
71
- }
72
- return data;
73
- };
74
- var createScraper = ({
75
- fields,
76
- validator
77
- }) => {
78
- return (html) => {
79
- const $ = typeof html === "string" ? cheerio.load(html) : html;
80
- const data = extractData(fields, $);
81
- return validator.validate(data);
82
- };
83
- };
84
39
 
85
- // src/validators/effect.ts
86
- var Schema = __toESM(require("effect/Schema"), 1);
87
- var import_effect = require("effect");
88
- var EffectValidator = class {
89
- constructor(schema) {
40
+ // src/validators.ts
41
+ var import_zod = require("zod");
42
+ var Validator = class {
43
+ constructor(schema, validateFunction) {
90
44
  this.schema = schema;
45
+ this.validateFunction = validateFunction;
91
46
  }
92
47
  validate(data) {
93
- const result = Schema.decodeUnknown(this.schema)(data);
94
- return import_effect.Effect.runSync(result);
48
+ try {
49
+ const result = this.validateFunction(this.schema, data);
50
+ return { success: true, data: result };
51
+ } catch (error) {
52
+ return { success: false, error };
53
+ }
95
54
  }
96
55
  };
97
-
98
- // src/validators/zod.ts
99
- var import_zod = require("zod");
100
- var ZodValidator = class {
101
- constructor(schema) {
102
- this.schema = schema;
56
+ function getSchemaBuilder(type) {
57
+ switch (type) {
58
+ case "zod":
59
+ return import_zod.z;
60
+ default:
61
+ throw new Error(`Unsupported validator type: ${type}`);
103
62
  }
104
- validate(data) {
105
- return this.schema.parse(data);
63
+ }
64
+ function createValidator(type, schemaFn) {
65
+ const builder = getSchemaBuilder(type);
66
+ const schema = schemaFn(builder);
67
+ switch (type) {
68
+ case "zod":
69
+ return new Validator(
70
+ schema,
71
+ (schema2, data) => schema2.parse(data)
72
+ );
73
+ default:
74
+ throw new Error(`Unsupported validator type: ${type}`);
106
75
  }
107
- };
76
+ }
77
+
78
+ // src/defineScraper.ts
79
+ function defineScraper(config) {
80
+ const validator = createValidator(config.validator, config.schema);
81
+ return async (html) => {
82
+ try {
83
+ const $ = cheerio.load(html);
84
+ const extractedData = $.extract(config.extract);
85
+ const validationResult = validator.validate(extractedData);
86
+ if (!validationResult.success) {
87
+ return { error: validationResult.error };
88
+ }
89
+ if (!validationResult.data) {
90
+ return {
91
+ error: new Error("Validation succeeded but no data was returned")
92
+ };
93
+ }
94
+ if (config.transform) {
95
+ try {
96
+ const transformed = await Promise.resolve(
97
+ config.transform(validationResult.data)
98
+ );
99
+ return { data: transformed };
100
+ } catch (error) {
101
+ return { error };
102
+ }
103
+ }
104
+ return { data: validationResult.data };
105
+ } catch (error) {
106
+ return { error };
107
+ }
108
+ };
109
+ }
110
+
111
+ // src/types/main.ts
112
+ var import_zod2 = require("zod");
108
113
  // Annotate the CommonJS export names for ESM import in node:
109
114
  0 && (module.exports = {
110
- EffectValidator,
111
- ZodValidator,
112
- createScraper
115
+ defineScraper
113
116
  });
package/dist/index.d.cts CHANGED
@@ -1,40 +1,66 @@
1
- import * as cheerio from 'cheerio';
2
- import * as Schema from 'effect/Schema';
3
- import { ZodSchema } from 'zod';
1
+ import { z } from 'zod';
2
+ import { Element } from 'domhandler';
4
3
 
5
- type ScrapeConfig<T> = {
6
- fields: SchemaFieldDefinitions<T>;
7
- validator: SchemaValidator<T>;
8
- };
9
- type FieldDefinition<T> = {
4
+ type ExtractDescriptorFn = (el: Element, key: string, obj: Record<string, unknown>) => unknown;
5
+ interface ExtractDescriptor {
10
6
  selector: string;
7
+ value?: string | ExtractDescriptorFn | ExtractMap;
8
+ }
9
+ type ExtractValue = string | ExtractDescriptor | [string | ExtractDescriptor];
10
+ interface ExtractMap {
11
+ [key: string]: ExtractValue;
12
+ }
13
+
14
+ type ValidatorType = 'zod';
15
+ type ZodBuilder = typeof z;
16
+ type SchemaBuilder<V extends ValidatorType> = V extends 'zod' ? ZodBuilder : never;
17
+ type SchemaFunction<V extends ValidatorType, T> = (builder: SchemaBuilder<V>) => V extends 'zod' ? z.ZodSchema<T> : never;
18
+ type ScraperConfig<T extends Record<string, unknown>, V extends ValidatorType, R extends T = T> = {
19
+ validator: V;
20
+ schema: SchemaFunction<V, T>;
21
+ extract: ExtractMap;
22
+ transform?: (data: T) => Promise<R> | R;
23
+ };
24
+ type BaseFieldOptions = {
11
25
  attribute?: string;
12
- transform?: (value: string) => T;
13
- defaultValue?: T;
14
- multiple?: boolean;
15
- } | NestedFieldDefinition<T>;
16
- type NestedFieldDefinition<T> = {
17
- fields: SchemaFieldDefinitions<T>;
18
26
  };
19
- type SchemaFieldDefinitions<T> = {
20
- [K in keyof T]: FieldDefinition<T[K]>;
27
+ type LeafFieldConfig = BaseFieldOptions & {
28
+ selector?: string;
29
+ selectorAll?: string;
30
+ } & ({
31
+ selector: string;
32
+ selectorAll?: never;
33
+ } | {
34
+ selector?: never;
35
+ selectorAll: string;
36
+ });
37
+ type FieldConfig<T> = T extends object ? T extends Array<infer U> ? LeafFieldConfig : {
38
+ fields: Fields<T>;
39
+ } : LeafFieldConfig;
40
+ type Fields<T> = {
41
+ [K in keyof T]: FieldConfig<T[K]>;
42
+ };
43
+ type ValidationResult<T> = {
44
+ success: boolean;
45
+ data?: T;
46
+ error?: unknown;
47
+ };
48
+ type ScraperResult<T> = {
49
+ data?: T;
50
+ error?: unknown;
21
51
  };
22
- interface SchemaValidator<T> {
23
- validate(data: unknown): T;
24
- }
25
-
26
- declare const createScraper: <T>({ fields, validator, }: ScrapeConfig<T>) => ((html: cheerio.CheerioAPI | string) => T);
27
-
28
- declare class EffectValidator<A, I = A> implements SchemaValidator<A> {
29
- private schema;
30
- constructor(schema: Schema.Schema<A, I>);
31
- validate(data: unknown): A;
32
- }
33
52
 
34
- declare class ZodValidator<T> implements SchemaValidator<T> {
35
- private schema;
36
- constructor(schema: ZodSchema<T>);
37
- validate(data: unknown): T;
38
- }
53
+ /**
54
+ * Defines a scraper with the provided configuration.
55
+ *
56
+ * @template T - The shape of the extracted data.
57
+ * @template V - The type of the validator used for validation.
58
+ * @template R - The type of the result after optional transformation, defaults to T.
59
+ *
60
+ * @param config - The configuration object for the scraper.
61
+ * @returns A function that takes an HTML string and returns a promise that resolves to
62
+ * a scraper result: `{ data }` on success or `{ error }` on failure.
63
+ */
64
+ declare function defineScraper<T extends Record<string, unknown>, V extends ValidatorType, R extends T = T>(config: ScraperConfig<T, V, R>): (html: string) => Promise<ScraperResult<R>>;
39
65
 
40
- export { EffectValidator, type FieldDefinition, type SchemaFieldDefinitions, type SchemaValidator, type ScrapeConfig, ZodValidator, createScraper };
66
+ export { type FieldConfig, type Fields, type LeafFieldConfig, type SchemaBuilder, type SchemaFunction, type ScraperConfig, type ScraperResult, type ValidationResult, type ValidatorType, defineScraper };
package/dist/index.d.ts CHANGED
@@ -1,40 +1,66 @@
1
- import * as cheerio from 'cheerio';
2
- import * as Schema from 'effect/Schema';
3
- import { ZodSchema } from 'zod';
1
+ import { z } from 'zod';
2
+ import { Element } from 'domhandler';
4
3
 
5
- type ScrapeConfig<T> = {
6
- fields: SchemaFieldDefinitions<T>;
7
- validator: SchemaValidator<T>;
8
- };
9
- type FieldDefinition<T> = {
4
+ type ExtractDescriptorFn = (el: Element, key: string, obj: Record<string, unknown>) => unknown;
5
+ interface ExtractDescriptor {
10
6
  selector: string;
7
+ value?: string | ExtractDescriptorFn | ExtractMap;
8
+ }
9
+ type ExtractValue = string | ExtractDescriptor | [string | ExtractDescriptor];
10
+ interface ExtractMap {
11
+ [key: string]: ExtractValue;
12
+ }
13
+
14
+ type ValidatorType = 'zod';
15
+ type ZodBuilder = typeof z;
16
+ type SchemaBuilder<V extends ValidatorType> = V extends 'zod' ? ZodBuilder : never;
17
+ type SchemaFunction<V extends ValidatorType, T> = (builder: SchemaBuilder<V>) => V extends 'zod' ? z.ZodSchema<T> : never;
18
+ type ScraperConfig<T extends Record<string, unknown>, V extends ValidatorType, R extends T = T> = {
19
+ validator: V;
20
+ schema: SchemaFunction<V, T>;
21
+ extract: ExtractMap;
22
+ transform?: (data: T) => Promise<R> | R;
23
+ };
24
+ type BaseFieldOptions = {
11
25
  attribute?: string;
12
- transform?: (value: string) => T;
13
- defaultValue?: T;
14
- multiple?: boolean;
15
- } | NestedFieldDefinition<T>;
16
- type NestedFieldDefinition<T> = {
17
- fields: SchemaFieldDefinitions<T>;
18
26
  };
19
- type SchemaFieldDefinitions<T> = {
20
- [K in keyof T]: FieldDefinition<T[K]>;
27
+ type LeafFieldConfig = BaseFieldOptions & {
28
+ selector?: string;
29
+ selectorAll?: string;
30
+ } & ({
31
+ selector: string;
32
+ selectorAll?: never;
33
+ } | {
34
+ selector?: never;
35
+ selectorAll: string;
36
+ });
37
+ type FieldConfig<T> = T extends object ? T extends Array<infer U> ? LeafFieldConfig : {
38
+ fields: Fields<T>;
39
+ } : LeafFieldConfig;
40
+ type Fields<T> = {
41
+ [K in keyof T]: FieldConfig<T[K]>;
42
+ };
43
+ type ValidationResult<T> = {
44
+ success: boolean;
45
+ data?: T;
46
+ error?: unknown;
47
+ };
48
+ type ScraperResult<T> = {
49
+ data?: T;
50
+ error?: unknown;
21
51
  };
22
- interface SchemaValidator<T> {
23
- validate(data: unknown): T;
24
- }
25
-
26
- declare const createScraper: <T>({ fields, validator, }: ScrapeConfig<T>) => ((html: cheerio.CheerioAPI | string) => T);
27
-
28
- declare class EffectValidator<A, I = A> implements SchemaValidator<A> {
29
- private schema;
30
- constructor(schema: Schema.Schema<A, I>);
31
- validate(data: unknown): A;
32
- }
33
52
 
34
- declare class ZodValidator<T> implements SchemaValidator<T> {
35
- private schema;
36
- constructor(schema: ZodSchema<T>);
37
- validate(data: unknown): T;
38
- }
53
+ /**
54
+ * Defines a scraper with the provided configuration.
55
+ *
56
+ * @template T - The shape of the extracted data.
57
+ * @template V - The type of the validator used for validation.
58
+ * @template R - The type of the result after optional transformation, defaults to T.
59
+ *
60
+ * @param config - The configuration object for the scraper.
61
+ * @returns A function that takes an HTML string and returns a promise that resolves to
62
+ * a scraper result: `{ data }` on success or `{ error }` on failure.
63
+ */
64
+ declare function defineScraper<T extends Record<string, unknown>, V extends ValidatorType, R extends T = T>(config: ScraperConfig<T, V, R>): (html: string) => Promise<ScraperResult<R>>;
39
65
 
40
- export { EffectValidator, type FieldDefinition, type SchemaFieldDefinitions, type SchemaValidator, type ScrapeConfig, ZodValidator, createScraper };
66
+ export { type FieldConfig, type Fields, type LeafFieldConfig, type SchemaBuilder, type SchemaFunction, type ScraperConfig, type ScraperResult, type ValidationResult, type ValidatorType, defineScraper };
package/dist/index.js CHANGED
@@ -1,74 +1,79 @@
1
- // src/createScraper.ts
1
+ // src/defineScraper.ts
2
2
  import * as cheerio from "cheerio";
3
- var extractData = (fields, $context) => {
4
- const data = {};
5
- for (const key in fields) {
6
- const fieldDef = fields[key];
7
- if ("fields" in fieldDef) {
8
- const nestedData = extractData(
9
- fieldDef.fields,
10
- $context
11
- );
12
- data[key] = nestedData;
13
- } else {
14
- const elements = $context(fieldDef.selector);
15
- let values = [];
16
- elements.each((_, element) => {
17
- const value = fieldDef.attribute ? $context(element).attr(fieldDef.attribute) : $context(element).text().trim();
18
- if (value !== void 0) {
19
- values.push(value);
20
- }
21
- });
22
- if (values.length === 0 && fieldDef.defaultValue !== void 0) {
23
- data[key] = fieldDef.defaultValue;
24
- } else if (fieldDef.multiple) {
25
- data[key] = values.map(
26
- (value) => fieldDef.transform ? fieldDef.transform(value) : value
27
- );
28
- } else {
29
- const value = values[0];
30
- data[key] = fieldDef.transform && value ? fieldDef.transform(value) : value;
31
- }
32
- }
33
- }
34
- return data;
35
- };
36
- var createScraper = ({
37
- fields,
38
- validator
39
- }) => {
40
- return (html) => {
41
- const $ = typeof html === "string" ? cheerio.load(html) : html;
42
- const data = extractData(fields, $);
43
- return validator.validate(data);
44
- };
45
- };
46
3
 
47
- // src/validators/effect.ts
48
- import * as Schema from "effect/Schema";
49
- import { Effect } from "effect";
50
- var EffectValidator = class {
51
- constructor(schema) {
4
+ // src/validators.ts
5
+ import { z } from "zod";
6
+ var Validator = class {
7
+ constructor(schema, validateFunction) {
52
8
  this.schema = schema;
9
+ this.validateFunction = validateFunction;
53
10
  }
54
11
  validate(data) {
55
- const result = Schema.decodeUnknown(this.schema)(data);
56
- return Effect.runSync(result);
12
+ try {
13
+ const result = this.validateFunction(this.schema, data);
14
+ return { success: true, data: result };
15
+ } catch (error) {
16
+ return { success: false, error };
17
+ }
57
18
  }
58
19
  };
59
-
60
- // src/validators/zod.ts
61
- import "zod";
62
- var ZodValidator = class {
63
- constructor(schema) {
64
- this.schema = schema;
20
+ function getSchemaBuilder(type) {
21
+ switch (type) {
22
+ case "zod":
23
+ return z;
24
+ default:
25
+ throw new Error(`Unsupported validator type: ${type}`);
65
26
  }
66
- validate(data) {
67
- return this.schema.parse(data);
27
+ }
28
+ function createValidator(type, schemaFn) {
29
+ const builder = getSchemaBuilder(type);
30
+ const schema = schemaFn(builder);
31
+ switch (type) {
32
+ case "zod":
33
+ return new Validator(
34
+ schema,
35
+ (schema2, data) => schema2.parse(data)
36
+ );
37
+ default:
38
+ throw new Error(`Unsupported validator type: ${type}`);
68
39
  }
69
- };
40
+ }
41
+
42
+ // src/defineScraper.ts
43
+ function defineScraper(config) {
44
+ const validator = createValidator(config.validator, config.schema);
45
+ return async (html) => {
46
+ try {
47
+ const $ = cheerio.load(html);
48
+ const extractedData = $.extract(config.extract);
49
+ const validationResult = validator.validate(extractedData);
50
+ if (!validationResult.success) {
51
+ return { error: validationResult.error };
52
+ }
53
+ if (!validationResult.data) {
54
+ return {
55
+ error: new Error("Validation succeeded but no data was returned")
56
+ };
57
+ }
58
+ if (config.transform) {
59
+ try {
60
+ const transformed = await Promise.resolve(
61
+ config.transform(validationResult.data)
62
+ );
63
+ return { data: transformed };
64
+ } catch (error) {
65
+ return { error };
66
+ }
67
+ }
68
+ return { data: validationResult.data };
69
+ } catch (error) {
70
+ return { error };
71
+ }
72
+ };
73
+ }
74
+
75
+ // src/types/main.ts
76
+ import "zod";
70
77
  export {
71
- EffectValidator,
72
- ZodValidator,
73
- createScraper
78
+ defineScraper
74
79
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "xscrape",
3
- "version": "1.3.0",
3
+ "version": "2.0.0",
4
4
  "description": "A flexible and powerful library designed to extract and transform data from HTML documents using user-defined schemas",
5
5
  "main": "dist/index.js",
6
6
  "exports": {
@@ -9,6 +9,7 @@
9
9
  "require": "./dist/index.cjs"
10
10
  }
11
11
  },
12
+ "typings": "dist/index.d.ts",
12
13
  "files": [
13
14
  "dist"
14
15
  ],
@@ -38,27 +39,34 @@
38
39
  },
39
40
  "homepage": "https://github.com/johnie/xscrape#readme",
40
41
  "devDependencies": {
41
- "@arethetypeswrong/cli": "^0.16.4",
42
- "@changesets/cli": "^2.27.9",
43
- "prettier": "^3.3.3",
44
- "tsup": "^8.3.5",
45
- "typescript": "^5.6.3",
46
- "vite": "^5.4.10",
47
- "vitest": "^2.1.3"
42
+ "@arethetypeswrong/cli": "^0.17.3",
43
+ "@changesets/changelog-github": "^0.5.1",
44
+ "@changesets/cli": "^2.28.1",
45
+ "domhandler": "^5.0.3",
46
+ "jsdom": "^26.0.0",
47
+ "prettier": "^3.5.1",
48
+ "tsup": "^8.3.6",
49
+ "typescript": "^5.7.3",
50
+ "vite": "^6.1.1",
51
+ "vitest": "^3.0.6"
48
52
  },
49
53
  "dependencies": {
50
54
  "cheerio": "^1.0.0",
51
- "effect": "^3.10.4",
52
- "zod": "^3.23.8"
55
+ "effect": "^3.13.2",
56
+ "joi": "^17.13.3",
57
+ "yup": "^1.6.1",
58
+ "zod": "^3.24.2"
53
59
  },
54
60
  "scripts": {
55
61
  "build": "tsup",
56
62
  "ci": "npm run build && npm run check-format && npm run check-exports && npm run lint && npm run test",
57
63
  "lint": "tsc",
58
64
  "test": "vitest run",
65
+ "test:watch": "vitest",
59
66
  "format": "prettier --write ./src",
60
67
  "check-format": "prettier --check ./src",
61
68
  "check-exports": "attw --pack .",
62
- "local-release": "npm run ci && changeset version && changeset publish"
69
+ "local-release": "npm run ci && changeset version && changeset publish",
70
+ "release": "npm run ci && changeset publish"
63
71
  }
64
72
  }