npm - xscrape - Versions diffs - 2.0.0 → 3.0.0 - Mend

xscrape 2.0.0 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

package/README.md (changed)

@@ -1,30 +1,20 @@
 # xscrape
-`xscrape` is a powerful and flexible library designed for extracting and transforming data from HTML documents using user-defined schemas.
+`xscrape` is a powerful and flexible library designed for extracting and transforming data from HTML documents using user-defined schemas. It now supports any validation library that implements the **Standard Schema**, allowing you to bring your own schema for robust, type-safe data validation.
 ## Features
-- **HTML Parsing**: Extract data from HTML using CSS selectors with the help of
-  [cheerio](https://github.com/cheeriojs/cheerio).
-- **Schema Validation**: Validate and transform extracted data with schema validation libraries like [Zod](https://github.com/colinhacks/zod).
-- **Custom Transformations**: Provide custom transformations for extractedattributes.
-- **Default Values**: Define default values for missing data fields.
-- **Nested Field Support**: Define and extract nested data structures from
-  HTML elements.
+  * **HTML Parsing**: Extract data from HTML using CSS selectors with the help of [cheerio](https://github.com/cheeriojs/cheerio).
+  * **Flexible Schema Validation**: Validate and transform extracted data with any validation library that implements the [Standard Schema](https://standardschema.dev), such as Zod, Valibot, ArkType, and Effect Schema.
+  * **Custom Transformations**: Provide custom transformations for extracted attributes.
+  * **Default Values**: Define default values for missing data fields through your chosen schema library's features.
+  * **Nested Field Support**: Define and extract nested data structures from HTML elements.
-### Schema Support
-| Schema Library                                       | Status              | Notes                                                         |
-| ---------------------------------------------------- | ------------------- | ------------------------------------------------------------- |
-| [Zod](https://github.com/colinhacks/zod)             | ✅ Supported        | Default schema tool for `xscrape`                             |
-| [Effect/Schema](https://github.com/Effect-TS/effect) | 🔄 In Consideration        | Support for Effect/Schema for additional flexibility          |
-| [Joi](https://github.com/sideway/joi)                | 🔄 In Consideration        | Support for Joi for validation                                |
-| [Yup](https://github.com/jquense/yup)                | 🔄 In Consideration        | Support for Yup for validation                                |
-| Others...                                            | 🔄 In Consideration | Potential support for other schema tools as per user feedback |
+-----
 ## Installation
-To install this library, use npm or yarn:
+To install this library, use your preferred package manager:
 ```bash
 pnpm add xscrape
@@ -32,142 +22,165 @@ pnpm add xscrape
 npm install xscrape
 ```
-## Usage
+You will also need to install your chosen schema validation library, for example, Zod:
-Below is an example of how to use xscrape for extracting and transforming data
-from an HTML document:
+```bash
+pnpm add zod
+# or
+npm install zod
+```
+-----
-1. Define Your Schema
+## Usage
+Below is an example of how to use `xscrape` with a Zod schema to extract and transform data from an HTML document.
 ```ts
+import { defineScraper } from 'xscrape';
 import { z } from 'zod';
-const schema = z.object({
-  title: z.string().default('No title'),
-  description: z.string(),
-  keywords: z.array(z.string()),
-  views: z.number(),
-  image: z
-    .object({
-      url: z.string(),
-      width: z.number(),
-      height: z.number(),
-    })
-    .default({ url: '', width: 0, height: 0 })
-    .optional(),
+const scraper = defineScraper({
+  schema: z.object({
+    title: z.string(),
+    description: z.string(),
+    keywords: z.array(z.string()),
+    views: z.coerce.number(),
+  }),
+  extract: {
+    title: {
+      selector: 'title',
+    },
+    description: {
+      selector: 'meta[name="description"]',
+      value: 'content',
+    },
+    keywords: {
+      selector: 'meta[name="keywords"]',
+      value(el) {
+        return el.attribs['content']?.split(',');
+      },
+    },
+    views: {
+      selector: 'meta[name="views"]',
+      value: 'content',
+    },
+  },
 });
-```
-2. Define Field Definitions
+const html = `
+<!DOCTYPE html>
+<html>
+<head>
+  <meta name="description" content="An example description.">
+  <meta name="keywords" content="typescript,html,parsing">
+  <meta name="views" content="1234">
+  <title>Example Title</title>
+</head>
+<body></body>
+</html>
+`;
+const { data, error } = await scraper(html);
+console.log(data);
-```ts
-import { type SchemaFieldDefinitions } from 'xscrape';
+// Outputs:
+// {
+//   title: 'Example Title',
+//   description: 'An example description.',
+//   keywords: ['typescript', 'html', 'parsing'],
+//   views: 1234
+// }
+```
-type FieldDefinitions = SchemaFieldDefinitions<z.infer<typeof schema>>;
+### Handling Missing Data
-const fields: FieldDefinitions = {
-  title: { selector: 'title' },
-  description: {
-    selector: 'meta[name="description"]',
-    attribute: 'content',
+You can handle missing data by using the features of your chosen schema library, such as default values in Zod.
-    defaultValue: 'No description',
-  },
-  keywords: {
-    selector: 'meta[name="keywords"]',
-    attribute: 'content',
-    transform: (value) => value.split(','),
-    defaultValue: [],
-  },
-  views: {
-    selector: 'meta[name="views"]',
-    attribute: 'content',
-    transform: (value) => parseInt(value, 10),
-    defaultValue: 0,
-  },
-  // Example of a nested field
-  image: {
-    fields: {
-      url: {
-        selector: 'meta[property="og:image"]',
-        attribute: 'content',
-      },
-      width: {
-        selector: 'meta[property="og:image:width"]',
-        attribute: 'content',
-        transform: (value) => parseInt(value, 10),
-      },
-      height: {
-        selector: 'meta[property="og:image:height"]',
-        attribute: 'content',
-        transform: (value) => parseInt(value, 10),
-      },
+```ts
+import { defineScraper } from 'xscrape';
+import { z } from 'zod';
+const scraper = defineScraper({
+  schema: z.object({
+    title: z.string().default('No title'),
+    description: z.string().default('No description'),
+    views: z.coerce.number().default(0),
+  }),
+  extract: {
+    title: {
+      selector: 'title',
+    },
+    description: {
+      selector: 'meta[name="description"]',
+      value: 'content',
+    },
+    views: {
+      selector: 'meta[name="views"]',
+      value: 'content',
     },
   },
-};
+});
 ```
-3. Create a Scraper and Extract Data
-```ts
-import { createScraper, ZodValidator } from 'xscrape';
+### Nested Fields
-const validator = new ZodValidator(schema);
-const scraper = createScraper({ fields, validator });
+`xscrape` also supports extracting nested data structures.
-const html = `
-   <!DOCTYPE html>
-   <html>
-   <head>
-     <meta name="description" content="An example description.">
-     <meta name="keywords" content="typescript,html,parsing">
-     <meta name="views" content="1234">
-     <meta property="og:image" content="https://example.se/images/c12ffe73-3227-4a4a-b8ad-a3003cdf1d70?h=708&amp;tight=false&amp;w=1372">
-     <meta property="og:image:width" content="1372">
-     <meta property="og:image:height" content="708">
-     <title>Example Title</title>
-   </head>
-   <body></body>
-   </html>
-   `;
-const data = scraper(html);
-console.log(data);
+```ts
+import { defineScraper } from 'xscrape';
+import { z } from 'zod';
-// Outputs:
-// {
-// title: 'Example Title',
-// description: 'An example description.',
-// keywords: ['typescript', 'html', 'parsing'],
-// views: 1234
-// image: {
-//   url: 'https://example.se/images/c12ffe73-3227-4a4a-b8ad-a3003cdf1d70?h=708&amp;tight=false&amp;w=1372',
-//   width: 1372,
-//   height: 708
-// }
-// }
+const scraper = defineScraper({
+  schema: z.object({
+    title: z.string(),
+    image: z.object({
+      url: z.string().url(),
+      width: z.coerce.number(),
+      height: z.coerce.number(),
+    }).default({ url: '', width: 0, height: 0 }).optional(),
+  }),
+  extract: {
+    title: {
+      selector: 'title',
+    },
+    image: {
+      selector: 'head',
+      value: {
+        url: {
+          selector: 'meta[property="og:image"]',
+          value: 'content',
+        },
+        width: {
+          selector: 'meta[property="og:image:width"]',
+          value: 'content',
+        },
+        height: {
+          selector: 'meta[property="og:image:height"]',
+          value: 'content',
+        },
+      },
+    },
+  },
+});
 ```
-## Configuration
-xscrape offers a range of configuration options through the types provided,
-allowing for detailed customization and robust data extraction and validation:
+-----
-- `SchemaFieldDefinitions`: Determines how fields are extracted from the HTML.
-- `SchemaValidator`: Validates the extracted data according to defined schemas.
+## Configuration
-## API Reference
+The `defineScraper` function accepts a configuration object with the following properties:
-- `createScraper(config: ScrapeConfig): (html: string) => T` Creates a scraping function based on the specified fields and validator.
-- `ZodValidator` A built-in validator using Zod, allowing you to define schemas andvalidate data effortlessly.
+  * **`schema`**: A schema object from any library that implements the [Standard Schema](https://standardschema.dev) interface. This schema defines the shape and validation rules for the extracted data.
+  * **`extract`**: An object that determines how fields are extracted from the HTML using CSS selectors.
+  * **`transform`** (optional): A function to apply custom transformations to the validated data.
-For a complete list of API methods and more advanced configuration options,refer to the documentation on the project homepage https://github.com/johnie/xscrape.
+-----
 ## Contributing
-Contributions are welcome! Please see the Contributing Guide https://github.com/johnie/xscrape/blob/main/CONTRIBUTING.md for more information.
+Contributions are welcome\! Please see the [Contributing Guide](https://github.com/johnie/xscrape/blob/main/CONTRIBUTING.md) for more information.
 ## License
-This project is licensed under the MIT License. See the LICENSE
-https://github.com/johnie/xscrape/blob/main/LICENSE file for details.
+This project is licensed under the MIT License. See the [LICENSE](https://github.com/johnie/xscrape/blob/main/LICENSE) file for details.

package/dist/index.cjs (changed)

@@ -28,88 +28,48 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
 var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
 // src/index.ts
-var src_exports = {};
-__export(src_exports, {
+var index_exports = {};
+__export(index_exports, {
   defineScraper: () => defineScraper
 });
-module.exports = __toCommonJS(src_exports);
+module.exports = __toCommonJS(index_exports);
 // src/defineScraper.ts
 var cheerio = __toESM(require("cheerio"), 1);
-// src/validators.ts
-var import_zod = require("zod");
-var Validator = class {
-  constructor(schema, validateFunction) {
-    this.schema = schema;
-    this.validateFunction = validateFunction;
-  }
-  validate(data) {
-    try {
-      const result = this.validateFunction(this.schema, data);
-      return { success: true, data: result };
-    } catch (error) {
-      return { success: false, error };
-    }
-  }
-};
-function getSchemaBuilder(type) {
-  switch (type) {
-    case "zod":
-      return import_zod.z;
-    default:
-      throw new Error(`Unsupported validator type: ${type}`);
-  }
-}
-function createValidator(type, schemaFn) {
-  const builder = getSchemaBuilder(type);
-  const schema = schemaFn(builder);
-  switch (type) {
-    case "zod":
-      return new Validator(
-        schema,
-        (schema2, data) => schema2.parse(data)
-      );
-    default:
-      throw new Error(`Unsupported validator type: ${type}`);
-  }
-}
-// src/defineScraper.ts
 function defineScraper(config) {
-  const validator = createValidator(config.validator, config.schema);
   return async (html) => {
     try {
       const $ = cheerio.load(html);
       const extractedData = $.extract(config.extract);
-      const validationResult = validator.validate(extractedData);
-      if (!validationResult.success) {
-        return { error: validationResult.error };
+      const validationResult = await Promise.resolve(
+        config.schema["~standard"].validate(extractedData)
+      );
+      if (validationResult.issues) {
+        return { error: validationResult.issues };
       }
-      if (!validationResult.data) {
+      if (!("value" in validationResult)) {
         return {
-          error: new Error("Validation succeeded but no data was returned")
+          error: new Error(
+            "xscrape: Validation succeeded but no data was returned"
+          )
         };
       }
       if (config.transform) {
         try {
           const transformed = await Promise.resolve(
-            config.transform(validationResult.data)
+            config.transform(validationResult.value)
           );
           return { data: transformed };
         } catch (error) {
           return { error };
         }
       }
-      return { data: validationResult.data };
+      return { data: validationResult.value };
     } catch (error) {
       return { error };
     }
   };
 }
-// src/types/main.ts
-var import_zod2 = require("zod");
 // Annotate the CommonJS export names for ESM import in node:
 0 && (module.exports = {
   defineScraper

package/dist/index.d.cts (changed)

@@ -1,4 +1,4 @@
-import { z } from 'zod';
+import { StandardSchemaV1 } from '@standard-schema/spec';
 import { Element } from 'domhandler';
 type ExtractDescriptorFn = (el: Element, key: string, obj: Record<string, unknown>) => unknown;
@@ -11,34 +11,10 @@ interface ExtractMap {
     [key: string]: ExtractValue;
 }
-type ValidatorType = 'zod';
-type ZodBuilder = typeof z;
-type SchemaBuilder<V extends ValidatorType> = V extends 'zod' ? ZodBuilder : never;
-type SchemaFunction<V extends ValidatorType, T> = (builder: SchemaBuilder<V>) => V extends 'zod' ? z.ZodSchema<T> : never;
-type ScraperConfig<T extends Record<string, unknown>, V extends ValidatorType, R extends T = T> = {
-    validator: V;
-    schema: SchemaFunction<V, T>;
+type ScraperConfig<S extends StandardSchemaV1<any, any>, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>> = {
+    schema: S;
     extract: ExtractMap;
-    transform?: (data: T) => Promise<R> | R;
-};
-type BaseFieldOptions = {
-    attribute?: string;
-};
-type LeafFieldConfig = BaseFieldOptions & {
-    selector?: string;
-    selectorAll?: string;
-} & ({
-    selector: string;
-    selectorAll?: never;
-} | {
-    selector?: never;
-    selectorAll: string;
-});
-type FieldConfig<T> = T extends object ? T extends Array<infer U> ? LeafFieldConfig : {
-    fields: Fields<T>;
-} : LeafFieldConfig;
-type Fields<T> = {
-    [K in keyof T]: FieldConfig<T[K]>;
+    transform?: (data: StandardSchemaV1.InferOutput<S>) => Promise<R> | R;
 };
 type ValidationResult<T> = {
     success: boolean;
@@ -50,17 +26,6 @@ type ScraperResult<T> = {
     error?: unknown;
 };
-/**
- * Defines a scraper with the provided configuration.
- *
- * @template T - The shape of the extracted data.
- * @template V - The type of the validator used for validation.
- * @template R - The type of the result after optional transformation, defaults to T.
- *
- * @param config - The configuration object for the scraper.
- * @returns A function that takes an HTML string and returns the scraping result, which could be
- * a scraper result or a promise of a scraper result.
- */
-declare function defineScraper<T extends Record<string, unknown>, V extends ValidatorType, R extends T = T>(config: ScraperConfig<T, V, R>): (html: string) => Promise<ScraperResult<R>>;
+declare function defineScraper<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>>(config: ScraperConfig<S, R>): (html: string) => Promise<ScraperResult<R>>;
-export { type FieldConfig, type Fields, type LeafFieldConfig, type SchemaBuilder, type SchemaFunction, type ScraperConfig, type ScraperResult, type ValidationResult, type ValidatorType, defineScraper };
+export { type ScraperConfig, type ScraperResult, type ValidationResult, defineScraper };

package/dist/index.d.ts (changed)

@@ -1,4 +1,4 @@
-import { z } from 'zod';
+import { StandardSchemaV1 } from '@standard-schema/spec';
 import { Element } from 'domhandler';
 type ExtractDescriptorFn = (el: Element, key: string, obj: Record<string, unknown>) => unknown;
@@ -11,34 +11,10 @@ interface ExtractMap {
     [key: string]: ExtractValue;
 }
-type ValidatorType = 'zod';
-type ZodBuilder = typeof z;
-type SchemaBuilder<V extends ValidatorType> = V extends 'zod' ? ZodBuilder : never;
-type SchemaFunction<V extends ValidatorType, T> = (builder: SchemaBuilder<V>) => V extends 'zod' ? z.ZodSchema<T> : never;
-type ScraperConfig<T extends Record<string, unknown>, V extends ValidatorType, R extends T = T> = {
-    validator: V;
-    schema: SchemaFunction<V, T>;
+type ScraperConfig<S extends StandardSchemaV1<any, any>, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>> = {
+    schema: S;
     extract: ExtractMap;
-    transform?: (data: T) => Promise<R> | R;
-};
-type BaseFieldOptions = {
-    attribute?: string;
-};
-type LeafFieldConfig = BaseFieldOptions & {
-    selector?: string;
-    selectorAll?: string;
-} & ({
-    selector: string;
-    selectorAll?: never;
-} | {
-    selector?: never;
-    selectorAll: string;
-});
-type FieldConfig<T> = T extends object ? T extends Array<infer U> ? LeafFieldConfig : {
-    fields: Fields<T>;
-} : LeafFieldConfig;
-type Fields<T> = {
-    [K in keyof T]: FieldConfig<T[K]>;
+    transform?: (data: StandardSchemaV1.InferOutput<S>) => Promise<R> | R;
 };
 type ValidationResult<T> = {
     success: boolean;
@@ -50,17 +26,6 @@ type ScraperResult<T> = {
     error?: unknown;
 };
-/**
- * Defines a scraper with the provided configuration.
- *
- * @template T - The shape of the extracted data.
- * @template V - The type of the validator used for validation.
- * @template R - The type of the result after optional transformation, defaults to T.
- *
- * @param config - The configuration object for the scraper.
- * @returns A function that takes an HTML string and returns the scraping result, which could be
- * a scraper result or a promise of a scraper result.
- */
-declare function defineScraper<T extends Record<string, unknown>, V extends ValidatorType, R extends T = T>(config: ScraperConfig<T, V, R>): (html: string) => Promise<ScraperResult<R>>;
+declare function defineScraper<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>>(config: ScraperConfig<S, R>): (html: string) => Promise<ScraperResult<R>>;
-export { type FieldConfig, type Fields, type LeafFieldConfig, type SchemaBuilder, type SchemaFunction, type ScraperConfig, type ScraperResult, type ValidationResult, type ValidatorType, defineScraper };
+export { type ScraperConfig, type ScraperResult, type ValidationResult, defineScraper };

package/dist/index.js (changed)

@@ -1,79 +1,39 @@
 // src/defineScraper.ts
 import * as cheerio from "cheerio";
-// src/validators.ts
-import { z } from "zod";
-var Validator = class {
-  constructor(schema, validateFunction) {
-    this.schema = schema;
-    this.validateFunction = validateFunction;
-  }
-  validate(data) {
-    try {
-      const result = this.validateFunction(this.schema, data);
-      return { success: true, data: result };
-    } catch (error) {
-      return { success: false, error };
-    }
-  }
-};
-function getSchemaBuilder(type) {
-  switch (type) {
-    case "zod":
-      return z;
-    default:
-      throw new Error(`Unsupported validator type: ${type}`);
-  }
-}
-function createValidator(type, schemaFn) {
-  const builder = getSchemaBuilder(type);
-  const schema = schemaFn(builder);
-  switch (type) {
-    case "zod":
-      return new Validator(
-        schema,
-        (schema2, data) => schema2.parse(data)
-      );
-    default:
-      throw new Error(`Unsupported validator type: ${type}`);
-  }
-}
-// src/defineScraper.ts
 function defineScraper(config) {
-  const validator = createValidator(config.validator, config.schema);
   return async (html) => {
     try {
       const $ = cheerio.load(html);
       const extractedData = $.extract(config.extract);
-      const validationResult = validator.validate(extractedData);
-      if (!validationResult.success) {
-        return { error: validationResult.error };
+      const validationResult = await Promise.resolve(
+        config.schema["~standard"].validate(extractedData)
+      );
+      if (validationResult.issues) {
+        return { error: validationResult.issues };
       }
-      if (!validationResult.data) {
+      if (!("value" in validationResult)) {
         return {
-          error: new Error("Validation succeeded but no data was returned")
+          error: new Error(
+            "xscrape: Validation succeeded but no data was returned"
+          )
         };
       }
       if (config.transform) {
         try {
           const transformed = await Promise.resolve(
-            config.transform(validationResult.data)
+            config.transform(validationResult.value)
           );
           return { data: transformed };
         } catch (error) {
           return { error };
         }
       }
-      return { data: validationResult.data };
+      return { data: validationResult.value };
     } catch (error) {
       return { error };
     }
   };
 }
-// src/types/main.ts
-import "zod";
 export {
   defineScraper
 };

package/package.json (changed)

@@ -1,6 +1,6 @@
 {
   "name": "xscrape",
-  "version": "2.0.0",
+  "version": "3.0.0",
   "description": "A flexible and powerful library designed to extract and transform data from HTML documents using user-defined schemas",
   "main": "dist/index.js",
   "exports": {
@@ -39,23 +39,24 @@
   },
   "homepage": "https://github.com/johnie/xscrape#readme",
   "devDependencies": {
-    "@arethetypeswrong/cli": "^0.17.3",
+    "@arethetypeswrong/cli": "^0.18.2",
     "@changesets/changelog-github": "^0.5.1",
-    "@changesets/cli": "^2.28.1",
-    "domhandler": "^5.0.3",
-    "jsdom": "^26.0.0",
-    "prettier": "^3.5.1",
-    "tsup": "^8.3.6",
-    "typescript": "^5.7.3",
-    "vite": "^6.1.1",
-    "vitest": "^3.0.6"
+    "@changesets/cli": "^2.29.5",
+    "arktype": "^2.1.20",
+    "effect": "^3.16.12",
+    "jsdom": "^26.1.0",
+    "prettier": "^3.6.2",
+    "tsup": "^8.5.0",
+    "typescript": "^5.8.3",
+    "valibot": "^1.1.0",
+    "vite": "^7.0.4",
+    "vitest": "^3.2.4",
+    "zod": "^4.0.2"
   },
   "dependencies": {
-    "cheerio": "^1.0.0",
-    "effect": "^3.13.2",
-    "joi": "^17.13.3",
-    "yup": "^1.6.1",
-    "zod": "^3.24.2"
+    "@standard-schema/spec": "^1.0.0",
+    "cheerio": "^1.1.0",
+    "domhandler": "^5.0.3"
   },
   "scripts": {
     "build": "tsup",