xscrape 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE.md ADDED
@@ -0,0 +1,9 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Johnie Hjelm
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,139 @@
1
+ # xscrape
2
+
3
+ `xscrape` is a powerful and flexible library designed for extracting and
4
+ transforming data from HTML documents using user-defined schemas. It integrates
5
+ seamlessly with various schema validation libraries such as Zod, Yup, Joi, and
6
+ Effect Schema, allowing you to use your preferred validation tool.
7
+
8
+ ## Features
9
+
10
+ - HTML Parsing: Extract data from HTML using CSS selectors with the help of
11
+ [cheerio](https://github.com/cheeriojs/cheerio).
12
+ - Schema Validation: Validate and transform extracted data with schema validation libraries like [Zod](https://github.com/colinhacks/zod).
13
+ - Custom Transformations: Provide custom transformations for extracted attributes.
14
+ - Default Values: Define default values for missing data fields.
15
+
16
+ ### Schema Support
17
+
18
+ | Schema Library | Status | Notes |
19
+ | ---------------------------------------------------- | ------------------- | ------------------------------------------------------------------ |
20
+ | [Zod](https://github.com/colinhacks/zod) | ✅ Supported | Default schema tool for `xscrape` |
21
+ | [Effect/Schema](https://github.com/Effect-TS/schema) | 🚧 Planned | Planned support for Effect/Schema for additional flexibility |
22
+ | [Joi](https://github.com/sideway/joi) | 🚧 Planned | Support for Joi for those familiar with server-side validation |
23
+ | [Yup](https://github.com/jquense/yup) | 🚧 Planned | Adding Yup support for schema validation in front-end applications |
24
+ | Others... | 🔄 In Consideration | Potential support for other schema tools as per user feedback |
25
+
26
+ ## Installation
27
+
28
+ To install this library, use pnpm or npm:
29
+
30
+ ```bash
31
+ pnpm add xscrape
32
+ # or
33
+ npm install xscrape
34
+ ```
35
+
36
+ ## Usage
37
+
38
+ Below is an example of how to use xscrape for extracting and transforming data
39
+ from an HTML document:
40
+
41
+ 1. Define Your Schema
42
+
43
+ ```ts
44
+ import { z } from 'zod';
45
+
46
+ const schema = z.object({
47
+ title: z.string().default('No title'),
48
+ description: z.string(),
49
+ keywords: z.array(z.string()),
50
+ views: z.number(),
51
+ });
52
+ ```
53
+
54
+ 2. Define Field Definitions
55
+
56
+ ```ts
57
+ import { type SchemaFieldDefinitions } from 'xscrape';
58
+
59
+ type FieldDefinitions = SchemaFieldDefinitions<z.infer<typeof schema>>;
60
+
61
+ const fields: FieldDefinitions = {
62
+ title: { selector: 'title' },
63
+ description: {
64
+ selector: 'meta[name="description"]',
65
+ attribute: 'content',
66
+
67
+ defaultValue: 'No description',
68
+ },
69
+ keywords: {
70
+ selector: 'meta[name="keywords"]',
71
+ attribute: 'content',
72
+ transform: (value) => value.split(','),
73
+ defaultValue: [],
74
+ },
75
+ views: {
76
+ selector: 'meta[name="views"]',
77
+ attribute: 'content',
78
+ transform: (value) => parseInt(value, 10),
79
+ defaultValue: 0,
80
+ },
81
+ };
82
+ ```
83
+
84
+ 3. Create a Scraper and Extract Data
85
+
86
+ ```ts
87
+ import { createScraper, ZodValidator } from 'xscrape';
88
+
89
+ const validator = new ZodValidator(schema);
90
+ const scraper = createScraper({ fields, validator });
91
+
92
+ const html = `
93
+ <!DOCTYPE html>
94
+ <html>
95
+ <head>
96
+ <meta name="description" content="An example description.">
97
+ <meta name="keywords" content="typescript,html,parsing">
98
+ <meta name="views" content="1234">
99
+ <title>Example Title</title>
100
+ </head>
101
+ <body></body>
102
+ </html>
103
+ `;
104
+
105
+ const data = scraper(html);
106
+ console.log(data);
107
+
108
+ // Outputs:
109
+ // {
110
+ // title: 'Example Title',
111
+ // description: 'An example description.',
112
+ // keywords: ['typescript', 'html', 'parsing'],
113
+ // views: 1234
114
+ // }
115
+ ```
116
+
117
+ ## Configuration
118
+
119
+ xscrape offers a range of configuration options through the types provided,
120
+ allowing for detailed customization and robust data extraction and validation:
121
+
122
+ - `SchemaFieldDefinitions`: Determines how fields are extracted from the HTML.
123
+ - `SchemaValidator`: Validates the extracted data according to defined schemas.
124
+
125
+ ## API Reference
126
+
127
+ - `createScraper(config: ScrapeConfig<T>): (html: string) => T` Creates a scraping function based on the specified fields and validator.
128
+ - `ZodValidator` A built-in validator using Zod, allowing you to define schemas and validate data effortlessly.
129
+
130
+ For a complete list of API methods and more advanced configuration options, refer to the documentation on the project homepage https://github.com/johnie/xscrape.
131
+
132
+ ## Contributing
133
+
134
+ Contributions are welcome! Please see the Contributing Guide https://github.com/johnie/xscrape/blob/main/CONTRIBUTING.md for more information.
135
+
136
+ ## License
137
+
138
+ This project is licensed under the MIT License. See the LICENSE
139
+ https://github.com/johnie/xscrape/blob/main/LICENSE file for details.
package/dist/index.cjs ADDED
@@ -0,0 +1,86 @@
1
+ "use strict";
2
+ var __create = Object.create; // Local aliases for Object built-ins used by the interop helpers in this file.
3
+ var __defProp = Object.defineProperty;
4
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
+ var __getOwnPropNames = Object.getOwnPropertyNames;
6
+ var __getProtoOf = Object.getPrototypeOf;
7
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
8
+ var __export = (target, all) => { // Defines each key of `all` on `target` as an enumerable getter.
9
+ for (var name in all)
10
+ __defProp(target, name, { get: all[name], enumerable: true }); // `all[name]` itself is installed as the getter function.
11
+ };
12
+ var __copyProps = (to, from, except, desc) => { // Mirrors own properties of `from` onto `to` as getters and returns `to`.
13
+ if (from && typeof from === "object" || typeof from === "function") {
14
+ for (let key of __getOwnPropNames(from))
15
+ if (!__hasOwnProp.call(to, key) && key !== except) // Skip keys `to` already owns and the excluded key.
16
+ __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable }); // Getter reads `from[key]` on access; enumerability follows the source descriptor.
17
+ }
18
+ return to;
19
+ };
20
+ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps( // Target is a new object created from mod's prototype, or a plain object when mod is null/undefined.
21
+ // If the importer is in node compatibility mode or this is not an ESM
22
+ // file that has been converted to a CommonJS file using a Babel-
23
+ // compatible transform (i.e. "__esModule" has not been set), then set
24
+ // "default" to the CommonJS "module.exports" for node compatibility.
25
+ isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
+ mod
27
+ ));
28
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod); // Copies `mod`'s properties onto a fresh object flagged with `__esModule: true`.
29
+
30
+ // src/index.ts
31
+ var src_exports = {}; // Export container; filled with getters by the __export call that follows.
32
+ __export(src_exports, {
33
+ ZodValidator: () => ZodValidator, // Arrow getters defer the lookup until the property is read.
34
+ createScraper: () => createScraper
35
+ });
36
+ module.exports = __toCommonJS(src_exports); // Publish the exports through the __esModule-flagged copy.
37
+
38
+ // src/createScraper.ts
39
+ var cheerio = __toESM(require("cheerio"), 1);
40
+ var createScraper = ({ // Builds a reusable scraper from per-field definitions and a validator.
41
+ fields,
42
+ validator
43
+ }) => {
44
+ return (html) => { // The returned function scrapes one HTML string and returns the validator's result.
45
+ const $ = cheerio.load(html); // Parse the document with cheerio.
46
+ const data = {}; // Raw extracted values, keyed by field name.
47
+ for (const key in fields) {
48
+ const fieldDef = fields[key];
49
+ const elements = $(fieldDef.selector); // Every element matching this field's selector.
50
+ let values = [];
51
+ elements.each((_, element) => {
52
+ const value = fieldDef.attribute ? $(element).attr(fieldDef.attribute) : $(element).text(); // Read the configured attribute, otherwise the element's text.
53
+ if (value !== void 0) { // Drop undefined results (e.g. the attribute is not present on the element).
54
+ values.push(value);
55
+ }
56
+ });
57
+ if (values.length === 0 && fieldDef.defaultValue !== void 0) { // Nothing extracted: fall back to the default, used as-is without transform.
58
+ data[key] = fieldDef.defaultValue;
59
+ } else if (fieldDef.multiple) { // multiple: keep every match, transforming each one when a transform is set.
60
+ data[key] = values.map(
61
+ (value) => fieldDef.transform ? fieldDef.transform(value) : value
62
+ );
63
+ } else { // Single value: only the first match is used.
64
+ const value = values[0];
65
+ data[key] = fieldDef.transform && value ? fieldDef.transform(value) : value; // NOTE(review): the truthiness check means an empty-string value bypasses transform; confirm this is intended.
66
+ }
67
+ }
68
+ return validator.validate(data); // No try/catch here, so anything validate throws reaches the caller.
69
+ };
70
+ };
71
+
72
+ // src/validators/zod.ts
73
+ var import_zod = require("zod");
74
+ var ZodValidator = class { // Validator that delegates to the schema passed to the constructor.
75
+ constructor(schema) {
76
+ this.schema = schema;
77
+ }
78
+ validate(data) {
79
+ return this.schema.parse(data); // Returns whatever schema.parse returns; errors from parse are not caught here.
80
+ }
81
+ };
82
+ // Annotate the CommonJS export names for ESM import in node:
83
+ 0 && (module.exports = { // `0 &&` short-circuits, so this assignment never executes at runtime.
84
+ ZodValidator,
85
+ createScraper
86
+ });
@@ -0,0 +1,29 @@
1
+ import { ZodSchema } from 'zod';
2
+
3
+ type ScrapeConfig<T> = { // Options accepted by createScraper.
4
+ fields: SchemaFieldDefinitions<T>; // One extraction rule per key of T.
5
+ validator: SchemaValidator<T>; // Validates the extracted object and produces the final T.
6
+ };
7
+ type FieldDefinition<T> = { // Describes how a single field is pulled out of the HTML.
8
+ selector: string; // Selector handed to cheerio to find matching elements.
9
+ attribute?: string; // Attribute to read from each match; the element's text is used when omitted.
10
+ transform?: (value: string) => T; // Converts an extracted string. NOTE(review): with `multiple` it runs per element, so the result is an array of T; confirm the typing.
11
+ defaultValue?: T; // Used when no value was extracted for this field.
12
+ multiple?: boolean; // When truthy, all matches are collected into an array instead of only the first.
13
+ };
14
+ type SchemaFieldDefinitions<T> = { // Maps every key of T to a FieldDefinition for that key's type.
15
+ [K in keyof T]: FieldDefinition<T[K]>;
16
+ };
17
+ interface SchemaValidator<T> { // Contract a validator must satisfy to be used with createScraper.
18
+ validate(data: unknown): T; // Receives the raw extracted object and returns it typed as T.
19
+ }
20
+
21
+ declare const createScraper: <T>({ fields, validator, }: ScrapeConfig<T>) => ((html: string) => T); // Returns a function that scrapes an HTML string and returns the validator's T.
22
+
23
+ declare class ZodValidator<T> implements SchemaValidator<T> { // SchemaValidator backed by a Zod schema.
24
+ private schema;
25
+ constructor(schema: ZodSchema<T>);
26
+ validate(data: unknown): T; // Implemented in the bundled JS as schema.parse(data).
27
+ }
28
+
29
+ export { type FieldDefinition, type SchemaFieldDefinitions, type SchemaValidator, type ScrapeConfig, ZodValidator, createScraper };
@@ -0,0 +1,29 @@
1
+ import { ZodSchema } from 'zod';
2
+
3
+ type ScrapeConfig<T> = { // Options accepted by createScraper.
4
+ fields: SchemaFieldDefinitions<T>; // One extraction rule per key of T.
5
+ validator: SchemaValidator<T>; // Validates the extracted object and produces the final T.
6
+ };
7
+ type FieldDefinition<T> = { // Describes how a single field is pulled out of the HTML.
8
+ selector: string; // Selector handed to cheerio to find matching elements.
9
+ attribute?: string; // Attribute to read from each match; the element's text is used when omitted.
10
+ transform?: (value: string) => T; // Converts an extracted string. NOTE(review): with `multiple` it runs per element, so the result is an array of T; confirm the typing.
11
+ defaultValue?: T; // Used when no value was extracted for this field.
12
+ multiple?: boolean; // When truthy, all matches are collected into an array instead of only the first.
13
+ };
14
+ type SchemaFieldDefinitions<T> = { // Maps every key of T to a FieldDefinition for that key's type.
15
+ [K in keyof T]: FieldDefinition<T[K]>;
16
+ };
17
+ interface SchemaValidator<T> { // Contract a validator must satisfy to be used with createScraper.
18
+ validate(data: unknown): T; // Receives the raw extracted object and returns it typed as T.
19
+ }
20
+
21
+ declare const createScraper: <T>({ fields, validator, }: ScrapeConfig<T>) => ((html: string) => T); // Returns a function that scrapes an HTML string and returns the validator's T.
22
+
23
+ declare class ZodValidator<T> implements SchemaValidator<T> { // SchemaValidator backed by a Zod schema.
24
+ private schema;
25
+ constructor(schema: ZodSchema<T>);
26
+ validate(data: unknown): T; // Implemented in the bundled JS as schema.parse(data).
27
+ }
28
+
29
+ export { type FieldDefinition, type SchemaFieldDefinitions, type SchemaValidator, type ScrapeConfig, ZodValidator, createScraper };
package/dist/index.js ADDED
@@ -0,0 +1,48 @@
1
+ // src/createScraper.ts
2
+ import * as cheerio from "cheerio";
3
+ var createScraper = ({ // Builds a reusable scraper from per-field definitions and a validator.
4
+ fields,
5
+ validator
6
+ }) => {
7
+ return (html) => { // The returned function scrapes one HTML string and returns the validator's result.
8
+ const $ = cheerio.load(html); // Parse the document with cheerio.
9
+ const data = {}; // Raw extracted values, keyed by field name.
10
+ for (const key in fields) {
11
+ const fieldDef = fields[key];
12
+ const elements = $(fieldDef.selector); // Every element matching this field's selector.
13
+ let values = [];
14
+ elements.each((_, element) => {
15
+ const value = fieldDef.attribute ? $(element).attr(fieldDef.attribute) : $(element).text(); // Read the configured attribute, otherwise the element's text.
16
+ if (value !== void 0) { // Drop undefined results (e.g. the attribute is not present on the element).
17
+ values.push(value);
18
+ }
19
+ });
20
+ if (values.length === 0 && fieldDef.defaultValue !== void 0) { // Nothing extracted: fall back to the default, used as-is without transform.
21
+ data[key] = fieldDef.defaultValue;
22
+ } else if (fieldDef.multiple) { // multiple: keep every match, transforming each one when a transform is set.
23
+ data[key] = values.map(
24
+ (value) => fieldDef.transform ? fieldDef.transform(value) : value
25
+ );
26
+ } else { // Single value: only the first match is used.
27
+ const value = values[0];
28
+ data[key] = fieldDef.transform && value ? fieldDef.transform(value) : value; // NOTE(review): the truthiness check means an empty-string value bypasses transform; confirm this is intended.
29
+ }
30
+ }
31
+ return validator.validate(data); // No try/catch here, so anything validate throws reaches the caller.
32
+ };
33
+ };
34
+
35
+ // src/validators/zod.ts
36
+ import "zod";
37
+ var ZodValidator = class { // Validator that delegates to the schema passed to the constructor.
38
+ constructor(schema) {
39
+ this.schema = schema;
40
+ }
41
+ validate(data) {
42
+ return this.schema.parse(data); // Returns whatever schema.parse returns; errors from parse are not caught here.
43
+ }
44
+ };
45
+ export {
46
+ ZodValidator,
47
+ createScraper
48
+ };
package/package.json ADDED
@@ -0,0 +1,63 @@
1
+ {
2
+ "name": "xscrape",
3
+ "version": "1.0.0",
4
+ "description": "A flexible and powerful library designed to extract and transform data from HTML documents using user-defined schemas",
5
+ "main": "dist/index.js",
6
+ "exports": {
7
+ ".": {
8
+ "import": "./dist/index.js",
9
+ "require": "./dist/index.cjs"
10
+ }
11
+ },
12
+ "files": [
13
+ "dist"
14
+ ],
15
+ "type": "module",
16
+ "scripts": {
17
+ "build": "tsup",
18
+ "ci": "npm run build && npm run check-format && npm run check-exports && npm run lint && npm run test",
19
+ "lint": "tsc",
20
+ "test": "vitest run",
21
+ "format": "prettier --write .",
22
+ "check-format": "prettier --check .",
23
+ "check-exports": "attw --pack .",
24
+ "local-release": "npm run ci && changeset version && changeset publish"
25
+ },
26
+ "keywords": [
27
+ "web-scraping",
28
+ "data-extraction",
29
+ "automation",
30
+ "html-parsing",
31
+ "data-transformation",
32
+ "user-defined-schemas",
33
+ "crawler",
34
+ "scraper",
35
+ "zod",
36
+ "yup",
37
+ "joi",
38
+ "effect-schema"
39
+ ],
40
+ "author": "Johnie Hjelm <johnie@hjelm.im>",
41
+ "license": "MIT",
42
+ "repository": {
43
+ "type": "git",
44
+ "url": "git+https://github.com/johnie/xscrape.git"
45
+ },
46
+ "bugs": {
47
+ "url": "https://github.com/johnie/xscrape/issues"
48
+ },
49
+ "homepage": "https://github.com/johnie/xscrape#readme",
50
+ "devDependencies": {
51
+ "@arethetypeswrong/cli": "^0.16.4",
52
+ "@changesets/cli": "^2.27.9",
53
+ "prettier": "^3.3.3",
54
+ "tsup": "^8.3.5",
55
+ "typescript": "^5.6.3",
56
+ "vite": "^5.4.10",
57
+ "vitest": "^2.1.3"
58
+ },
59
+ "dependencies": {
60
+ "cheerio": "^1.0.0",
61
+ "zod": "^3.23.8"
62
+ }
63
+ }