xscrape 3.0.2 → 3.0.3

This diff compares the publicly released contents of these two package versions as they appear in their public registry, and is provided for informational purposes only.
package/README.md CHANGED
@@ -11,47 +11,78 @@
  <p align="center">
  <a href="https://opensource.org/licenses/MIT" rel="nofollow"><img src="https://img.shields.io/github/license/johnie/xscrape" alt="License"></a>
  <a href="https://www.npmjs.com/package/xscrape" rel="nofollow"><img src="https://img.shields.io/npm/v/xscrape.svg" alt="npm"></a>
+ <a href="https://github.com/johnie/xscrape/actions"><img src="https://github.com/johnie/xscrape/actions/workflows/ci.yml/badge.svg" alt="Build Status"></a>
  <a href="https://github.com/johnie/xscrape" rel="nofollow"><img src="https://img.shields.io/github/stars/johnie/xscrape" alt="stars"></a>
  </p>

  <br/>
  <br/>

- ## Features
+ ## Overview
+
+ xscrape is a powerful HTML scraping library that combines the flexibility of query selectors with the safety of schema validation. It works with any validation library that implements the [Standard Schema](https://standardschema.dev) specification, including Zod, Valibot, ArkType, and Effect Schema.

- * **HTML Parsing**: Extract data from HTML using CSS selectors with the help of [cheerio](https://github.com/cheeriojs/cheerio).
- * **Flexible Schema Validation**: Validate and transform extracted data with any validation library that implements the [Standard Schema](https://standardschema.dev), such as Zod, Valibot, ArkType, and Effect Schema.
- * **Custom Transformations**: Provide custom transformations for extracted attributes.
- * **Default Values**: Define default values for missing data fields through your chosen schema library's features.
- * **Nested Field Support**: Define and extract nested data structures from HTML elements.
+ ## Features

- -----
+ - **HTML Parsing**: Extract data from HTML using query selectors powered by [cheerio](https://github.com/cheeriojs/cheerio)
+ - **Universal Schema Support**: Works with any [Standard Schema](https://standardschema.dev) compatible library
+ - **Type Safety**: Full TypeScript support with inferred types from your schemas
+ - **Flexible Extraction**: Support for nested objects, arrays, and custom transformation functions
+ - **Error Handling**: Comprehensive error handling with detailed validation feedback
+ - **Custom Transformations**: Apply post-processing transformations to validated data
+ - **Default Values**: Handle missing data gracefully through schema defaults

  ## Installation

- To install this library, use your preferred package manager:
+ Install xscrape with your preferred package manager:

  ```bash
+ npm install xscrape
+ # or
  pnpm add xscrape
  # or
- npm install xscrape
+ bun add xscrape
  ```

- You will also need to install your chosen schema validation library, for example, Zod:
+ ## Quick Start

- ```bash
- pnpm add zod
- # or
- npm install zod
+ ```typescript
+ import { defineScraper } from 'xscrape';
+ import { z } from 'zod';
+
+ // Define your schema
+ const schema = z.object({
+   title: z.string(),
+   description: z.string(),
+   keywords: z.array(z.string()),
+   views: z.coerce.number(),
+ });
+
+ // Create a scraper
+ const scraper = defineScraper({
+   schema,
+   extract: {
+     title: { selector: 'title' },
+     description: { selector: 'meta[name="description"]', value: 'content' },
+     keywords: {
+       selector: 'meta[name="keywords"]',
+       value: (el) => el.attribs['content']?.split(',') || [],
+     },
+     views: { selector: 'meta[name="views"]', value: 'content' },
+   },
+ });
+
+ // Use the scraper
+ const { data, error } = await scraper(htmlString);
  ```

- -----
+ ## Usage Examples

- ## Usage
+ ### Basic Extraction

- Below is an example of how to use `xscrape` with a Zod schema to extract and transform data from an HTML document.
+ Extract basic metadata from an HTML page:

- ```ts
+ ```typescript
  import { defineScraper } from 'xscrape';
  import { z } from 'zod';

@@ -59,27 +90,12 @@ const scraper = defineScraper({
    schema: z.object({
      title: z.string(),
      description: z.string(),
-     keywords: z.array(z.string()),
-     views: z.coerce.number(),
+     author: z.string(),
    }),
    extract: {
-     title: {
-       selector: 'title',
-     },
-     description: {
-       selector: 'meta[name="description"]',
-       value: 'content',
-     },
-     keywords: {
-       selector: 'meta[name="keywords"]',
-       value(el) {
-         return el.attribs['content']?.split(',');
-       },
-     },
-     views: {
-       selector: 'meta[name="views"]',
-       value: 'content',
-     },
+     title: { selector: 'title' },
+     description: { selector: 'meta[name="description"]', value: 'content' },
+     author: { selector: 'meta[name="author"]', value: 'content' },
    },
  });

@@ -87,115 +103,320 @@ const html = `
  <!DOCTYPE html>
  <html>
  <head>
-   <meta name="description" content="An example description.">
-   <meta name="keywords" content="typescript,html,parsing">
-   <meta name="views" content="1234">
-   <title>Example Title</title>
+   <title>My Blog Post</title>
+   <meta name="description" content="An interesting blog post">
+   <meta name="author" content="John Doe">
  </head>
- <body></body>
+ <body>...</body>
  </html>
  `;

  const { data, error } = await scraper(html);
- console.log(data);
-
- // Outputs:
- // {
- //   title: 'Example Title',
- //   description: 'An example description.',
- //   keywords: ['typescript', 'html', 'parsing'],
- //   views: 1234
- // }
+ // data: { title: "My Blog Post", description: "An interesting blog post", author: "John Doe" }
  ```

  ### Handling Missing Data

- You can handle missing data by using the features of your chosen schema library, such as default values in Zod.
-
- ```ts
- import { defineScraper } from 'xscrape';
- import { z } from 'zod';
+ Use schema defaults to handle missing data gracefully:

+ ```typescript
  const scraper = defineScraper({
    schema: z.object({
-     title: z.string().default('No title'),
-     description: z.string().default('No description'),
+     title: z.string().default('Untitled'),
+     description: z.string().default('No description available'),
+     publishedAt: z.string().optional(),
      views: z.coerce.number().default(0),
    }),
    extract: {
-     title: {
-       selector: 'title',
-     },
-     description: {
-       selector: 'meta[name="description"]',
-       value: 'content',
-     },
-     views: {
-       selector: 'meta[name="views"]',
-       value: 'content',
-     },
+     title: { selector: 'title' },
+     description: { selector: 'meta[name="description"]', value: 'content' },
+     publishedAt: { selector: 'meta[name="published"]', value: 'content' },
+     views: { selector: 'meta[name="views"]', value: 'content' },
    },
  });
+
+ // Even with incomplete HTML, you get sensible defaults
+ const { data } = await scraper('<html><head><title>Test</title></head></html>');
+ // data: { title: "Test", description: "No description available", views: 0 }
  ```

- ### Nested Fields
+ ### Extracting Arrays

- `xscrape` also supports extracting nested data structures.
+ Extract multiple elements as arrays:

- ```ts
- import { defineScraper } from 'xscrape';
- import { z } from 'zod';
+ ```typescript
+ const scraper = defineScraper({
+   schema: z.object({
+     links: z.array(z.string()),
+     headings: z.array(z.string()),
+   }),
+   extract: {
+     links: [{ selector: 'a', value: 'href' }],
+     headings: [{ selector: 'h1, h2, h3' }],
+   },
+ });
+
+ const html = `
+ <html>
+   <body>
+     <h1>Main Title</h1>
+     <h2>Subtitle</h2>
+     <a href="/page1">Link 1</a>
+     <a href="/page2">Link 2</a>
+   </body>
+ </html>
+ `;

+ const { data } = await scraper(html);
+ // data: {
+ //   links: ["/page1", "/page2"],
+ //   headings: ["Main Title", "Subtitle"]
+ // }
+ ```
+
+ ### Nested Objects
+
+ Extract complex nested data structures:
+
+ ```typescript
  const scraper = defineScraper({
    schema: z.object({
      title: z.string(),
-     image: z.object({
-       url: z.string().url(),
+     socialMedia: z.object({
+       image: z.string().url(),
        width: z.coerce.number(),
        height: z.coerce.number(),
-     }).default({ url: '', width: 0, height: 0 }).optional(),
+       type: z.string(),
+     }),
    }),
    extract: {
-     title: {
-       selector: 'title',
-     },
-     image: {
+     title: { selector: 'title' },
+     socialMedia: {
        selector: 'head',
        value: {
-         url: {
-           selector: 'meta[property="og:image"]',
-           value: 'content',
-         },
-         width: {
-           selector: 'meta[property="og:image:width"]',
-           value: 'content',
-         },
-         height: {
-           selector: 'meta[property="og:image:height"]',
-           value: 'content',
-         },
+         image: { selector: 'meta[property="og:image"]', value: 'content' },
+         width: { selector: 'meta[property="og:image:width"]', value: 'content' },
+         height: { selector: 'meta[property="og:image:height"]', value: 'content' },
+         type: { selector: 'meta[property="og:type"]', value: 'content' },
+       },
+     },
+   },
+ });
+ ```
+
+ ### Custom Value Transformations
+
+ Apply custom logic to extracted values:
+
+ ```typescript
+ const scraper = defineScraper({
+   schema: z.object({
+     tags: z.array(z.string()),
+     publishedDate: z.date(),
+     readingTime: z.number(),
+   }),
+   extract: {
+     tags: {
+       selector: 'meta[name="keywords"]',
+       value: (el) => el.attribs['content']?.split(',').map(tag => tag.trim()) || [],
+     },
+     publishedDate: {
+       selector: 'meta[name="published"]',
+       value: (el) => new Date(el.attribs['content']),
+     },
+     readingTime: {
+       selector: 'article',
+       value: (el) => {
+         const text = el.text();
+         const wordsPerMinute = 200;
+         const wordCount = text.split(/\s+/).length;
+         return Math.ceil(wordCount / wordsPerMinute);
        },
      },
    },
  });
  ```

- -----
+ ### Post-Processing with Transform
+
+ Apply transformations to the validated data:
+
+ ```typescript
+ const scraper = defineScraper({
+   schema: z.object({
+     title: z.string(),
+     description: z.string(),
+     tags: z.array(z.string()),
+   }),
+   extract: {
+     title: { selector: 'title' },
+     description: { selector: 'meta[name="description"]', value: 'content' },
+     tags: {
+       selector: 'meta[name="keywords"]',
+       value: (el) => el.attribs['content']?.split(',') || [],
+     },
+   },
+   transform: (data) => ({
+     ...data,
+     slug: data.title.toLowerCase().replace(/\s+/g, '-'),
+     tagCount: data.tags.length,
+     summary: data.description.substring(0, 100) + '...',
+   }),
+ });
+ ```
+
+ ## Schema Library Examples
+
+ ### Zod

- ## Configuration
+ ```typescript
+ import { z } from 'zod';

- The `defineScraper` function accepts a configuration object with the following properties:
+ const schema = z.object({
+   title: z.string(),
+   price: z.coerce.number(),
+   inStock: z.boolean().default(false),
+ });
+ ```

- * **`schema`**: A schema object from any library that implements the [Standard Schema](https://standardschema.dev) interface. This schema defines the shape and validation rules for the extracted data.
- * **`extract`**: An object that determines how fields are extracted from the HTML using CSS selectors.
- * **`transform`** (optional): A function to apply custom transformations to the validated data.
+ ### Valibot
+
+ ```typescript
+ import * as v from 'valibot';
+
+ const schema = v.object({
+   title: v.string(),
+   price: v.pipe(v.string(), v.transform(Number)),
+   inStock: v.optional(v.boolean(), false),
+ });
+ ```

- -----
+ ### ArkType
+
+ ```typescript
+ import { type } from 'arktype';
+
+ const schema = type({
+   title: 'string',
+   price: 'number',
+   inStock: 'boolean = false',
+ });
+ ```
+
+ ### Effect Schema
+
+ ```typescript
+ import { Schema } from 'effect';
+
+ const schema = Schema.Struct({
+   title: Schema.String,
+   price: Schema.NumberFromString,
+   inStock: Schema.optionalWith(Schema.Boolean, { default: () => false }),
+ });
+ ```
+
+ ## API Reference
+
+ ### `defineScraper(config)`
+
+ Creates a scraper function with the specified configuration.
+
+ #### Parameters
+
+ - `config.schema`: A Standard Schema compatible schema object
+ - `config.extract`: Extraction configuration object
+ - `config.transform?`: Optional post-processing function
+
+ #### Returns
+
+ A scraper function that takes an HTML string and returns `Promise<{ data?: T, error?: unknown }>`.
+
+ ### Extraction Configuration
+
+ The `extract` object defines how to extract data from HTML:
+
+ ```typescript
+ type ExtractConfig = {
+   [key: string]: ExtractDescriptor | [ExtractDescriptor];
+ };
+
+ type ExtractDescriptor = {
+   selector: string;
+   value?: string | ((el: Element) => any) | ExtractConfig;
+ };
+ ```
+
+ #### Properties
+
+ - `selector`: CSS selector to find elements
+ - `value`: How to extract the value:
+   - `string`: Attribute name (e.g., `'href'`, `'content'`)
+   - `function`: Custom extraction function
+   - `object`: Nested extraction configuration
+   - `undefined`: Extract text content
+
+ #### Array Extraction
+
+ Wrap the descriptor in an array to extract multiple elements:
+
+ ```typescript
+ {
+   links: [{ selector: 'a', value: 'href' }]
+ }
+ ```
+
+ ## Error Handling
+
+ xscrape provides comprehensive error handling:
+
+ ```typescript
+ const { data, error } = await scraper(html);
+
+ if (error) {
+   // Handle validation errors, extraction errors, or transform errors
+   console.error('Scraping failed:', error);
+ } else {
+   // Use the validated data
+   console.log('Extracted data:', data);
+ }
+ ```
+
+ ## Best Practices
+
+ 1. **Use Specific Selectors**: Be as specific as possible with CSS selectors to avoid unexpected matches
+ 2. **Handle Missing Data**: Use schema defaults or optional fields for data that might not be present
+ 3. **Validate URLs**: Use URL validation in your schema for href attributes
+ 4. **Transform Data Early**: Use custom value functions rather than post-processing when possible
+ 5. **Type Safety**: Let TypeScript infer types from your schema for better developer experience
+
+ ## Common Use Cases
+
+ - **Web Scraping**: Extract structured data from websites
+ - **Meta Tag Extraction**: Get social media and SEO metadata
+ - **Content Migration**: Transform HTML content to structured data
+ - **Testing**: Validate HTML structure in tests
+ - **RSS/Feed Processing**: Extract article data from HTML feeds
+
+ ## Performance Considerations
+
+ - xscrape uses cheerio for fast HTML parsing
+ - Schema validation is performed once after extraction
+ - Consider using streaming for large HTML documents
+ - Cache scrapers when processing many similar documents

  ## Contributing

- Contributions are welcome! Please see the [Contributing Guide](https://github.com/johnie/xscrape/blob/main/CONTRIBUTING.md) for more information.
+ We welcome contributions! Please see our [Contributing Guide](https://github.com/johnie/xscrape/blob/main/CONTRIBUTING.md) for details.

  ## License

- This project is licensed under the MIT License. See the [LICENSE](https://github.com/johnie/xscrape/blob/main/LICENSE) file for details.
+ MIT License. See the [LICENSE](https://github.com/johnie/xscrape/blob/main/LICENSE) file for details.
+
+ ## Related Projects
+
+ - [cheerio](https://github.com/cheeriojs/cheerio) - jQuery-like server-side HTML parsing
+ - [Standard Schema](https://standardschema.dev) - Universal schema specification
+ - [Zod](https://zod.dev) - TypeScript-first schema validation
+ - [Valibot](https://valibot.dev) - Modular and type-safe schema library
+ - [Effect](https://effect.website) - Maximum Type-safety (incl. error handling)
+ - [ArkType](https://arktype.io) - TypeScript's 1:1 validator, optimized from editor to runtime
package/dist/index.cjs CHANGED
@@ -55,14 +55,10 @@ function defineScraper(config) {
  };
  }
  if (config.transform) {
-   try {
-     const transformed = await Promise.resolve(
-       config.transform(validationResult.value)
-     );
-     return { data: transformed };
-   } catch (error) {
-     return { error };
-   }
+   const transformed = await Promise.resolve(
+     config.transform(validationResult.value)
+   );
+   return { data: transformed };
  }
  return { data: validationResult.value };
  } catch (error) {
package/dist/index.js CHANGED
@@ -19,14 +19,10 @@ function defineScraper(config) {
  };
  }
  if (config.transform) {
-   try {
-     const transformed = await Promise.resolve(
-       config.transform(validationResult.value)
-     );
-     return { data: transformed };
-   } catch (error) {
-     return { error };
-   }
+   const transformed = await Promise.resolve(
+     config.transform(validationResult.value)
+   );
+   return { data: transformed };
  }
  return { data: validationResult.value };
  } catch (error) {
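
Editor's note on the two `dist` hunks above: both builds drop the inner `try`/`catch` around `config.transform`, and the tail of each hunk shows the surrounding `catch (error)` still in place. Assuming that outer handler returns `{ error }` as it did in 3.0.2, a throwing (or rejecting) transform still surfaces on the error channel, so the removed block was redundant. A minimal sketch of the resulting control flow; the function shape and the `validateWithSchema` helper are illustrative stand-ins for code outside the hunks, not the package source:

```typescript
// Editorial sketch of the simplified flow inside the function defineScraper returns.
// `validateWithSchema` is a hypothetical stand-in for the validation step outside the hunk.
async function scrape<T>(
  html: string,
  validateWithSchema: (html: string) => { value: T },
  transform?: (data: T) => T | Promise<T>,
): Promise<{ data?: T; error?: unknown }> {
  try {
    const validationResult = validateWithSchema(html);
    if (transform) {
      // No inner try/catch: if transform throws or rejects, control falls
      // through to the outer catch below, which still returns { error }.
      const transformed = await Promise.resolve(transform(validationResult.value));
      return { data: transformed };
    }
    return { data: validationResult.value };
  } catch (error) {
    return { error };
  }
}
```

Callers should be unaffected either way: the README's `const { data, error } = await scraper(html)` pattern keeps working.
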
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "xscrape",
-   "version": "3.0.2",
+   "version": "3.0.3",
    "description": "A flexible and powerful library designed to extract and transform data from HTML documents using user-defined schemas",
    "main": "dist/index.js",
    "exports": {
@@ -41,35 +41,39 @@
    "homepage": "https://github.com/johnie/xscrape#readme",
    "devDependencies": {
      "@arethetypeswrong/cli": "^0.18.2",
-     "@biomejs/biome": "2.1.1",
+     "@biomejs/biome": "2.1.2",
      "@changesets/changelog-github": "^0.5.1",
      "@changesets/cli": "^2.29.5",
      "arktype": "^2.1.20",
-     "effect": "^3.16.12",
+     "effect": "^3.17.0",
      "jsdom": "^26.1.0",
      "lefthook": "^1.12.2",
      "tsup": "^8.5.0",
      "typescript": "^5.8.3",
      "valibot": "^1.1.0",
-     "vite": "^7.0.4",
+     "vite": "^7.0.5",
+     "vitepress": "^1.6.3",
      "vitest": "^3.2.4",
-     "zod": "^4.0.2"
+     "zod": "^4.0.5"
    },
    "dependencies": {
      "@standard-schema/spec": "^1.0.0",
-     "cheerio": "^1.1.0",
+     "cheerio": "^1.1.2",
      "domhandler": "^5.0.3"
    },
    "scripts": {
      "build": "tsup",
-     "ci": "npm run build && npm run check-format && npm run check-exports && npm run lint && npm run test",
-     "lint": "tsc",
+     "ci": "pnpm run build && pnpm run lint && pnpm run typecheck && pnpm run check-exports && pnpm run test",
+     "typecheck": "tsc",
      "test": "vitest run",
      "test:watch": "vitest",
      "format": "biome format --write ./src",
-     "check-format": "biome check ./src",
+     "lint": "biome check ./src",
      "check-exports": "attw --pack .",
-     "local-release": "npm run ci && changeset version && changeset publish",
-     "release": "npm run ci && changeset publish"
+     "local-release": "pnpm run ci && changeset version && changeset publish",
+     "release": "pnpm run ci && changeset publish",
+     "docs:dev": "vitepress dev docs",
+     "docs:build": "vitepress build docs",
+     "docs:preview": "vitepress preview docs"
    }
  }