xscrape 3.0.4 → 3.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -66,7 +66,7 @@ const scraper = defineScraper({
66
66
  description: { selector: 'meta[name="description"]', value: 'content' },
67
67
  keywords: {
68
68
  selector: 'meta[name="keywords"]',
69
- value: (el) => el.attribs['content']?.split(',') || [],
69
+ value: (node) => node.attr('content')?.split(',') || [],
70
70
  },
71
71
  views: { selector: 'meta[name="views"]', value: 'content' },
72
72
  },
@@ -218,16 +218,16 @@ const scraper = defineScraper({
218
218
  extract: {
219
219
  tags: {
220
220
  selector: 'meta[name="keywords"]',
221
- value: (el) => el.attribs['content']?.split(',').map(tag => tag.trim()) || [],
221
+ value: (node) => node.attr('content')?.split(',').map(tag => tag.trim()) || [],
222
222
  },
223
223
  publishedDate: {
224
224
  selector: 'meta[name="published"]',
225
- value: (el) => new Date(el.attribs['content']),
225
+ value: (node) => new Date(node.attr('content') ?? ''),
226
226
  },
227
227
  readingTime: {
228
228
  selector: 'article',
229
- value: (el) => {
230
- const text = el.text();
229
+ value: (node) => {
230
+ const text = node.text();
231
231
  const wordsPerMinute = 200;
232
232
  const wordCount = text.split(/\s+/).length;
233
233
  return Math.ceil(wordCount / wordsPerMinute);
@@ -253,7 +253,7 @@ const scraper = defineScraper({
253
253
  description: { selector: 'meta[name="description"]', value: 'content' },
254
254
  tags: {
255
255
  selector: 'meta[name="keywords"]',
256
- value: (el) => el.attribs['content']?.split(',') || [],
256
+ value: (node) => node.attr('content')?.split(',') || [],
257
257
  },
258
258
  },
259
259
  transform: (data) => ({
@@ -336,14 +336,18 @@ A scraper function that takes HTML string and returns `Promise<{ data?: T, error
336
336
  The `extract` object defines how to extract data from HTML:
337
337
 
338
338
  ```typescript
339
- type ExtractConfig = {
340
- [key: string]: ExtractDescriptor | [ExtractDescriptor];
341
- };
339
+ interface ExtractNode {
340
+ attr(name: string): string | undefined;
341
+ text(): string;
342
+ html(): string | undefined;
343
+ }
342
344
 
343
345
  type ExtractDescriptor = {
344
346
  selector: string;
345
- value?: string | ((el: Element) => any) | ExtractConfig;
347
+ value?: string | ((node: ExtractNode) => unknown) | ExtractConfig;
346
348
  };
349
+
350
+ type ExtractConfig = Record<string, string | ExtractDescriptor | [string | ExtractDescriptor]>;
347
351
  ```
348
352
 
349
353
  #### Properties
@@ -351,7 +355,7 @@ type ExtractDescriptor = {
351
355
  - `selector`: CSS selector to find elements
352
356
  - `value`: How to extract the value:
353
357
  - `string`: Attribute name (e.g., `'href'`, `'content'`)
354
- - `function`: Custom extraction function
358
+ - `function`: Custom extraction function receiving an xscrape `ExtractNode`
355
359
  - `object`: Nested extraction configuration
356
360
  - `undefined`: Extract text content
357
361
 
package/dist/index.cjs CHANGED
@@ -1,72 +1,81 @@
1
- "use strict";
2
- var __create = Object.create;
3
- var __defProp = Object.defineProperty;
4
- var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
5
- var __getOwnPropNames = Object.getOwnPropertyNames;
6
- var __getProtoOf = Object.getPrototypeOf;
7
- var __hasOwnProp = Object.prototype.hasOwnProperty;
8
- var __export = (target, all) => {
9
- for (var name in all)
10
- __defProp(target, name, { get: all[name], enumerable: true });
11
- };
12
- var __copyProps = (to, from, except, desc) => {
13
- if (from && typeof from === "object" || typeof from === "function") {
14
- for (let key of __getOwnPropNames(from))
15
- if (!__hasOwnProp.call(to, key) && key !== except)
16
- __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
17
- }
18
- return to;
19
- };
20
- var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
21
- // If the importer is in node compatibility mode or this is not an ESM
22
- // file that has been converted to a CommonJS file using a Babel-
23
- // compatible transform (i.e. "__esModule" has not been set), then set
24
- // "default" to the CommonJS "module.exports" for node compatibility.
25
- isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
26
- mod
27
- ));
28
- var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
29
-
30
- // src/index.ts
31
- var index_exports = {};
32
- __export(index_exports, {
33
- defineScraper: () => defineScraper
34
- });
35
- module.exports = __toCommonJS(index_exports);
36
-
37
- // src/defineScraper.ts
38
- var cheerio = __toESM(require("cheerio"), 1);
1
+ Object.defineProperty(exports, Symbol.toStringTag, { value: "Module" });
2
+ let cheerio = require("cheerio");
3
+ //#region src/internal/runtime.ts
4
+ function createScraperRuntime(config) {
5
+ return async (html) => {
6
+ try {
7
+ const extractedData = extractHtml(html, config.extract);
8
+ const validation = await validateExtractedData(config.schema, extractedData);
9
+ if (!validation.ok) return { error: validation.error };
10
+ return { data: await transformValidatedData(validation.value, config.transform) };
11
+ } catch (error) {
12
+ return { error };
13
+ }
14
+ };
15
+ }
16
+ function extractHtml(html, extract) {
17
+ const $ = (0, cheerio.load)(html);
18
+ const plan = compileExtractConfig($, extract);
19
+ return $.extract(plan);
20
+ }
21
+ async function validateExtractedData(schema, extractedData) {
22
+ const validationResult = await Promise.resolve(schema["~standard"].validate(extractedData));
23
+ if (validationResult.issues) return {
24
+ ok: false,
25
+ error: validationResult.issues
26
+ };
27
+ if (!("value" in validationResult)) return {
28
+ ok: false,
29
+ error: /* @__PURE__ */ new Error("xscrape: Validation succeeded but no data was returned")
30
+ };
31
+ return {
32
+ ok: true,
33
+ value: validationResult.value
34
+ };
35
+ }
36
+ function transformValidatedData(value, transform) {
37
+ return Promise.resolve(transform ? transform(value) : value);
38
+ }
39
+ function compileExtractConfig($, extract) {
40
+ return Object.fromEntries(Object.entries(extract).map(([key, value]) => [key, compileExtractField($, value)]));
41
+ }
42
+ function compileExtractField($, field) {
43
+ if (Array.isArray(field)) {
44
+ const item = field[0];
45
+ return [typeof item === "string" ? item : compileExtractDescriptor($, item)];
46
+ }
47
+ if (typeof field === "string") return field;
48
+ return compileExtractDescriptor($, field);
49
+ }
50
+ function compileExtractDescriptor($, descriptor) {
51
+ const value = compileDescriptorValue($, descriptor.value);
52
+ return value === void 0 ? { selector: descriptor.selector } : {
53
+ selector: descriptor.selector,
54
+ value
55
+ };
56
+ }
57
+ function compileDescriptorValue($, value) {
58
+ if (value === void 0 || typeof value === "string") return value;
59
+ if (typeof value === "function") return (element, key, obj) => value(createExtractNode($, element), key, obj);
60
+ return compileExtractConfig($, value);
61
+ }
62
+ function createExtractNode($, element) {
63
+ return {
64
+ attr(name) {
65
+ return element.attribs[name] ?? void 0;
66
+ },
67
+ text() {
68
+ return $(element).text();
69
+ },
70
+ html() {
71
+ return $(element).html() ?? void 0;
72
+ }
73
+ };
74
+ }
75
+ //#endregion
76
+ //#region src/index.ts
39
77
  function defineScraper(config) {
40
- return async (html) => {
41
- try {
42
- const $ = cheerio.load(html);
43
- const extractedData = $.extract(config.extract);
44
- const validationResult = await Promise.resolve(
45
- config.schema["~standard"].validate(extractedData)
46
- );
47
- if (validationResult.issues) {
48
- return { error: validationResult.issues };
49
- }
50
- if (!("value" in validationResult)) {
51
- return {
52
- error: new Error(
53
- "xscrape: Validation succeeded but no data was returned"
54
- )
55
- };
56
- }
57
- if (config.transform) {
58
- const transformed = await Promise.resolve(
59
- config.transform(validationResult.value)
60
- );
61
- return { data: transformed };
62
- }
63
- return { data: validationResult.value };
64
- } catch (error) {
65
- return { error };
66
- }
67
- };
78
+ return createScraperRuntime(config);
68
79
  }
69
- // Annotate the CommonJS export names for ESM import in node:
70
- 0 && (module.exports = {
71
- defineScraper
72
- });
80
+ //#endregion
81
+ exports.defineScraper = defineScraper;
package/dist/index.d.cts CHANGED
@@ -1,32 +1,36 @@
1
- import { StandardSchemaV1 } from '@standard-schema/spec';
2
- import { Element } from 'domhandler';
1
+ import { StandardSchemaV1 } from "@standard-schema/spec";
3
2
 
4
- type ExtractDescriptorFn = (el: Element, key: string, obj: Record<string, unknown>) => unknown;
5
- interface ExtractDescriptor {
6
- selector: string;
7
- value?: string | ExtractDescriptorFn | ExtractMap;
3
+ //#region src/types/extract.d.ts
4
+ type ExtractObjectShape<T> = NonNullable<T> extends readonly unknown[] ? never : NonNullable<T> extends object ? NonNullable<T> : never;
5
+ interface ExtractNode {
6
+ attr(name: string): string | undefined;
7
+ html(): string | undefined;
8
+ text(): string;
9
+ }
10
+ type ExtractValueCallback<T = unknown> = (node: ExtractNode, key: string, object: Record<string, unknown>) => T | undefined;
11
+ interface ExtractDescriptor<T = unknown> {
12
+ selector: string;
13
+ value?: string | ExtractValueCallback<T> | ExtractConfig<ExtractObjectShape<T>>;
14
+ }
15
+ type ExtractField<T = unknown> = string | ExtractDescriptor<T> | (T extends readonly (infer Item)[] ? [string | ExtractDescriptor<NonNullable<Item>>] : never);
16
+ type ExtractShape<T> = [T] extends [never] ? Record<string, unknown> : NonNullable<T> extends object ? NonNullable<T> : Record<string, unknown>;
17
+ type ExtractConfig<T = Record<string, unknown>> = { [K in keyof ExtractShape<T>]: ExtractField<ExtractShape<T>[K]> };
18
+ //#endregion
19
+ //#region src/types/main.d.ts
20
+ interface ScraperConfig<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>> {
21
+ extract: ExtractConfig<StandardSchemaV1.InferOutput<S>>;
22
+ schema: S;
23
+ transform?: (data: StandardSchemaV1.InferOutput<S>) => Promise<R> | R;
8
24
  }
9
- type ExtractValue = string | ExtractDescriptor | [string | ExtractDescriptor];
10
- type ExtractMap = Record<string, ExtractValue>;
11
-
12
- type SchemaAwareExtractMap<T> = {
13
- [K in keyof T]: ExtractMap[string];
14
- };
15
- type ScraperConfig<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>> = {
16
- schema: S;
17
- extract: SchemaAwareExtractMap<StandardSchemaV1.InferOutput<S>>;
18
- transform?: (data: StandardSchemaV1.InferOutput<S>) => Promise<R> | R;
19
- };
20
- type ValidationResult<T> = {
21
- success: boolean;
22
- data?: T;
23
- error?: unknown;
24
- };
25
25
  type ScraperResult<T> = {
26
- data?: T;
27
- error?: unknown;
26
+ data: T;
27
+ error?: undefined;
28
+ } | {
29
+ data?: undefined;
30
+ error: unknown;
28
31
  };
29
-
30
- declare function defineScraper<S extends StandardSchemaV1, T extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>, R extends T = T>(config: ScraperConfig<S, R>): (html: string) => Promise<ScraperResult<R>>;
31
-
32
- export { type ScraperConfig, type ScraperResult, type ValidationResult, defineScraper };
32
+ //#endregion
33
+ //#region src/index.d.ts
34
+ declare function defineScraper<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>>(config: ScraperConfig<S, R>): (html: string) => Promise<ScraperResult<R>>;
35
+ //#endregion
36
+ export { type ExtractConfig, type ExtractDescriptor, type ExtractField, type ExtractNode, type ExtractValueCallback, type ScraperConfig, type ScraperResult, defineScraper };
@@ -0,0 +1,36 @@
1
+ import { StandardSchemaV1 } from "@standard-schema/spec";
2
+
3
+ //#region src/types/extract.d.ts
4
+ type ExtractObjectShape<T> = NonNullable<T> extends readonly unknown[] ? never : NonNullable<T> extends object ? NonNullable<T> : never;
5
+ interface ExtractNode {
6
+ attr(name: string): string | undefined;
7
+ html(): string | undefined;
8
+ text(): string;
9
+ }
10
+ type ExtractValueCallback<T = unknown> = (node: ExtractNode, key: string, object: Record<string, unknown>) => T | undefined;
11
+ interface ExtractDescriptor<T = unknown> {
12
+ selector: string;
13
+ value?: string | ExtractValueCallback<T> | ExtractConfig<ExtractObjectShape<T>>;
14
+ }
15
+ type ExtractField<T = unknown> = string | ExtractDescriptor<T> | (T extends readonly (infer Item)[] ? [string | ExtractDescriptor<NonNullable<Item>>] : never);
16
+ type ExtractShape<T> = [T] extends [never] ? Record<string, unknown> : NonNullable<T> extends object ? NonNullable<T> : Record<string, unknown>;
17
+ type ExtractConfig<T = Record<string, unknown>> = { [K in keyof ExtractShape<T>]: ExtractField<ExtractShape<T>[K]> };
18
+ //#endregion
19
+ //#region src/types/main.d.ts
20
+ interface ScraperConfig<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>> {
21
+ extract: ExtractConfig<StandardSchemaV1.InferOutput<S>>;
22
+ schema: S;
23
+ transform?: (data: StandardSchemaV1.InferOutput<S>) => Promise<R> | R;
24
+ }
25
+ type ScraperResult<T> = {
26
+ data: T;
27
+ error?: undefined;
28
+ } | {
29
+ data?: undefined;
30
+ error: unknown;
31
+ };
32
+ //#endregion
33
+ //#region src/index.d.ts
34
+ declare function defineScraper<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>>(config: ScraperConfig<S, R>): (html: string) => Promise<ScraperResult<R>>;
35
+ //#endregion
36
+ export { type ExtractConfig, type ExtractDescriptor, type ExtractField, type ExtractNode, type ExtractValueCallback, type ScraperConfig, type ScraperResult, defineScraper };
package/dist/index.mjs ADDED
@@ -0,0 +1,80 @@
1
+ import { load } from "cheerio";
2
+ //#region src/internal/runtime.ts
3
+ function createScraperRuntime(config) {
4
+ return async (html) => {
5
+ try {
6
+ const extractedData = extractHtml(html, config.extract);
7
+ const validation = await validateExtractedData(config.schema, extractedData);
8
+ if (!validation.ok) return { error: validation.error };
9
+ return { data: await transformValidatedData(validation.value, config.transform) };
10
+ } catch (error) {
11
+ return { error };
12
+ }
13
+ };
14
+ }
15
+ function extractHtml(html, extract) {
16
+ const $ = load(html);
17
+ const plan = compileExtractConfig($, extract);
18
+ return $.extract(plan);
19
+ }
20
+ async function validateExtractedData(schema, extractedData) {
21
+ const validationResult = await Promise.resolve(schema["~standard"].validate(extractedData));
22
+ if (validationResult.issues) return {
23
+ ok: false,
24
+ error: validationResult.issues
25
+ };
26
+ if (!("value" in validationResult)) return {
27
+ ok: false,
28
+ error: /* @__PURE__ */ new Error("xscrape: Validation succeeded but no data was returned")
29
+ };
30
+ return {
31
+ ok: true,
32
+ value: validationResult.value
33
+ };
34
+ }
35
+ function transformValidatedData(value, transform) {
36
+ return Promise.resolve(transform ? transform(value) : value);
37
+ }
38
+ function compileExtractConfig($, extract) {
39
+ return Object.fromEntries(Object.entries(extract).map(([key, value]) => [key, compileExtractField($, value)]));
40
+ }
41
+ function compileExtractField($, field) {
42
+ if (Array.isArray(field)) {
43
+ const item = field[0];
44
+ return [typeof item === "string" ? item : compileExtractDescriptor($, item)];
45
+ }
46
+ if (typeof field === "string") return field;
47
+ return compileExtractDescriptor($, field);
48
+ }
49
+ function compileExtractDescriptor($, descriptor) {
50
+ const value = compileDescriptorValue($, descriptor.value);
51
+ return value === void 0 ? { selector: descriptor.selector } : {
52
+ selector: descriptor.selector,
53
+ value
54
+ };
55
+ }
56
+ function compileDescriptorValue($, value) {
57
+ if (value === void 0 || typeof value === "string") return value;
58
+ if (typeof value === "function") return (element, key, obj) => value(createExtractNode($, element), key, obj);
59
+ return compileExtractConfig($, value);
60
+ }
61
+ function createExtractNode($, element) {
62
+ return {
63
+ attr(name) {
64
+ return element.attribs[name] ?? void 0;
65
+ },
66
+ text() {
67
+ return $(element).text();
68
+ },
69
+ html() {
70
+ return $(element).html() ?? void 0;
71
+ }
72
+ };
73
+ }
74
+ //#endregion
75
+ //#region src/index.ts
76
+ function defineScraper(config) {
77
+ return createScraperRuntime(config);
78
+ }
79
+ //#endregion
80
+ export { defineScraper };
package/package.json CHANGED
@@ -1,19 +1,49 @@
1
1
  {
2
+ "dependencies": {
3
+ "@standard-schema/spec": "^1.1.0",
4
+ "cheerio": "^1.2.0",
5
+ "domhandler": "^6.0.1"
6
+ },
7
+ "devDependencies": {
8
+ "@arethetypeswrong/cli": "^0.18.2",
9
+ "@biomejs/biome": "2.4.14",
10
+ "@changesets/changelog-github": "^0.7.0",
11
+ "@changesets/cli": "^2.31.0",
12
+ "@types/node": "^25.6.0",
13
+ "arktype": "^2.2.0",
14
+ "effect": "^3.21.2",
15
+ "jsdom": "^29.1.1",
16
+ "lefthook": "^2.1.6",
17
+ "tsdown": "^0.22.0",
18
+ "typescript": "^6.0.3",
19
+ "ultracite": "7.6.3",
20
+ "valibot": "^1.4.0",
21
+ "vite": "^8.0.11",
22
+ "vitepress": "^1.6.4",
23
+ "vitest": "^4.1.5",
24
+ "zod": "^4.4.3"
25
+ },
2
26
  "name": "xscrape",
3
- "version": "3.0.4",
27
+ "type": "module",
28
+ "version": "3.2.1",
4
29
  "description": "A flexible and powerful library designed to extract and transform data from HTML documents using user-defined schemas",
5
- "main": "dist/index.js",
30
+ "main": "dist/index.cjs",
6
31
  "exports": {
7
32
  ".": {
8
- "import": "./dist/index.js",
9
- "require": "./dist/index.cjs"
33
+ "import": {
34
+ "types": "./dist/index.d.mts",
35
+ "default": "./dist/index.mjs"
36
+ },
37
+ "require": {
38
+ "types": "./dist/index.d.cts",
39
+ "default": "./dist/index.cjs"
40
+ }
10
41
  }
11
42
  },
12
- "typings": "dist/index.d.ts",
43
+ "typings": "dist/index.d.mts",
13
44
  "files": [
14
45
  "dist"
15
46
  ],
16
- "type": "module",
17
47
  "keywords": [
18
48
  "web-scraping",
19
49
  "data-extraction",
@@ -39,38 +69,17 @@
39
69
  "url": "https://github.com/johnie/xscrape/issues"
40
70
  },
41
71
  "homepage": "https://github.com/johnie/xscrape#readme",
42
- "devDependencies": {
43
- "@arethetypeswrong/cli": "^0.18.2",
44
- "@biomejs/biome": "2.1.2",
45
- "@changesets/changelog-github": "^0.5.1",
46
- "@changesets/cli": "^2.29.5",
47
- "arktype": "^2.1.20",
48
- "effect": "^3.17.0",
49
- "jsdom": "^26.1.0",
50
- "lefthook": "^1.12.2",
51
- "tsup": "^8.5.0",
52
- "typescript": "^5.8.3",
53
- "valibot": "^1.1.0",
54
- "vite": "^7.0.5",
55
- "vitepress": "^1.6.3",
56
- "vitest": "^3.2.4",
57
- "zod": "^4.0.5"
58
- },
59
- "dependencies": {
60
- "@standard-schema/spec": "^1.0.0",
61
- "cheerio": "^1.1.2",
62
- "domhandler": "^5.0.3"
63
- },
64
72
  "scripts": {
65
- "build": "tsup",
66
- "ci": "pnpm run build && pnpm run lint && pnpm run typecheck && pnpm run check-exports && pnpm run test",
73
+ "build": "tsdown",
74
+ "ci": "pnpm run build && pnpm run check && pnpm run typecheck && pnpm run check-exports && pnpm run test",
67
75
  "typecheck": "tsc",
68
76
  "test": "vitest run",
69
77
  "test:watch": "vitest",
70
- "format": "biome format --write ./src",
71
- "lint": "biome check ./src",
78
+ "check": "ultracite check",
79
+ "fix": "ultracite fix",
72
80
  "check-exports": "attw --pack .",
73
- "local-release": "pnpm run ci && changeset version && changeset publish",
81
+ "version": "changeset version && pnpm run fix",
82
+ "local-release": "pnpm run ci && pnpm run version && changeset publish",
74
83
  "release": "pnpm run ci && changeset publish",
75
84
  "docs:dev": "vitepress dev docs",
76
85
  "docs:build": "vitepress build docs",
package/dist/index.d.ts DELETED
@@ -1,32 +0,0 @@
1
- import { StandardSchemaV1 } from '@standard-schema/spec';
2
- import { Element } from 'domhandler';
3
-
4
- type ExtractDescriptorFn = (el: Element, key: string, obj: Record<string, unknown>) => unknown;
5
- interface ExtractDescriptor {
6
- selector: string;
7
- value?: string | ExtractDescriptorFn | ExtractMap;
8
- }
9
- type ExtractValue = string | ExtractDescriptor | [string | ExtractDescriptor];
10
- type ExtractMap = Record<string, ExtractValue>;
11
-
12
- type SchemaAwareExtractMap<T> = {
13
- [K in keyof T]: ExtractMap[string];
14
- };
15
- type ScraperConfig<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>> = {
16
- schema: S;
17
- extract: SchemaAwareExtractMap<StandardSchemaV1.InferOutput<S>>;
18
- transform?: (data: StandardSchemaV1.InferOutput<S>) => Promise<R> | R;
19
- };
20
- type ValidationResult<T> = {
21
- success: boolean;
22
- data?: T;
23
- error?: unknown;
24
- };
25
- type ScraperResult<T> = {
26
- data?: T;
27
- error?: unknown;
28
- };
29
-
30
- declare function defineScraper<S extends StandardSchemaV1, T extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>, R extends T = T>(config: ScraperConfig<S, R>): (html: string) => Promise<ScraperResult<R>>;
31
-
32
- export { type ScraperConfig, type ScraperResult, type ValidationResult, defineScraper };
package/dist/index.js DELETED
@@ -1,35 +0,0 @@
1
- // src/defineScraper.ts
2
- import * as cheerio from "cheerio";
3
- function defineScraper(config) {
4
- return async (html) => {
5
- try {
6
- const $ = cheerio.load(html);
7
- const extractedData = $.extract(config.extract);
8
- const validationResult = await Promise.resolve(
9
- config.schema["~standard"].validate(extractedData)
10
- );
11
- if (validationResult.issues) {
12
- return { error: validationResult.issues };
13
- }
14
- if (!("value" in validationResult)) {
15
- return {
16
- error: new Error(
17
- "xscrape: Validation succeeded but no data was returned"
18
- )
19
- };
20
- }
21
- if (config.transform) {
22
- const transformed = await Promise.resolve(
23
- config.transform(validationResult.value)
24
- );
25
- return { data: transformed };
26
- }
27
- return { data: validationResult.value };
28
- } catch (error) {
29
- return { error };
30
- }
31
- };
32
- }
33
- export {
34
- defineScraper
35
- };