xscrape 3.0.4 → 3.2.1
This diff shows the contents of two publicly released package versions as they appear in their public registry, and is provided for informational purposes only.
- package/README.md +15 -11
- package/dist/index.cjs +79 -70
- package/dist/index.d.cts +32 -28
- package/dist/index.d.mts +36 -0
- package/dist/index.mjs +80 -0
- package/package.json +42 -33
- package/dist/index.d.ts +0 -32
- package/dist/index.js +0 -35
package/README.md CHANGED

````diff
@@ -66,7 +66,7 @@ const scraper = defineScraper({
     description: { selector: 'meta[name="description"]', value: 'content' },
     keywords: {
       selector: 'meta[name="keywords"]',
-      value: (…
+      value: (node) => node.attr('content')?.split(',') || [],
     },
     views: { selector: 'meta[name="views"]', value: 'content' },
   },
@@ -218,16 +218,16 @@ const scraper = defineScraper({
   extract: {
     tags: {
       selector: 'meta[name="keywords"]',
-      value: (…
+      value: (node) => node.attr('content')?.split(',').map(tag => tag.trim()) || [],
     },
     publishedDate: {
       selector: 'meta[name="published"]',
-      value: (…
+      value: (node) => new Date(node.attr('content') ?? ''),
     },
     readingTime: {
       selector: 'article',
-      value: (…
-      const text = …
+      value: (node) => {
+        const text = node.text();
         const wordsPerMinute = 200;
         const wordCount = text.split(/\s+/).length;
         return Math.ceil(wordCount / wordsPerMinute);
@@ -253,7 +253,7 @@ const scraper = defineScraper({
     description: { selector: 'meta[name="description"]', value: 'content' },
     tags: {
       selector: 'meta[name="keywords"]',
-      value: (…
+      value: (node) => node.attr('content')?.split(',') || [],
     },
   },
   transform: (data) => ({
@@ -336,14 +336,18 @@ A scraper function that takes HTML string and returns `Promise<{ data?: T, error…
 The `extract` object defines how to extract data from HTML:
 
 ```typescript
-…
-…
-…
+interface ExtractNode {
+  attr(name: string): string | undefined;
+  text(): string;
+  html(): string | undefined;
+}
 
 type ExtractDescriptor = {
   selector: string;
-  value?: string | ((…
+  value?: string | ((node: ExtractNode) => unknown) | ExtractConfig;
 };
+
+type ExtractConfig = Record<string, string | ExtractDescriptor | [string | ExtractDescriptor]>;
 ```
 
 #### Properties
@@ -351,7 +355,7 @@ type ExtractDescriptor = {
 - `selector`: CSS selector to find elements
 - `value`: How to extract the value:
   - `string`: Attribute name (e.g., `'href'`, `'content'`)
-  - `function`: Custom extraction function
+  - `function`: Custom extraction function receiving an xscrape `ExtractNode`
   - `object`: Nested extraction configuration
   - `undefined`: Extract text content
````
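Taken together, the README changes document the new extraction contract: custom `value` functions now receive an xscrape `ExtractNode` wrapper (`attr`/`text`/`html`) rather than a raw DOM element. A minimal end-to-end sketch of the new shape (the Zod schema, selectors, and sample HTML are illustrative, not taken from the package docs):

```typescript
import { z } from "zod";
import { defineScraper } from "xscrape";

const scraper = defineScraper({
  schema: z.object({
    title: z.string().default(""),
    keywords: z.array(z.string()).default([]),
  }),
  extract: {
    // No `value` given: falls back to extracting text content.
    title: { selector: "title" },
    keywords: {
      selector: 'meta[name="keywords"]',
      // The callback sees an ExtractNode, not a raw element.
      value: (node) => node.attr("content")?.split(",") ?? [],
    },
  },
});

const { data, error } = await scraper(
  '<html><head><title>Hi</title><meta name="keywords" content="a,b"></head></html>',
);
```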
package/dist/index.cjs CHANGED

```diff
@@ -1,72 +1,81 @@
-… (old lines 1–38, the previous bundle preamble, are truncated in the source diff)
+Object.defineProperty(exports, Symbol.toStringTag, { value: "Module" });
+let cheerio = require("cheerio");
+//#region src/internal/runtime.ts
+function createScraperRuntime(config) {
+  return async (html) => {
+    try {
+      const extractedData = extractHtml(html, config.extract);
+      const validation = await validateExtractedData(config.schema, extractedData);
+      if (!validation.ok) return { error: validation.error };
+      return { data: await transformValidatedData(validation.value, config.transform) };
+    } catch (error) {
+      return { error };
+    }
+  };
+}
+function extractHtml(html, extract) {
+  const $ = (0, cheerio.load)(html);
+  const plan = compileExtractConfig($, extract);
+  return $.extract(plan);
+}
+async function validateExtractedData(schema, extractedData) {
+  const validationResult = await Promise.resolve(schema["~standard"].validate(extractedData));
+  if (validationResult.issues) return {
+    ok: false,
+    error: validationResult.issues
+  };
+  if (!("value" in validationResult)) return {
+    ok: false,
+    error: /* @__PURE__ */ new Error("xscrape: Validation succeeded but no data was returned")
+  };
+  return {
+    ok: true,
+    value: validationResult.value
+  };
+}
+function transformValidatedData(value, transform) {
+  return Promise.resolve(transform ? transform(value) : value);
+}
+function compileExtractConfig($, extract) {
+  return Object.fromEntries(Object.entries(extract).map(([key, value]) => [key, compileExtractField($, value)]));
+}
+function compileExtractField($, field) {
+  if (Array.isArray(field)) {
+    const item = field[0];
+    return [typeof item === "string" ? item : compileExtractDescriptor($, item)];
+  }
+  if (typeof field === "string") return field;
+  return compileExtractDescriptor($, field);
+}
+function compileExtractDescriptor($, descriptor) {
+  const value = compileDescriptorValue($, descriptor.value);
+  return value === void 0 ? { selector: descriptor.selector } : {
+    selector: descriptor.selector,
+    value
+  };
+}
+function compileDescriptorValue($, value) {
+  if (value === void 0 || typeof value === "string") return value;
+  if (typeof value === "function") return (element, key, obj) => value(createExtractNode($, element), key, obj);
+  return compileExtractConfig($, value);
+}
+function createExtractNode($, element) {
+  return {
+    attr(name) {
+      return element.attribs[name] ?? void 0;
+    },
+    text() {
+      return $(element).text();
+    },
+    html() {
+      return $(element).html() ?? void 0;
+    }
+  };
+}
+//#endregion
+//#region src/index.ts
 function defineScraper(config) {
-…
-    try {
-      const $ = cheerio.load(html);
-      const extractedData = $.extract(config.extract);
-      const validationResult = await Promise.resolve(
-        config.schema["~standard"].validate(extractedData)
-      );
-      if (validationResult.issues) {
-        return { error: validationResult.issues };
-      }
-      if (!("value" in validationResult)) {
-        return {
-          error: new Error(
-            "xscrape: Validation succeeded but no data was returned"
-          )
-        };
-      }
-      if (config.transform) {
-        const transformed = await Promise.resolve(
-          config.transform(validationResult.value)
-        );
-        return { data: transformed };
-      }
-      return { data: validationResult.value };
-    } catch (error) {
-      return { error };
-    }
-  };
+  return createScraperRuntime(config);
 }
-
-
-  defineScraper
-});
+//#endregion
+exports.defineScraper = defineScraper;
```
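The rebuilt bundle swaps the old inline `defineScraper` body for a shared runtime: `compileExtractConfig` rewrites the user's extract map into a plan for cheerio's native `$.extract`, and `compileDescriptorValue` wraps any function value so the user callback sees an `ExtractNode` while cheerio still supplies the raw element. A rough sketch of what a compiled plan amounts to (the sample HTML and field name are made up):

```typescript
import { load } from "cheerio";
import type { AnyNode } from "domhandler";

const $ = load('<meta name="keywords" content="a,b">');

// After compilation, string values pass through untouched, while a user
// callback has been adapted to cheerio's shape: raw element in, value out.
const plan = {
  keywords: {
    selector: 'meta[name="keywords"]',
    value: (element: AnyNode) => $(element).attr("content")?.split(","),
  },
};
console.log($.extract(plan)); // expected: { keywords: ["a", "b"] }
```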
package/dist/index.d.cts CHANGED

```diff
@@ -1,32 +1,36 @@
-import { StandardSchemaV1 } from '@standard-schema/spec';
-import { Element } from 'domhandler';
+import { StandardSchemaV1 } from "@standard-schema/spec";
 
-type ExtractDescriptorFn = (el: Element, key: string, obj: Record<string, unknown>) => unknown;
-interface ExtractDescriptor {
-  selector: string;
-  value?: string | ExtractDescriptorFn | ExtractMap;
+//#region src/types/extract.d.ts
+type ExtractObjectShape<T> = NonNullable<T> extends readonly unknown[] ? never : NonNullable<T> extends object ? NonNullable<T> : never;
+interface ExtractNode {
+  attr(name: string): string | undefined;
+  html(): string | undefined;
+  text(): string;
+}
+type ExtractValueCallback<T = unknown> = (node: ExtractNode, key: string, object: Record<string, unknown>) => T | undefined;
+interface ExtractDescriptor<T = unknown> {
+  selector: string;
+  value?: string | ExtractValueCallback<T> | ExtractConfig<ExtractObjectShape<T>>;
+}
+type ExtractField<T = unknown> = string | ExtractDescriptor<T> | (T extends readonly (infer Item)[] ? [string | ExtractDescriptor<NonNullable<Item>>] : never);
+type ExtractShape<T> = [T] extends [never] ? Record<string, unknown> : NonNullable<T> extends object ? NonNullable<T> : Record<string, unknown>;
+type ExtractConfig<T = Record<string, unknown>> = { [K in keyof ExtractShape<T>]: ExtractField<ExtractShape<T>[K]> };
+//#endregion
+//#region src/types/main.d.ts
+interface ScraperConfig<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>> {
+  extract: ExtractConfig<StandardSchemaV1.InferOutput<S>>;
+  schema: S;
+  transform?: (data: StandardSchemaV1.InferOutput<S>) => Promise<R> | R;
 }
-type ExtractValue = string | ExtractDescriptor | [string | ExtractDescriptor];
-type ExtractMap = Record<string, ExtractValue>;
-
-type SchemaAwareExtractMap<T> = {
-  [K in keyof T]: ExtractMap[string];
-};
-type ScraperConfig<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>> = {
-  schema: S;
-  extract: SchemaAwareExtractMap<StandardSchemaV1.InferOutput<S>>;
-  transform?: (data: StandardSchemaV1.InferOutput<S>) => Promise<R> | R;
-};
-type ValidationResult<T> = {
-  success: boolean;
-  data?: T;
-  error?: unknown;
-};
 type ScraperResult<T> = {
-  data?: T;
-  error?: unknown;
+  data: T;
+  error?: undefined;
+} | {
+  data?: undefined;
+  error: unknown;
 };
-
-declare function defineScraper<S extends StandardSchemaV1, T extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>, R extends T = T>(config: ScraperConfig<S, R>): (html: string) => Promise<ScraperResult<R>>;
-
-export { type ScraperConfig, type ScraperResult, type ValidationResult, defineScraper };
+//#endregion
+//#region src/index.d.ts
+declare function defineScraper<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>>(config: ScraperConfig<S, R>): (html: string) => Promise<ScraperResult<R>>;
+//#endregion
+export { type ExtractConfig, type ExtractDescriptor, type ExtractField, type ExtractNode, type ExtractValueCallback, type ScraperConfig, type ScraperResult, defineScraper };
```
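The result type also tightens: the old `ScraperResult<T>` with two optional fields (still visible in the deleted `index.d.ts` below) becomes a discriminated union, so checking one side narrows the other. A small sketch, assuming a schema whose output cannot itself be `undefined`:

```typescript
import { z } from "zod";
import { defineScraper } from "xscrape";

const scraper = defineScraper({
  schema: z.object({ title: z.string().default("") }),
  extract: { title: { selector: "title" } },
});

const result = await scraper("<title>Hello</title>");
if (result.data !== undefined) {
  console.log(result.data.title); // success branch: data is { title: string }
} else {
  console.error(result.error); // failure branch: error is unknown
}
```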
package/dist/index.d.mts ADDED

The new ESM declaration file; its 36 lines are identical to the `require`-side declarations in `package/dist/index.d.cts` above.
package/dist/index.mjs ADDED

```js
import { load } from "cheerio";
//#region src/internal/runtime.ts
function createScraperRuntime(config) {
  return async (html) => {
    try {
      const extractedData = extractHtml(html, config.extract);
      const validation = await validateExtractedData(config.schema, extractedData);
      if (!validation.ok) return { error: validation.error };
      return { data: await transformValidatedData(validation.value, config.transform) };
    } catch (error) {
      return { error };
    }
  };
}
function extractHtml(html, extract) {
  const $ = load(html);
  const plan = compileExtractConfig($, extract);
  return $.extract(plan);
}
async function validateExtractedData(schema, extractedData) {
  const validationResult = await Promise.resolve(schema["~standard"].validate(extractedData));
  if (validationResult.issues) return {
    ok: false,
    error: validationResult.issues
  };
  if (!("value" in validationResult)) return {
    ok: false,
    error: /* @__PURE__ */ new Error("xscrape: Validation succeeded but no data was returned")
  };
  return {
    ok: true,
    value: validationResult.value
  };
}
function transformValidatedData(value, transform) {
  return Promise.resolve(transform ? transform(value) : value);
}
function compileExtractConfig($, extract) {
  return Object.fromEntries(Object.entries(extract).map(([key, value]) => [key, compileExtractField($, value)]));
}
function compileExtractField($, field) {
  if (Array.isArray(field)) {
    const item = field[0];
    return [typeof item === "string" ? item : compileExtractDescriptor($, item)];
  }
  if (typeof field === "string") return field;
  return compileExtractDescriptor($, field);
}
function compileExtractDescriptor($, descriptor) {
  const value = compileDescriptorValue($, descriptor.value);
  return value === void 0 ? { selector: descriptor.selector } : {
    selector: descriptor.selector,
    value
  };
}
function compileDescriptorValue($, value) {
  if (value === void 0 || typeof value === "string") return value;
  if (typeof value === "function") return (element, key, obj) => value(createExtractNode($, element), key, obj);
  return compileExtractConfig($, value);
}
function createExtractNode($, element) {
  return {
    attr(name) {
      return element.attribs[name] ?? void 0;
    },
    text() {
      return $(element).text();
    },
    html() {
      return $(element).html() ?? void 0;
    }
  };
}
//#endregion
//#region src/index.ts
function defineScraper(config) {
  return createScraperRuntime(config);
}
//#endregion
export { defineScraper };
```
package/package.json CHANGED

```diff
@@ -1,19 +1,49 @@
 {
+  "dependencies": {
+    "@standard-schema/spec": "^1.1.0",
+    "cheerio": "^1.2.0",
+    "domhandler": "^6.0.1"
+  },
+  "devDependencies": {
+    "@arethetypeswrong/cli": "^0.18.2",
+    "@biomejs/biome": "2.4.14",
+    "@changesets/changelog-github": "^0.7.0",
+    "@changesets/cli": "^2.31.0",
+    "@types/node": "^25.6.0",
+    "arktype": "^2.2.0",
+    "effect": "^3.21.2",
+    "jsdom": "^29.1.1",
+    "lefthook": "^2.1.6",
+    "tsdown": "^0.22.0",
+    "typescript": "^6.0.3",
+    "ultracite": "7.6.3",
+    "valibot": "^1.4.0",
+    "vite": "^8.0.11",
+    "vitepress": "^1.6.4",
+    "vitest": "^4.1.5",
+    "zod": "^4.4.3"
+  },
   "name": "xscrape",
-  "version": "3.0.4",
+  "type": "module",
+  "version": "3.2.1",
   "description": "A flexible and powerful library designed to extract and transform data from HTML documents using user-defined schemas",
-  "main": "dist/index.js",
+  "main": "dist/index.cjs",
   "exports": {
     ".": {
-      "import": …
-      …
+      "import": {
+        "types": "./dist/index.d.mts",
+        "default": "./dist/index.mjs"
+      },
+      "require": {
+        "types": "./dist/index.d.cts",
+        "default": "./dist/index.cjs"
+      }
     }
   },
-  "typings": "dist/index.d.ts",
+  "typings": "dist/index.d.mts",
   "files": [
     "dist"
   ],
-  "type": "module",
   "keywords": [
     "web-scraping",
     "data-extraction",
@@ -39,38 +69,17 @@
     "url": "https://github.com/johnie/xscrape/issues"
   },
   "homepage": "https://github.com/johnie/xscrape#readme",
-  "devDependencies": {
-    "@arethetypeswrong/cli": "^0.18.2",
-    "@biomejs/biome": "2.1.2",
-    "@changesets/changelog-github": "^0.5.1",
-    "@changesets/cli": "^2.29.5",
-    "arktype": "^2.1.20",
-    "effect": "^3.17.0",
-    "jsdom": "^26.1.0",
-    "lefthook": "^1.12.2",
-    "tsup": "^8.5.0",
-    "typescript": "^5.8.3",
-    "valibot": "^1.1.0",
-    "vite": "^7.0.5",
-    "vitepress": "^1.6.3",
-    "vitest": "^3.2.4",
-    "zod": "^4.0.5"
-  },
-  "dependencies": {
-    "@standard-schema/spec": "^1.0.0",
-    "cheerio": "^1.1.2",
-    "domhandler": "^5.0.3"
-  },
   "scripts": {
-    "build": "…
-    "ci": "pnpm run build && pnpm run …
+    "build": "tsdown",
+    "ci": "pnpm run build && pnpm run check && pnpm run typecheck && pnpm run check-exports && pnpm run test",
     "typecheck": "tsc",
     "test": "vitest run",
     "test:watch": "vitest",
-    "…
-    "…
+    "check": "ultracite check",
+    "fix": "ultracite fix",
     "check-exports": "attw --pack .",
-    "…
+    "version": "changeset version && pnpm run fix",
+    "local-release": "pnpm run ci && pnpm run version && changeset publish",
     "release": "pnpm run ci && changeset publish",
     "docs:dev": "vitepress dev docs",
     "docs:build": "vitepress build docs",
```
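With the conditional `exports` map, each module system now resolves a format-matched bundle and declaration file. A sketch of the two consumer sides:

```typescript
// ESM: resolves ./dist/index.mjs, typed by ./dist/index.d.mts.
import { defineScraper } from "xscrape";

// CJS: resolves ./dist/index.cjs, typed by ./dist/index.d.cts.
// const { defineScraper } = require("xscrape");
```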
package/dist/index.d.ts DELETED

```typescript
import { StandardSchemaV1 } from '@standard-schema/spec';
import { Element } from 'domhandler';

type ExtractDescriptorFn = (el: Element, key: string, obj: Record<string, unknown>) => unknown;
interface ExtractDescriptor {
  selector: string;
  value?: string | ExtractDescriptorFn | ExtractMap;
}
type ExtractValue = string | ExtractDescriptor | [string | ExtractDescriptor];
type ExtractMap = Record<string, ExtractValue>;

type SchemaAwareExtractMap<T> = {
  [K in keyof T]: ExtractMap[string];
};
type ScraperConfig<S extends StandardSchemaV1, R extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>> = {
  schema: S;
  extract: SchemaAwareExtractMap<StandardSchemaV1.InferOutput<S>>;
  transform?: (data: StandardSchemaV1.InferOutput<S>) => Promise<R> | R;
};
type ValidationResult<T> = {
  success: boolean;
  data?: T;
  error?: unknown;
};
type ScraperResult<T> = {
  data?: T;
  error?: unknown;
};

declare function defineScraper<S extends StandardSchemaV1, T extends StandardSchemaV1.InferOutput<S> = StandardSchemaV1.InferOutput<S>, R extends T = T>(config: ScraperConfig<S, R>): (html: string) => Promise<ScraperResult<R>>;

export { type ScraperConfig, type ScraperResult, type ValidationResult, defineScraper };
```
package/dist/index.js DELETED

```js
// src/defineScraper.ts
import * as cheerio from "cheerio";
function defineScraper(config) {
  return async (html) => {
    try {
      const $ = cheerio.load(html);
      const extractedData = $.extract(config.extract);
      const validationResult = await Promise.resolve(
        config.schema["~standard"].validate(extractedData)
      );
      if (validationResult.issues) {
        return { error: validationResult.issues };
      }
      if (!("value" in validationResult)) {
        return {
          error: new Error(
            "xscrape: Validation succeeded but no data was returned"
          )
        };
      }
      if (config.transform) {
        const transformed = await Promise.resolve(
          config.transform(validationResult.value)
        );
        return { data: transformed };
      }
      return { data: validationResult.value };
    } catch (error) {
      return { error };
    }
  };
}
export {
  defineScraper
};
```
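For migration contrast: the deleted 3.0.x bundle handed cheerio's raw `domhandler` element straight to `value` callbacks, while 3.2.x passes the `ExtractNode` wrapper. A before/after sketch (the attribute access is illustrative):

```typescript
import type { Element } from "domhandler";
import type { ExtractNode } from "xscrape";

// 3.0.x: callbacks received the raw element, so attribute reads went
// through `attribs`.
const oldStyle = (el: Element) => el.attribs["content"]?.split(",");

// 3.2.x: callbacks receive the ExtractNode wrapper instead.
const newStyle = (node: ExtractNode) => node.attr("content")?.split(",");
```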