recker 1.0.26 → 1.0.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/browser/cache.d.ts +40 -0
- package/dist/browser/browser/cache.js +199 -0
- package/dist/browser/browser/crypto.d.ts +24 -0
- package/dist/browser/browser/crypto.js +80 -0
- package/dist/browser/browser/index.d.ts +31 -0
- package/dist/browser/browser/index.js +31 -0
- package/dist/browser/browser/recker.d.ts +26 -0
- package/dist/browser/browser/recker.js +61 -0
- package/dist/browser/cache/basic-file-storage.d.ts +12 -0
- package/dist/browser/cache/basic-file-storage.js +50 -0
- package/dist/browser/cache/memory-limits.d.ts +20 -0
- package/dist/browser/cache/memory-limits.js +96 -0
- package/dist/browser/cache/memory-storage.d.ts +132 -0
- package/dist/browser/cache/memory-storage.js +454 -0
- package/dist/browser/cache.d.ts +40 -0
- package/dist/browser/cache.js +199 -0
- package/dist/browser/constants/http-status.d.ts +73 -0
- package/dist/browser/constants/http-status.js +156 -0
- package/dist/browser/cookies/memory-cookie-jar.d.ts +30 -0
- package/dist/browser/cookies/memory-cookie-jar.js +210 -0
- package/dist/browser/core/client.d.ts +118 -0
- package/dist/browser/core/client.js +667 -0
- package/dist/browser/core/errors.d.ts +142 -0
- package/dist/browser/core/errors.js +308 -0
- package/dist/browser/core/index.d.ts +5 -0
- package/dist/browser/core/index.js +5 -0
- package/dist/browser/core/request-promise.d.ts +23 -0
- package/dist/browser/core/request-promise.js +82 -0
- package/dist/browser/core/request.d.ts +20 -0
- package/dist/browser/core/request.js +76 -0
- package/dist/browser/core/response.d.ts +34 -0
- package/dist/browser/core/response.js +178 -0
- package/dist/browser/crypto.d.ts +24 -0
- package/dist/browser/crypto.js +80 -0
- package/dist/browser/index.d.ts +31 -0
- package/dist/browser/index.js +31 -0
- package/dist/browser/plugins/auth/api-key.d.ts +8 -0
- package/dist/browser/plugins/auth/api-key.js +27 -0
- package/dist/browser/plugins/auth/auth0.d.ts +33 -0
- package/dist/browser/plugins/auth/auth0.js +94 -0
- package/dist/browser/plugins/auth/aws-sigv4.d.ts +10 -0
- package/dist/browser/plugins/auth/aws-sigv4.js +88 -0
- package/dist/browser/plugins/auth/azure-ad.d.ts +48 -0
- package/dist/browser/plugins/auth/azure-ad.js +152 -0
- package/dist/browser/plugins/auth/basic.d.ts +7 -0
- package/dist/browser/plugins/auth/basic.js +13 -0
- package/dist/browser/plugins/auth/bearer.d.ts +8 -0
- package/dist/browser/plugins/auth/bearer.js +17 -0
- package/dist/browser/plugins/auth/cognito.d.ts +45 -0
- package/dist/browser/plugins/auth/cognito.js +208 -0
- package/dist/browser/plugins/auth/digest.d.ts +8 -0
- package/dist/browser/plugins/auth/digest.js +100 -0
- package/dist/browser/plugins/auth/firebase.d.ts +32 -0
- package/dist/browser/plugins/auth/firebase.js +195 -0
- package/dist/browser/plugins/auth/github-app.d.ts +36 -0
- package/dist/browser/plugins/auth/github-app.js +170 -0
- package/dist/browser/plugins/auth/google-service-account.d.ts +49 -0
- package/dist/browser/plugins/auth/google-service-account.js +172 -0
- package/dist/browser/plugins/auth/index.d.ts +15 -0
- package/dist/browser/plugins/auth/index.js +15 -0
- package/dist/browser/plugins/auth/mtls.d.ts +37 -0
- package/dist/browser/plugins/auth/mtls.js +140 -0
- package/dist/browser/plugins/auth/oauth2.d.ts +8 -0
- package/dist/browser/plugins/auth/oauth2.js +26 -0
- package/dist/browser/plugins/auth/oidc.d.ts +55 -0
- package/dist/browser/plugins/auth/oidc.js +222 -0
- package/dist/browser/plugins/auth/okta.d.ts +47 -0
- package/dist/browser/plugins/auth/okta.js +157 -0
- package/dist/browser/plugins/auth.d.ts +1 -0
- package/dist/browser/plugins/auth.js +1 -0
- package/dist/browser/plugins/cache.d.ts +15 -0
- package/dist/browser/plugins/cache.js +486 -0
- package/dist/browser/plugins/circuit-breaker.d.ts +13 -0
- package/dist/browser/plugins/circuit-breaker.js +100 -0
- package/dist/browser/plugins/compression.d.ts +4 -0
- package/dist/browser/plugins/compression.js +130 -0
- package/dist/browser/plugins/cookie-jar.d.ts +5 -0
- package/dist/browser/plugins/cookie-jar.js +72 -0
- package/dist/browser/plugins/dedup.d.ts +5 -0
- package/dist/browser/plugins/dedup.js +35 -0
- package/dist/browser/plugins/graphql.d.ts +13 -0
- package/dist/browser/plugins/graphql.js +58 -0
- package/dist/browser/plugins/grpc-web.d.ts +79 -0
- package/dist/browser/plugins/grpc-web.js +261 -0
- package/dist/browser/plugins/hls.d.ts +105 -0
- package/dist/browser/plugins/hls.js +395 -0
- package/dist/browser/plugins/jsonrpc.d.ts +75 -0
- package/dist/browser/plugins/jsonrpc.js +143 -0
- package/dist/browser/plugins/logger.d.ts +13 -0
- package/dist/browser/plugins/logger.js +108 -0
- package/dist/browser/plugins/odata.d.ts +181 -0
- package/dist/browser/plugins/odata.js +564 -0
- package/dist/browser/plugins/pagination.d.ts +16 -0
- package/dist/browser/plugins/pagination.js +105 -0
- package/dist/browser/plugins/rate-limit.d.ts +15 -0
- package/dist/browser/plugins/rate-limit.js +162 -0
- package/dist/browser/plugins/retry.d.ts +14 -0
- package/dist/browser/plugins/retry.js +116 -0
- package/dist/browser/plugins/scrape.d.ts +21 -0
- package/dist/browser/plugins/scrape.js +82 -0
- package/dist/browser/plugins/server-timing.d.ts +7 -0
- package/dist/browser/plugins/server-timing.js +24 -0
- package/dist/browser/plugins/soap.d.ts +72 -0
- package/dist/browser/plugins/soap.js +347 -0
- package/dist/browser/plugins/xml.d.ts +9 -0
- package/dist/browser/plugins/xml.js +194 -0
- package/dist/browser/plugins/xsrf.d.ts +9 -0
- package/dist/browser/plugins/xsrf.js +48 -0
- package/dist/browser/recker.d.ts +26 -0
- package/dist/browser/recker.js +61 -0
- package/dist/browser/runner/request-runner.d.ts +46 -0
- package/dist/browser/runner/request-runner.js +89 -0
- package/dist/browser/scrape/document.d.ts +44 -0
- package/dist/browser/scrape/document.js +210 -0
- package/dist/browser/scrape/element.d.ts +49 -0
- package/dist/browser/scrape/element.js +176 -0
- package/dist/browser/scrape/extractors.d.ts +16 -0
- package/dist/browser/scrape/extractors.js +356 -0
- package/dist/browser/scrape/types.d.ts +107 -0
- package/dist/browser/scrape/types.js +1 -0
- package/dist/browser/transport/fetch.d.ts +11 -0
- package/dist/browser/transport/fetch.js +143 -0
- package/dist/browser/transport/undici.d.ts +38 -0
- package/dist/browser/transport/undici.js +897 -0
- package/dist/browser/types/ai.d.ts +267 -0
- package/dist/browser/types/ai.js +1 -0
- package/dist/browser/types/index.d.ts +351 -0
- package/dist/browser/types/index.js +1 -0
- package/dist/browser/types/logger.d.ts +16 -0
- package/dist/browser/types/logger.js +66 -0
- package/dist/browser/types/udp.d.ts +138 -0
- package/dist/browser/types/udp.js +1 -0
- package/dist/browser/utils/agent-manager.d.ts +29 -0
- package/dist/browser/utils/agent-manager.js +160 -0
- package/dist/browser/utils/body.d.ts +10 -0
- package/dist/browser/utils/body.js +148 -0
- package/dist/browser/utils/charset.d.ts +15 -0
- package/dist/browser/utils/charset.js +169 -0
- package/dist/browser/utils/concurrency.d.ts +20 -0
- package/dist/browser/utils/concurrency.js +120 -0
- package/dist/browser/utils/dns.d.ts +6 -0
- package/dist/browser/utils/dns.js +26 -0
- package/dist/browser/utils/header-parser.d.ts +94 -0
- package/dist/browser/utils/header-parser.js +617 -0
- package/dist/browser/utils/html-cleaner.d.ts +1 -0
- package/dist/browser/utils/html-cleaner.js +21 -0
- package/dist/browser/utils/link-header.d.ts +69 -0
- package/dist/browser/utils/link-header.js +190 -0
- package/dist/browser/utils/optional-require.d.ts +19 -0
- package/dist/browser/utils/optional-require.js +105 -0
- package/dist/browser/utils/progress.d.ts +8 -0
- package/dist/browser/utils/progress.js +82 -0
- package/dist/browser/utils/request-pool.d.ts +22 -0
- package/dist/browser/utils/request-pool.js +101 -0
- package/dist/browser/utils/sse.d.ts +7 -0
- package/dist/browser/utils/sse.js +67 -0
- package/dist/browser/utils/streaming.d.ts +17 -0
- package/dist/browser/utils/streaming.js +84 -0
- package/dist/browser/utils/try-fn.d.ts +3 -0
- package/dist/browser/utils/try-fn.js +59 -0
- package/dist/browser/utils/user-agent.d.ts +44 -0
- package/dist/browser/utils/user-agent.js +100 -0
- package/dist/browser/utils/whois.d.ts +32 -0
- package/dist/browser/utils/whois.js +246 -0
- package/dist/browser/websocket/client.d.ts +65 -0
- package/dist/browser/websocket/client.js +313 -0
- package/dist/cli/index.d.ts +1 -0
- package/dist/cli/index.js +1 -0
- package/dist/transport/fetch.d.ts +7 -1
- package/dist/transport/fetch.js +58 -76
- package/package.json +34 -2
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { createClient } from '../core/client.js';
|
|
2
|
+
import { FetchTransport } from '../transport/fetch.js';
|
|
3
|
+
/** Cached client shared by the module-level helpers; `null` until first use and again after `recker.reset()`. */
let _defaultClient = null;
/**
 * Returns the shared default client, creating it on first use.
 *
 * The client is built with `createClient` and a new `FetchTransport`, then stored in
 * `_defaultClient` so later calls reuse the same instance.
 *
 * @returns The cached (or freshly created) client.
 */
function getDefaultClient() {
    if (_defaultClient) {
        return _defaultClient;
    }
    _defaultClient = createClient({ transport: new FetchTransport() });
    return _defaultClient;
}
|
|
12
|
+
/**
 * Forwards to the `get` method of the shared default client.
 *
 * @param url - Passed through unchanged.
 * @param options - Passed through unchanged.
 * @returns Whatever the default client's `get` returns.
 */
export function get(url, options) {
    const client = getDefaultClient();
    return client.get(url, options);
}
|
|
15
|
+
/**
 * Forwards to the `post` method of the shared default client.
 *
 * @param url - Passed through unchanged.
 * @param options - Passed through unchanged.
 * @returns Whatever the default client's `post` returns.
 */
export function post(url, options) {
    const client = getDefaultClient();
    return client.post(url, options);
}
|
|
18
|
+
/**
 * Forwards to the `put` method of the shared default client.
 *
 * @param url - Passed through unchanged.
 * @param options - Passed through unchanged.
 * @returns Whatever the default client's `put` returns.
 */
export function put(url, options) {
    const client = getDefaultClient();
    return client.put(url, options);
}
|
|
21
|
+
/**
 * Forwards to the `patch` method of the shared default client.
 *
 * @param url - Passed through unchanged.
 * @param options - Passed through unchanged.
 * @returns Whatever the default client's `patch` returns.
 */
export function patch(url, options) {
    const client = getDefaultClient();
    return client.patch(url, options);
}
|
|
24
|
+
/**
 * Forwards to the `delete` method of the shared default client.
 *
 * Named `del` because `delete` is a reserved word; it is exposed as `recker.delete`.
 *
 * @param url - Passed through unchanged.
 * @param options - Passed through unchanged.
 * @returns Whatever the default client's `delete` returns.
 */
export function del(url, options) {
    const client = getDefaultClient();
    return client.delete(url, options);
}
|
|
27
|
+
/**
 * Forwards to the `head` method of the shared default client.
 *
 * @param url - Passed through unchanged.
 * @param options - Passed through unchanged.
 * @returns Whatever the default client's `head` returns.
 */
export function head(url, options) {
    const client = getDefaultClient();
    return client.head(url, options);
}
|
|
30
|
+
/**
 * Forwards to the `options` method of the shared default client.
 *
 * Inside the body the `options` parameter shadows this function's own name.
 *
 * @param url - Passed through unchanged.
 * @param options - Passed through unchanged.
 * @returns Whatever the default client's `options` returns.
 */
export function options(url, options) {
    const client = getDefaultClient();
    return client.options(url, options);
}
|
|
33
|
+
/**
 * Creates a socket with the global `WebSocket` constructor.
 *
 * @param url - First argument given to `WebSocket`.
 * @param protocols - Second argument given to `WebSocket`.
 * @returns The newly constructed `WebSocket` instance.
 */
export function ws(url, protocols) {
    const socket = new WebSocket(url, protocols);
    return socket;
}
|
|
36
|
+
/**
 * Facade object bundling the module-level helpers of this browser entry point.
 *
 * - `get` … `options`, `ws`: the functions exported above (`delete` maps to `del`).
 * - `client(opts)`: builds a new client from `opts`; the `transport` key is always set
 *   to a new `FetchTransport`, replacing any `transport` present in `opts`.
 * - `reset()`: drops the cached default client so the next helper call recreates it.
 * - `isBrowser`: always `true` in this build.
 * - `unavailable`: list of feature names; presumably the APIs not offered by the
 *   browser build — confirm against the Node entry point.
 */
export const recker = {
    get,
    post,
    put,
    patch,
    delete: del,
    head,
    options,
    ws,
    client(opts) {
        const settings = { ...opts, transport: new FetchTransport() };
        return createClient(settings);
    },
    reset() {
        _defaultClient = null;
    },
    isBrowser: true,
    unavailable: [
        'whois', 'whoisAvailable',
        'dns', 'dnsSecurity', 'dnsClient',
        'whoisClient',
        'ai', 'aiClient',
    ],
};
export default recker;
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { EventEmitter } from 'events';
|
|
2
|
+
/**
 * Options accepted by the `RequestRunner` constructor.
 */
export interface RunnerOptions {
    /** Upper bound on simultaneously running tasks. */
    concurrency?: number;
    /** NOTE(review): declared here, but its consumer is not visible in this declaration — confirm usage. */
    retries?: number;
    /** NOTE(review): declared here, but its consumer is not visible in this declaration — confirm usage and unit. */
    retryDelay?: number;
}
|
|
7
|
+
/**
 * One queued unit of work.
 *
 * @typeParam T - Value the task's promise resolves with.
 */
export interface RequestTask<T = any> {
    /** Identifier attached to the task. */
    id: string;
    /** Zero-argument function that performs the work. */
    fn: () => Promise<T>;
    /** Numeric priority of the task. */
    priority: number;
    /** NOTE(review): optional per-task retry count; its consumer is not visible here — confirm usage. */
    retries?: number;
}
|
|
13
|
+
/**
 * Outcome of a batch run.
 *
 * @typeParam T - Value produced for a successfully processed item.
 */
export interface RunnerResult<T = any> {
    /** One entry per input item: either the produced value or the `Error` for that item. */
    results: Array<T | Error>;
    /** Aggregate counters for the batch. */
    stats: {
        /** Number of tasks counted for the batch. */
        total: number;
        /** Number of tasks that resolved. */
        successful: number;
        /** Number of tasks that rejected. */
        failed: number;
        /** Elapsed time of the batch; presumably milliseconds — confirm against the implementation. */
        duration: number;
    };
}
|
|
22
|
+
/**
 * Declaration of the event-emitting, concurrency-limited task runner.
 */
export declare class RequestRunner extends EventEmitter {
    private concurrency;
    private queue;
    private activeCount;
    private paused;
    private results;
    private stats;
    private startTime;
    /**
     * @param options - Optional runner configuration.
     */
    constructor(options?: RunnerOptions);
    /**
     * Queues a single task.
     *
     * @param fn - Function performing the work.
     * @param options - Optional `priority` and `id` for the task.
     */
    add<T>(fn: () => Promise<T>, options?: {
        priority?: number;
        id?: string;
    }): void;
    /**
     * Processes every item with `processor` and resolves with per-item results plus stats.
     *
     * @param items - Inputs handed to `processor` together with their index.
     * @param processor - Async function invoked per item.
     * @param options - Optional `priority` applied to the queued tasks.
     */
    run<T>(items: any[], processor: (item: any, index: number) => Promise<T>, options?: {
        priority?: number;
    }): Promise<RunnerResult<T>>;
    private processNext;
    /**
     * Snapshot of the runner's counters.
     *
     * @returns Totals for the run plus `percent`, a numeric completion figure.
     */
    getProgress(): {
        total: number;
        completed: number;
        pending: number;
        active: number;
        percent: number;
    };
}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import { EventEmitter } from 'events';
|
|
2
|
+
export class RequestRunner extends EventEmitter {
|
|
3
|
+
concurrency;
|
|
4
|
+
queue = [];
|
|
5
|
+
activeCount = 0;
|
|
6
|
+
paused = false;
|
|
7
|
+
results = new Map();
|
|
8
|
+
stats = { total: 0, successful: 0, failed: 0 };
|
|
9
|
+
startTime = 0;
|
|
10
|
+
constructor(options = {}) {
|
|
11
|
+
super();
|
|
12
|
+
this.concurrency = options.concurrency || 5;
|
|
13
|
+
}
|
|
14
|
+
add(fn, options = {}) {
|
|
15
|
+
this.queue.push({
|
|
16
|
+
id: options.id || Math.random().toString(36).slice(2),
|
|
17
|
+
fn,
|
|
18
|
+
priority: options.priority || 0,
|
|
19
|
+
});
|
|
20
|
+
this.queue.sort((a, b) => b.priority - a.priority);
|
|
21
|
+
this.stats.total++;
|
|
22
|
+
this.processNext();
|
|
23
|
+
}
|
|
24
|
+
async run(items, processor, options = {}) {
|
|
25
|
+
this.startTime = Date.now();
|
|
26
|
+
this.stats = { total: items.length, successful: 0, failed: 0 };
|
|
27
|
+
this.results.clear();
|
|
28
|
+
const promises = items.map((item, index) => {
|
|
29
|
+
return new Promise((resolve) => {
|
|
30
|
+
this.add(async () => {
|
|
31
|
+
try {
|
|
32
|
+
const res = await processor(item, index);
|
|
33
|
+
resolve(res);
|
|
34
|
+
return res;
|
|
35
|
+
}
|
|
36
|
+
catch (err) {
|
|
37
|
+
resolve(err);
|
|
38
|
+
throw err;
|
|
39
|
+
}
|
|
40
|
+
}, { priority: options.priority, id: String(index) });
|
|
41
|
+
});
|
|
42
|
+
});
|
|
43
|
+
const results = await Promise.all(promises);
|
|
44
|
+
return {
|
|
45
|
+
results,
|
|
46
|
+
stats: {
|
|
47
|
+
...this.stats,
|
|
48
|
+
duration: Date.now() - this.startTime
|
|
49
|
+
}
|
|
50
|
+
};
|
|
51
|
+
}
|
|
52
|
+
async processNext() {
|
|
53
|
+
if (this.paused || this.activeCount >= this.concurrency || this.queue.length === 0) {
|
|
54
|
+
return;
|
|
55
|
+
}
|
|
56
|
+
const task = this.queue.shift();
|
|
57
|
+
if (!task)
|
|
58
|
+
return;
|
|
59
|
+
this.activeCount++;
|
|
60
|
+
this.emit('taskStart', task);
|
|
61
|
+
try {
|
|
62
|
+
const result = await task.fn();
|
|
63
|
+
this.stats.successful++;
|
|
64
|
+
this.emit('taskComplete', { task, result });
|
|
65
|
+
}
|
|
66
|
+
catch (error) {
|
|
67
|
+
this.stats.failed++;
|
|
68
|
+
this.emit('taskError', { task, error });
|
|
69
|
+
}
|
|
70
|
+
finally {
|
|
71
|
+
this.activeCount--;
|
|
72
|
+
this.emit('progress', this.getProgress());
|
|
73
|
+
if (this.activeCount === 0 && this.queue.length === 0) {
|
|
74
|
+
this.emit('drained');
|
|
75
|
+
}
|
|
76
|
+
this.processNext();
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
getProgress() {
|
|
80
|
+
const completed = this.stats.successful + this.stats.failed;
|
|
81
|
+
return {
|
|
82
|
+
total: this.stats.total,
|
|
83
|
+
completed,
|
|
84
|
+
pending: this.queue.length,
|
|
85
|
+
active: this.activeCount,
|
|
86
|
+
percent: this.stats.total > 0 ? (completed / this.stats.total) * 100 : 0
|
|
87
|
+
};
|
|
88
|
+
}
|
|
89
|
+
}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import type { CheerioAPI } from 'cheerio';
|
|
2
|
+
import { ScrapeElement } from './element.js';
|
|
3
|
+
import type { ScrapeOptions, ExtractionSchema, ExtractedLink, ExtractedImage, ExtractedMeta, OpenGraphData, TwitterCardData, JsonLdData, ExtractedForm, ExtractedTable, ExtractedScript, ExtractedStyle, LinkExtractionOptions, ImageExtractionOptions } from './types.js';
|
|
4
|
+
/**
 * Declaration of the document-level scraping wrapper built around a Cheerio root.
 */
export declare class ScrapeDocument {
    private $;
    private options;
    /**
     * @param $ - Cheerio root for an already-parsed document.
     * @param options - Optional scrape options.
     */
    constructor($: CheerioAPI, options?: ScrapeOptions);
    /** Asynchronously builds a document from an HTML string. */
    static create(html: string, options?: ScrapeOptions): Promise<ScrapeDocument>;
    /** The underlying Cheerio root. */
    get raw(): CheerioAPI;
    /** Base URL associated with the document, if any. */
    get baseUrl(): string | undefined;

    // --- element selection ---
    /** Wraps the selection for `selector` in a single `ScrapeElement`. */
    select(selector: string): ScrapeElement;
    /** Wraps a single-element selection for `selector`. */
    selectFirst(selector: string): ScrapeElement;
    /** Returns one `ScrapeElement` per match of `selector`. */
    selectAll(selector: string): ScrapeElement[];
    /** Same signature as `select`. */
    query(selector: string): ScrapeElement;
    /** Same signature as `selectAll`. */
    queryAll(selector: string): ScrapeElement[];

    // --- scalar helpers ---
    /** Text for `selector` as a single string. */
    text(selector: string): string;
    /** Text for `selector`, one string per entry. */
    texts(selector: string): string[];
    /** Value of `attribute` for `selector`, or `undefined`. */
    attr(selector: string, attribute: string): string | undefined;
    /** Values of `attribute` for `selector`. */
    attrs(selector: string, attribute: string): string[];
    /** Inner HTML for `selector`, or `null`. */
    innerHtml(selector: string): string | null;
    /** Outer HTML for `selector`. */
    outerHtml(selector: string): string;

    // --- structured extraction ---
    links(options?: LinkExtractionOptions): ExtractedLink[];
    images(options?: ImageExtractionOptions): ExtractedImage[];
    meta(): ExtractedMeta;
    openGraph(): OpenGraphData;
    twitterCard(): TwitterCardData;
    jsonLd(): JsonLdData[];
    forms(selector?: string): ExtractedForm[];
    tables(selector?: string): ExtractedTable[];
    scripts(): ExtractedScript[];
    styles(): ExtractedStyle[];
    /** Builds an object with one property per key of `schema`. */
    extract<T extends Record<string, unknown>>(schema: ExtractionSchema): T;
    private extractField;

    // --- document parts ---
    /** Document title, or `undefined`. */
    title(): string | undefined;
    body(): ScrapeElement;
    head(): ScrapeElement;
    /** Serialized HTML of the document. */
    html(): string;
    root(): ScrapeElement;

    // --- predicates and searches ---
    /** Whether `selector` matches. */
    exists(selector: string): boolean;
    /** Count for `selector`. */
    count(selector: string): number;
    /** Elements located by `text`, optionally restricted by `selector`. */
    findByText(text: string, selector?: string): ScrapeElement[];
    /** Elements located by exact `text`, optionally restricted by `selector`. */
    findByExactText(text: string, selector?: string): ScrapeElement[];
    /** Elements located by a `data-*` name and optional value. */
    findByData(name: string, value?: string): ScrapeElement[];
}
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
import { requireOptional } from '../utils/optional-require.js';
|
|
2
|
+
import { ScrapeElement } from './element.js';
|
|
3
|
+
import { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
|
|
4
|
+
/** Cached result of the optional `cheerio` load; `null` until the first successful load. */
let cheerioModule = null;
/**
 * Resolves the `cheerio` module, loading it through `requireOptional` only while no
 * cached copy exists.
 *
 * @returns The cached or freshly loaded module; rejects if `requireOptional` rejects.
 */
async function loadCheerio() {
    if (!cheerioModule) {
        cheerioModule = await requireOptional('cheerio', 'recker/scrape');
    }
    return cheerioModule;
}
|
|
12
|
+
export class ScrapeDocument {
|
|
13
|
+
$;
|
|
14
|
+
options;
|
|
15
|
+
constructor($, options) {
|
|
16
|
+
this.$ = $;
|
|
17
|
+
this.options = options || {};
|
|
18
|
+
}
|
|
19
|
+
static async create(html, options) {
|
|
20
|
+
const { load } = await loadCheerio();
|
|
21
|
+
return new ScrapeDocument(load(html), options);
|
|
22
|
+
}
|
|
23
|
+
select(selector) {
|
|
24
|
+
return new ScrapeElement(this.$(selector), this.$);
|
|
25
|
+
}
|
|
26
|
+
selectFirst(selector) {
|
|
27
|
+
return new ScrapeElement(this.$(selector).first(), this.$);
|
|
28
|
+
}
|
|
29
|
+
selectAll(selector) {
|
|
30
|
+
const elements = [];
|
|
31
|
+
this.$(selector).each((_, element) => {
|
|
32
|
+
elements.push(new ScrapeElement(this.$(element), this.$));
|
|
33
|
+
});
|
|
34
|
+
return elements;
|
|
35
|
+
}
|
|
36
|
+
query(selector) {
|
|
37
|
+
return this.select(selector);
|
|
38
|
+
}
|
|
39
|
+
queryAll(selector) {
|
|
40
|
+
return this.selectAll(selector);
|
|
41
|
+
}
|
|
42
|
+
text(selector) {
|
|
43
|
+
return this.$(selector).first().text().trim();
|
|
44
|
+
}
|
|
45
|
+
texts(selector) {
|
|
46
|
+
const texts = [];
|
|
47
|
+
this.$(selector).each((_, element) => {
|
|
48
|
+
const text = this.$(element).text().trim();
|
|
49
|
+
if (text) {
|
|
50
|
+
texts.push(text);
|
|
51
|
+
}
|
|
52
|
+
});
|
|
53
|
+
return texts;
|
|
54
|
+
}
|
|
55
|
+
attr(selector, attribute) {
|
|
56
|
+
return this.$(selector).first().attr(attribute);
|
|
57
|
+
}
|
|
58
|
+
attrs(selector, attribute) {
|
|
59
|
+
const attrs = [];
|
|
60
|
+
this.$(selector).each((_, element) => {
|
|
61
|
+
const value = this.$(element).attr(attribute);
|
|
62
|
+
if (value !== undefined) {
|
|
63
|
+
attrs.push(value);
|
|
64
|
+
}
|
|
65
|
+
});
|
|
66
|
+
return attrs;
|
|
67
|
+
}
|
|
68
|
+
innerHtml(selector) {
|
|
69
|
+
return this.$(selector).first().html();
|
|
70
|
+
}
|
|
71
|
+
outerHtml(selector) {
|
|
72
|
+
const el = this.$(selector).first();
|
|
73
|
+
return this.$.html(el) || '';
|
|
74
|
+
}
|
|
75
|
+
links(options) {
|
|
76
|
+
return extractLinks(this.$, {
|
|
77
|
+
...options,
|
|
78
|
+
baseUrl: this.options.baseUrl,
|
|
79
|
+
});
|
|
80
|
+
}
|
|
81
|
+
images(options) {
|
|
82
|
+
return extractImages(this.$, {
|
|
83
|
+
...options,
|
|
84
|
+
baseUrl: this.options.baseUrl,
|
|
85
|
+
});
|
|
86
|
+
}
|
|
87
|
+
meta() {
|
|
88
|
+
return extractMeta(this.$);
|
|
89
|
+
}
|
|
90
|
+
openGraph() {
|
|
91
|
+
return extractOpenGraph(this.$);
|
|
92
|
+
}
|
|
93
|
+
twitterCard() {
|
|
94
|
+
return extractTwitterCard(this.$);
|
|
95
|
+
}
|
|
96
|
+
jsonLd() {
|
|
97
|
+
return extractJsonLd(this.$);
|
|
98
|
+
}
|
|
99
|
+
forms(selector) {
|
|
100
|
+
return extractForms(this.$, selector);
|
|
101
|
+
}
|
|
102
|
+
tables(selector) {
|
|
103
|
+
return extractTables(this.$, selector);
|
|
104
|
+
}
|
|
105
|
+
scripts() {
|
|
106
|
+
return extractScripts(this.$);
|
|
107
|
+
}
|
|
108
|
+
styles() {
|
|
109
|
+
return extractStyles(this.$);
|
|
110
|
+
}
|
|
111
|
+
extract(schema) {
|
|
112
|
+
const result = {};
|
|
113
|
+
for (const [key, fieldConfig] of Object.entries(schema)) {
|
|
114
|
+
result[key] = this.extractField(fieldConfig);
|
|
115
|
+
}
|
|
116
|
+
return result;
|
|
117
|
+
}
|
|
118
|
+
extractField(field) {
|
|
119
|
+
if (typeof field === 'string') {
|
|
120
|
+
return this.text(field) || undefined;
|
|
121
|
+
}
|
|
122
|
+
const { selector, attribute, multiple, transform } = field;
|
|
123
|
+
if (multiple) {
|
|
124
|
+
const values = [];
|
|
125
|
+
this.$(selector).each((_, element) => {
|
|
126
|
+
const $el = this.$(element);
|
|
127
|
+
let value;
|
|
128
|
+
if (attribute) {
|
|
129
|
+
value = $el.attr(attribute) || '';
|
|
130
|
+
}
|
|
131
|
+
else {
|
|
132
|
+
value = $el.text().trim();
|
|
133
|
+
}
|
|
134
|
+
if (value) {
|
|
135
|
+
values.push(transform ? transform(value) : value);
|
|
136
|
+
}
|
|
137
|
+
});
|
|
138
|
+
return values;
|
|
139
|
+
}
|
|
140
|
+
else {
|
|
141
|
+
const $el = this.$(selector).first();
|
|
142
|
+
let value;
|
|
143
|
+
if (attribute) {
|
|
144
|
+
value = $el.attr(attribute) || '';
|
|
145
|
+
}
|
|
146
|
+
else {
|
|
147
|
+
value = $el.text().trim();
|
|
148
|
+
}
|
|
149
|
+
if (!value)
|
|
150
|
+
return undefined;
|
|
151
|
+
return transform ? transform(value) : value;
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
title() {
|
|
155
|
+
const title = this.$('title').first().text().trim();
|
|
156
|
+
return title || undefined;
|
|
157
|
+
}
|
|
158
|
+
body() {
|
|
159
|
+
return new ScrapeElement(this.$('body').first(), this.$);
|
|
160
|
+
}
|
|
161
|
+
head() {
|
|
162
|
+
return new ScrapeElement(this.$('head').first(), this.$);
|
|
163
|
+
}
|
|
164
|
+
html() {
|
|
165
|
+
return this.$.html() || '';
|
|
166
|
+
}
|
|
167
|
+
root() {
|
|
168
|
+
return new ScrapeElement(this.$.root(), this.$);
|
|
169
|
+
}
|
|
170
|
+
exists(selector) {
|
|
171
|
+
return this.$(selector).length > 0;
|
|
172
|
+
}
|
|
173
|
+
count(selector) {
|
|
174
|
+
return this.$(selector).length;
|
|
175
|
+
}
|
|
176
|
+
findByText(text, selector) {
|
|
177
|
+
const baseSelector = selector || '*';
|
|
178
|
+
const elements = [];
|
|
179
|
+
this.$(baseSelector).each((_, element) => {
|
|
180
|
+
const $el = this.$(element);
|
|
181
|
+
if ($el.text().includes(text)) {
|
|
182
|
+
elements.push(new ScrapeElement($el, this.$));
|
|
183
|
+
}
|
|
184
|
+
});
|
|
185
|
+
return elements;
|
|
186
|
+
}
|
|
187
|
+
findByExactText(text, selector) {
|
|
188
|
+
const baseSelector = selector || '*';
|
|
189
|
+
const elements = [];
|
|
190
|
+
this.$(baseSelector).each((_, element) => {
|
|
191
|
+
const $el = this.$(element);
|
|
192
|
+
if ($el.text().trim() === text) {
|
|
193
|
+
elements.push(new ScrapeElement($el, this.$));
|
|
194
|
+
}
|
|
195
|
+
});
|
|
196
|
+
return elements;
|
|
197
|
+
}
|
|
198
|
+
findByData(name, value) {
|
|
199
|
+
const selector = value !== undefined
|
|
200
|
+
? `[data-${name}="${value}"]`
|
|
201
|
+
: `[data-${name}]`;
|
|
202
|
+
return this.selectAll(selector);
|
|
203
|
+
}
|
|
204
|
+
get raw() {
|
|
205
|
+
return this.$;
|
|
206
|
+
}
|
|
207
|
+
get baseUrl() {
|
|
208
|
+
return this.options.baseUrl;
|
|
209
|
+
}
|
|
210
|
+
}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
import type { Cheerio, CheerioAPI } from 'cheerio';
|
|
2
|
+
import type { Element } from 'domhandler';
|
|
3
|
+
/**
 * Declaration of the wrapper around a Cheerio selection.
 *
 * Traversal and filtering methods return a new `ScrapeElement`; reader methods return
 * plain values.
 */
export declare class ScrapeElement {
    private $el;
    private $;
    /**
     * @param $el - The Cheerio selection to wrap.
     * @param $ - The Cheerio root the selection belongs to.
     */
    constructor($el: Cheerio<Element>, $: CheerioAPI);
    /** The wrapped Cheerio selection. */
    get raw(): Cheerio<Element>;
    /** Number of elements in the selection. */
    get length(): number;

    // --- traversal ---
    find(selector: string): ScrapeElement;
    parent(selector?: string): ScrapeElement;
    parents(selector?: string): ScrapeElement;
    children(selector?: string): ScrapeElement;
    siblings(selector?: string): ScrapeElement;
    next(selector?: string): ScrapeElement;
    prev(selector?: string): ScrapeElement;
    nextAll(selector?: string): ScrapeElement;
    prevAll(selector?: string): ScrapeElement;
    closest(selector: string): ScrapeElement;
    contents(): ScrapeElement;

    // --- narrowing and combining ---
    first(): ScrapeElement;
    last(): ScrapeElement;
    eq(index: number): ScrapeElement;
    filter(selector: string): ScrapeElement;
    not(selector: string): ScrapeElement;
    has(selector: string): ScrapeElement;
    add(selector: string): ScrapeElement;

    // --- readers ---
    /** Text content of the selection. */
    text(): string;
    /** Inner HTML, or `null`. */
    html(): string | null;
    /** Serialized selection. */
    outerHtml(): string;
    /** Value of the named attribute, or `undefined`. */
    attr(name: string): string | undefined;
    /** Attribute map as name/value pairs. */
    attrs(): Record<string, string>;
    /** Data value for `name`, or the whole data set when `name` is omitted. */
    data(name?: string): unknown;
    val(): string | string[] | undefined;
    prop(name: string): unknown;
    /** Tag name, or `undefined`. */
    tagName(): string | undefined;
    /** Node at `index`, or `undefined`. */
    get(index?: number): Element | undefined;

    // --- predicates ---
    /** Whether the selection is non-empty. */
    exists(): boolean;
    is(selector: string): boolean;
    hasClass(className: string): boolean;
    index(selector?: string): number;

    // --- iteration (each callback receives a wrapped element and its index) ---
    /** Invokes `callback` per element and returns this wrapper. */
    each(callback: (el: ScrapeElement, index: number) => void): this;
    map<T>(callback: (el: ScrapeElement, index: number) => T): T[];
    toArray(): ScrapeElement[];
    reduce<T>(callback: (acc: T, el: ScrapeElement, index: number) => T, initialValue: T): T;
    some(callback: (el: ScrapeElement, index: number) => boolean): boolean;
    every(callback: (el: ScrapeElement, index: number) => boolean): boolean;

    // --- misc ---
    clone(): ScrapeElement;
    /** String form of the element. */
    toString(): string;
}
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
export class ScrapeElement {
|
|
2
|
+
$el;
|
|
3
|
+
$;
|
|
4
|
+
constructor($el, $) {
|
|
5
|
+
this.$el = $el;
|
|
6
|
+
this.$ = $;
|
|
7
|
+
}
|
|
8
|
+
find(selector) {
|
|
9
|
+
return new ScrapeElement(this.$el.find(selector), this.$);
|
|
10
|
+
}
|
|
11
|
+
parent(selector) {
|
|
12
|
+
const parent = selector ? this.$el.parent(selector) : this.$el.parent();
|
|
13
|
+
return new ScrapeElement(parent, this.$);
|
|
14
|
+
}
|
|
15
|
+
children(selector) {
|
|
16
|
+
const children = selector ? this.$el.children(selector) : this.$el.children();
|
|
17
|
+
return new ScrapeElement(children, this.$);
|
|
18
|
+
}
|
|
19
|
+
siblings(selector) {
|
|
20
|
+
const siblings = selector ? this.$el.siblings(selector) : this.$el.siblings();
|
|
21
|
+
return new ScrapeElement(siblings, this.$);
|
|
22
|
+
}
|
|
23
|
+
next(selector) {
|
|
24
|
+
const next = selector ? this.$el.next(selector) : this.$el.next();
|
|
25
|
+
return new ScrapeElement(next, this.$);
|
|
26
|
+
}
|
|
27
|
+
prev(selector) {
|
|
28
|
+
const prev = selector ? this.$el.prev(selector) : this.$el.prev();
|
|
29
|
+
return new ScrapeElement(prev, this.$);
|
|
30
|
+
}
|
|
31
|
+
nextAll(selector) {
|
|
32
|
+
const nextAll = selector ? this.$el.nextAll(selector) : this.$el.nextAll();
|
|
33
|
+
return new ScrapeElement(nextAll, this.$);
|
|
34
|
+
}
|
|
35
|
+
prevAll(selector) {
|
|
36
|
+
const prevAll = selector ? this.$el.prevAll(selector) : this.$el.prevAll();
|
|
37
|
+
return new ScrapeElement(prevAll, this.$);
|
|
38
|
+
}
|
|
39
|
+
closest(selector) {
|
|
40
|
+
return new ScrapeElement(this.$el.closest(selector), this.$);
|
|
41
|
+
}
|
|
42
|
+
first() {
|
|
43
|
+
return new ScrapeElement(this.$el.first(), this.$);
|
|
44
|
+
}
|
|
45
|
+
last() {
|
|
46
|
+
return new ScrapeElement(this.$el.last(), this.$);
|
|
47
|
+
}
|
|
48
|
+
eq(index) {
|
|
49
|
+
return new ScrapeElement(this.$el.eq(index), this.$);
|
|
50
|
+
}
|
|
51
|
+
filter(selector) {
|
|
52
|
+
return new ScrapeElement(this.$el.filter(selector), this.$);
|
|
53
|
+
}
|
|
54
|
+
not(selector) {
|
|
55
|
+
return new ScrapeElement(this.$el.not(selector), this.$);
|
|
56
|
+
}
|
|
57
|
+
has(selector) {
|
|
58
|
+
return new ScrapeElement(this.$el.has(selector), this.$);
|
|
59
|
+
}
|
|
60
|
+
add(selector) {
|
|
61
|
+
return new ScrapeElement(this.$el.add(selector), this.$);
|
|
62
|
+
}
|
|
63
|
+
parents(selector) {
|
|
64
|
+
const parents = selector ? this.$el.parents(selector) : this.$el.parents();
|
|
65
|
+
return new ScrapeElement(parents, this.$);
|
|
66
|
+
}
|
|
67
|
+
contents() {
|
|
68
|
+
return new ScrapeElement(this.$el.contents(), this.$);
|
|
69
|
+
}
|
|
70
|
+
text() {
|
|
71
|
+
return this.$el.text().trim();
|
|
72
|
+
}
|
|
73
|
+
html() {
|
|
74
|
+
return this.$el.html();
|
|
75
|
+
}
|
|
76
|
+
outerHtml() {
|
|
77
|
+
return this.$.html(this.$el) || '';
|
|
78
|
+
}
|
|
79
|
+
attr(name) {
|
|
80
|
+
return this.$el.attr(name);
|
|
81
|
+
}
|
|
82
|
+
attrs() {
|
|
83
|
+
const attributes = {};
|
|
84
|
+
const el = this.$el.get(0);
|
|
85
|
+
if (el && 'attribs' in el) {
|
|
86
|
+
Object.assign(attributes, el.attribs);
|
|
87
|
+
}
|
|
88
|
+
return attributes;
|
|
89
|
+
}
|
|
90
|
+
data(name) {
|
|
91
|
+
if (name) {
|
|
92
|
+
return this.$el.data(name);
|
|
93
|
+
}
|
|
94
|
+
return this.$el.data();
|
|
95
|
+
}
|
|
96
|
+
val() {
|
|
97
|
+
return this.$el.val();
|
|
98
|
+
}
|
|
99
|
+
prop(name) {
|
|
100
|
+
return this.$el.prop(name);
|
|
101
|
+
}
|
|
102
|
+
exists() {
|
|
103
|
+
return this.$el.length > 0;
|
|
104
|
+
}
|
|
105
|
+
get length() {
|
|
106
|
+
return this.$el.length;
|
|
107
|
+
}
|
|
108
|
+
is(selector) {
|
|
109
|
+
return this.$el.is(selector);
|
|
110
|
+
}
|
|
111
|
+
hasClass(className) {
|
|
112
|
+
return this.$el.hasClass(className);
|
|
113
|
+
}
|
|
114
|
+
index(selector) {
|
|
115
|
+
return selector ? this.$el.index(selector) : this.$el.index();
|
|
116
|
+
}
|
|
117
|
+
each(callback) {
|
|
118
|
+
this.$el.each((index, element) => {
|
|
119
|
+
callback(new ScrapeElement(this.$(element), this.$), index);
|
|
120
|
+
});
|
|
121
|
+
return this;
|
|
122
|
+
}
|
|
123
|
+
map(callback) {
|
|
124
|
+
const results = [];
|
|
125
|
+
this.$el.each((index, element) => {
|
|
126
|
+
results.push(callback(new ScrapeElement(this.$(element), this.$), index));
|
|
127
|
+
});
|
|
128
|
+
return results;
|
|
129
|
+
}
|
|
130
|
+
toArray() {
|
|
131
|
+
return this.$el.toArray().map((element) => new ScrapeElement(this.$(element), this.$));
|
|
132
|
+
}
|
|
133
|
+
reduce(callback, initialValue) {
|
|
134
|
+
let accumulator = initialValue;
|
|
135
|
+
this.$el.each((index, element) => {
|
|
136
|
+
accumulator = callback(accumulator, new ScrapeElement(this.$(element), this.$), index);
|
|
137
|
+
});
|
|
138
|
+
return accumulator;
|
|
139
|
+
}
|
|
140
|
+
some(callback) {
|
|
141
|
+
let found = false;
|
|
142
|
+
this.$el.each((index, element) => {
|
|
143
|
+
if (callback(new ScrapeElement(this.$(element), this.$), index)) {
|
|
144
|
+
found = true;
|
|
145
|
+
return false;
|
|
146
|
+
}
|
|
147
|
+
});
|
|
148
|
+
return found;
|
|
149
|
+
}
|
|
150
|
+
every(callback) {
|
|
151
|
+
let allMatch = true;
|
|
152
|
+
this.$el.each((index, element) => {
|
|
153
|
+
if (!callback(new ScrapeElement(this.$(element), this.$), index)) {
|
|
154
|
+
allMatch = false;
|
|
155
|
+
return false;
|
|
156
|
+
}
|
|
157
|
+
});
|
|
158
|
+
return allMatch;
|
|
159
|
+
}
|
|
160
|
+
tagName() {
|
|
161
|
+
const el = this.$el.get(0);
|
|
162
|
+
return el ? el.tagName?.toLowerCase() : undefined;
|
|
163
|
+
}
|
|
164
|
+
clone() {
|
|
165
|
+
return new ScrapeElement(this.$el.clone(), this.$);
|
|
166
|
+
}
|
|
167
|
+
toString() {
|
|
168
|
+
return this.outerHtml();
|
|
169
|
+
}
|
|
170
|
+
get raw() {
|
|
171
|
+
return this.$el;
|
|
172
|
+
}
|
|
173
|
+
get(index = 0) {
|
|
174
|
+
return this.$el.get(index);
|
|
175
|
+
}
|
|
176
|
+
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import type { CheerioAPI } from 'cheerio';
|
|
2
|
+
import type { ExtractedLink, ExtractedImage, ExtractedMeta, OpenGraphData, TwitterCardData, JsonLdData, ExtractedForm, ExtractedTable, ExtractedScript, ExtractedStyle, LinkExtractionOptions, ImageExtractionOptions } from './types.js';
|
|
3
|
+
// Ambient declarations for the extractor functions. Each takes the Cheerio root as its
// first argument; behaviour is defined by the implementation module, which is not
// visible from these declarations.

/** Link extractor: optional link options plus an optional `baseUrl`. */
export declare function extractLinks($: CheerioAPI, options?: LinkExtractionOptions & { baseUrl?: string; }): ExtractedLink[];

/** Image extractor: optional image options plus an optional `baseUrl`. */
export declare function extractImages($: CheerioAPI, options?: ImageExtractionOptions & { baseUrl?: string; }): ExtractedImage[];

/** Returns a single `ExtractedMeta` object for the document. */
export declare function extractMeta($: CheerioAPI): ExtractedMeta;

/** Returns a single `OpenGraphData` object for the document. */
export declare function extractOpenGraph($: CheerioAPI): OpenGraphData;

/** Returns a single `TwitterCardData` object for the document. */
export declare function extractTwitterCard($: CheerioAPI): TwitterCardData;

/** Returns an array of `JsonLdData` entries. */
export declare function extractJsonLd($: CheerioAPI): JsonLdData[];

/** Returns an array of `ExtractedForm` entries; `selector` is optional. */
export declare function extractForms($: CheerioAPI, selector?: string): ExtractedForm[];

/** Returns an array of `ExtractedTable` entries; `selector` is optional. */
export declare function extractTables($: CheerioAPI, selector?: string): ExtractedTable[];

/** Returns an array of `ExtractedScript` entries. */
export declare function extractScripts($: CheerioAPI): ExtractedScript[];

/** Returns an array of `ExtractedStyle` entries. */
export declare function extractStyles($: CheerioAPI): ExtractedStyle[];
|