recker 1.0.2-0 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +0 -2
- package/README.md +121 -72
- package/dist/cache/memory-storage.d.ts.map +1 -1
- package/dist/cache/memory-storage.js +7 -1
- package/dist/constants/http-status.d.ts +74 -0
- package/dist/constants/http-status.d.ts.map +1 -0
- package/dist/constants/http-status.js +156 -0
- package/dist/constants.d.ts.map +1 -1
- package/dist/constants.js +6 -6
- package/dist/cookies/memory-cookie-jar.d.ts +31 -0
- package/dist/cookies/memory-cookie-jar.d.ts.map +1 -0
- package/dist/cookies/memory-cookie-jar.js +210 -0
- package/dist/core/client.d.ts +9 -0
- package/dist/core/client.d.ts.map +1 -1
- package/dist/core/client.js +252 -53
- package/dist/core/errors.d.ts +18 -2
- package/dist/core/errors.d.ts.map +1 -1
- package/dist/core/errors.js +66 -5
- package/dist/core/index.d.ts +6 -0
- package/dist/core/index.d.ts.map +1 -0
- package/dist/core/index.js +5 -0
- package/dist/core/request-promise.d.ts.map +1 -1
- package/dist/core/request-promise.js +8 -2
- package/dist/core/request.d.ts +7 -1
- package/dist/core/request.d.ts.map +1 -1
- package/dist/core/request.js +32 -0
- package/dist/core/response.d.ts +2 -0
- package/dist/core/response.d.ts.map +1 -1
- package/dist/core/response.js +44 -19
- package/dist/events/request-events.d.ts +48 -0
- package/dist/events/request-events.d.ts.map +1 -0
- package/dist/events/request-events.js +85 -0
- package/dist/index.d.ts +28 -2
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +28 -2
- package/dist/mcp/client.d.ts.map +1 -1
- package/dist/mcp/client.js +16 -5
- package/dist/mcp/contract.d.ts +77 -0
- package/dist/mcp/contract.d.ts.map +1 -0
- package/dist/mcp/contract.js +278 -0
- package/dist/mcp/types.d.ts +1 -0
- package/dist/mcp/types.d.ts.map +1 -1
- package/dist/plugins/auth.d.ts +45 -0
- package/dist/plugins/auth.d.ts.map +1 -0
- package/dist/plugins/auth.js +268 -0
- package/dist/plugins/cache.d.ts +7 -1
- package/dist/plugins/cache.d.ts.map +1 -1
- package/dist/plugins/cache.js +470 -49
- package/dist/plugins/circuit-breaker.js +1 -1
- package/dist/plugins/compression.d.ts.map +1 -1
- package/dist/plugins/compression.js +3 -3
- package/dist/plugins/dedup.d.ts.map +1 -1
- package/dist/plugins/dedup.js +2 -1
- package/dist/plugins/graphql.d.ts +4 -3
- package/dist/plugins/graphql.d.ts.map +1 -1
- package/dist/plugins/graphql.js +24 -5
- package/dist/plugins/grpc-web.d.ts +80 -0
- package/dist/plugins/grpc-web.d.ts.map +1 -0
- package/dist/plugins/grpc-web.js +261 -0
- package/dist/plugins/har-player.d.ts.map +1 -1
- package/dist/plugins/har-player.js +11 -2
- package/dist/plugins/hls.d.ts +33 -0
- package/dist/plugins/hls.d.ts.map +1 -0
- package/dist/plugins/hls.js +225 -0
- package/dist/plugins/http2-push.d.ts +64 -0
- package/dist/plugins/http2-push.d.ts.map +1 -0
- package/dist/plugins/http2-push.js +274 -0
- package/dist/plugins/http3.d.ts +76 -0
- package/dist/plugins/http3.d.ts.map +1 -0
- package/dist/plugins/http3.js +231 -0
- package/dist/plugins/interface-rotator.d.ts +10 -0
- package/dist/plugins/interface-rotator.d.ts.map +1 -0
- package/dist/plugins/interface-rotator.js +57 -0
- package/dist/plugins/jsonrpc.d.ts +76 -0
- package/dist/plugins/jsonrpc.d.ts.map +1 -0
- package/dist/plugins/jsonrpc.js +143 -0
- package/dist/plugins/logger.d.ts +8 -5
- package/dist/plugins/logger.d.ts.map +1 -1
- package/dist/plugins/logger.js +66 -30
- package/dist/plugins/odata.d.ts +182 -0
- package/dist/plugins/odata.d.ts.map +1 -0
- package/dist/plugins/odata.js +561 -0
- package/dist/plugins/retry.d.ts +1 -0
- package/dist/plugins/retry.d.ts.map +1 -1
- package/dist/plugins/retry.js +26 -2
- package/dist/plugins/scrape.d.ts +22 -0
- package/dist/plugins/scrape.d.ts.map +1 -0
- package/dist/plugins/scrape.js +87 -0
- package/dist/plugins/soap.d.ts +73 -0
- package/dist/plugins/soap.d.ts.map +1 -0
- package/dist/plugins/soap.js +347 -0
- package/dist/plugins/user-agent.d.ts +8 -0
- package/dist/plugins/user-agent.d.ts.map +1 -0
- package/dist/plugins/user-agent.js +46 -0
- package/dist/plugins/xml.d.ts +10 -0
- package/dist/plugins/xml.d.ts.map +1 -0
- package/dist/plugins/xml.js +194 -0
- package/dist/presets/anthropic.d.ts +7 -0
- package/dist/presets/anthropic.d.ts.map +1 -0
- package/dist/presets/anthropic.js +17 -0
- package/dist/presets/azure-openai.d.ts +9 -0
- package/dist/presets/azure-openai.d.ts.map +1 -0
- package/dist/presets/azure-openai.js +25 -0
- package/dist/presets/cloudflare.d.ts +13 -0
- package/dist/presets/cloudflare.d.ts.map +1 -0
- package/dist/presets/cloudflare.js +39 -0
- package/dist/presets/cohere.d.ts +6 -0
- package/dist/presets/cohere.d.ts.map +1 -0
- package/dist/presets/cohere.js +16 -0
- package/dist/presets/deepseek.d.ts +6 -0
- package/dist/presets/deepseek.d.ts.map +1 -0
- package/dist/presets/deepseek.js +16 -0
- package/dist/presets/digitalocean.d.ts +6 -0
- package/dist/presets/digitalocean.d.ts.map +1 -0
- package/dist/presets/digitalocean.js +16 -0
- package/dist/presets/discord.d.ts +7 -0
- package/dist/presets/discord.d.ts.map +1 -0
- package/dist/presets/discord.js +17 -0
- package/dist/presets/fireworks.d.ts +6 -0
- package/dist/presets/fireworks.d.ts.map +1 -0
- package/dist/presets/fireworks.js +16 -0
- package/dist/presets/gemini.d.ts +6 -0
- package/dist/presets/gemini.d.ts.map +1 -0
- package/dist/presets/gemini.js +16 -0
- package/dist/presets/github.d.ts +7 -0
- package/dist/presets/github.d.ts.map +1 -0
- package/dist/presets/github.js +17 -0
- package/dist/presets/gitlab.d.ts +7 -0
- package/dist/presets/gitlab.d.ts.map +1 -0
- package/dist/presets/gitlab.js +16 -0
- package/dist/presets/groq.d.ts +6 -0
- package/dist/presets/groq.d.ts.map +1 -0
- package/dist/presets/groq.js +16 -0
- package/dist/presets/huggingface.d.ts +6 -0
- package/dist/presets/huggingface.d.ts.map +1 -0
- package/dist/presets/huggingface.js +16 -0
- package/dist/presets/index.d.ts +28 -0
- package/dist/presets/index.d.ts.map +1 -0
- package/dist/presets/index.js +27 -0
- package/dist/presets/linear.d.ts +6 -0
- package/dist/presets/linear.d.ts.map +1 -0
- package/dist/presets/linear.js +16 -0
- package/dist/presets/mistral.d.ts +6 -0
- package/dist/presets/mistral.d.ts.map +1 -0
- package/dist/presets/mistral.js +16 -0
- package/dist/presets/notion.d.ts +7 -0
- package/dist/presets/notion.d.ts.map +1 -0
- package/dist/presets/notion.js +17 -0
- package/dist/presets/openai.d.ts +8 -0
- package/dist/presets/openai.d.ts.map +1 -0
- package/dist/presets/openai.js +23 -0
- package/dist/presets/perplexity.d.ts +6 -0
- package/dist/presets/perplexity.d.ts.map +1 -0
- package/dist/presets/perplexity.js +16 -0
- package/dist/presets/registry.d.ts +20 -0
- package/dist/presets/registry.d.ts.map +1 -0
- package/dist/presets/registry.js +311 -0
- package/dist/presets/replicate.d.ts +6 -0
- package/dist/presets/replicate.d.ts.map +1 -0
- package/dist/presets/replicate.js +16 -0
- package/dist/presets/slack.d.ts +6 -0
- package/dist/presets/slack.d.ts.map +1 -0
- package/dist/presets/slack.js +16 -0
- package/dist/presets/stripe.d.ts +8 -0
- package/dist/presets/stripe.d.ts.map +1 -0
- package/dist/presets/stripe.js +23 -0
- package/dist/presets/supabase.d.ts +7 -0
- package/dist/presets/supabase.d.ts.map +1 -0
- package/dist/presets/supabase.js +18 -0
- package/dist/presets/together.d.ts +6 -0
- package/dist/presets/together.d.ts.map +1 -0
- package/dist/presets/together.js +16 -0
- package/dist/presets/twilio.d.ts +7 -0
- package/dist/presets/twilio.d.ts.map +1 -0
- package/dist/presets/twilio.js +17 -0
- package/dist/presets/vercel.d.ts +7 -0
- package/dist/presets/vercel.d.ts.map +1 -0
- package/dist/presets/vercel.js +23 -0
- package/dist/presets/xai.d.ts +7 -0
- package/dist/presets/xai.d.ts.map +1 -0
- package/dist/presets/xai.js +17 -0
- package/dist/protocols/ftp.d.ts +63 -0
- package/dist/protocols/ftp.d.ts.map +1 -0
- package/dist/protocols/ftp.js +388 -0
- package/dist/protocols/index.d.ts +4 -0
- package/dist/protocols/index.d.ts.map +1 -0
- package/dist/protocols/index.js +3 -0
- package/dist/protocols/sftp.d.ts +65 -0
- package/dist/protocols/sftp.d.ts.map +1 -0
- package/dist/protocols/sftp.js +346 -0
- package/dist/protocols/telnet.d.ts +50 -0
- package/dist/protocols/telnet.d.ts.map +1 -0
- package/dist/protocols/telnet.js +139 -0
- package/dist/runner/request-runner.d.ts.map +1 -1
- package/dist/runner/request-runner.js +1 -0
- package/dist/scrape/document.d.ts +44 -0
- package/dist/scrape/document.d.ts.map +1 -0
- package/dist/scrape/document.js +198 -0
- package/dist/scrape/element.d.ts +50 -0
- package/dist/scrape/element.d.ts.map +1 -0
- package/dist/scrape/element.js +176 -0
- package/dist/scrape/extractors.d.ts +17 -0
- package/dist/scrape/extractors.d.ts.map +1 -0
- package/dist/scrape/extractors.js +356 -0
- package/dist/scrape/index.d.ts +5 -0
- package/dist/scrape/index.d.ts.map +1 -0
- package/dist/scrape/index.js +3 -0
- package/dist/scrape/types.d.ts +108 -0
- package/dist/scrape/types.d.ts.map +1 -0
- package/dist/scrape/types.js +1 -0
- package/dist/testing/index.d.ts +3 -0
- package/dist/testing/index.d.ts.map +1 -0
- package/dist/testing/index.js +1 -0
- package/dist/testing/mock.d.ts +58 -0
- package/dist/testing/mock.d.ts.map +1 -0
- package/dist/testing/mock.js +252 -0
- package/dist/transport/fetch.d.ts.map +1 -1
- package/dist/transport/fetch.js +12 -4
- package/dist/transport/undici.d.ts +17 -1
- package/dist/transport/undici.d.ts.map +1 -1
- package/dist/transport/undici.js +708 -47
- package/dist/types/index.d.ts +111 -10
- package/dist/types/index.d.ts.map +1 -1
- package/dist/types/index.js +1 -1
- package/dist/types/logger.d.ts +17 -0
- package/dist/types/logger.d.ts.map +1 -0
- package/dist/types/logger.js +66 -0
- package/dist/utils/agent-manager.d.ts.map +1 -1
- package/dist/utils/agent-manager.js +20 -4
- package/dist/utils/body.d.ts.map +1 -1
- package/dist/utils/body.js +14 -2
- package/dist/utils/charset.d.ts +16 -0
- package/dist/utils/charset.d.ts.map +1 -0
- package/dist/utils/charset.js +169 -0
- package/dist/utils/client-pool.d.ts +21 -0
- package/dist/utils/client-pool.d.ts.map +1 -0
- package/dist/utils/client-pool.js +49 -0
- package/dist/utils/concurrency.d.ts.map +1 -1
- package/dist/utils/concurrency.js +8 -4
- package/dist/utils/dns-toolkit.d.ts +13 -0
- package/dist/utils/dns-toolkit.d.ts.map +1 -0
- package/dist/utils/dns-toolkit.js +48 -0
- package/dist/utils/doh.d.ts.map +1 -1
- package/dist/utils/doh.js +16 -3
- package/dist/utils/download.d.ts +15 -0
- package/dist/utils/download.d.ts.map +1 -0
- package/dist/utils/download.js +44 -0
- package/dist/utils/env-proxy.d.ts +13 -0
- package/dist/utils/env-proxy.d.ts.map +1 -0
- package/dist/utils/env-proxy.js +105 -0
- package/dist/utils/header-parser.d.ts +15 -1
- package/dist/utils/header-parser.d.ts.map +1 -1
- package/dist/utils/header-parser.js +161 -1
- package/dist/utils/link-header.d.ts +70 -0
- package/dist/utils/link-header.d.ts.map +1 -0
- package/dist/utils/link-header.js +190 -0
- package/dist/utils/progress.d.ts +7 -2
- package/dist/utils/progress.d.ts.map +1 -1
- package/dist/utils/progress.js +48 -15
- package/dist/utils/rdap.d.ts +17 -0
- package/dist/utils/rdap.d.ts.map +1 -0
- package/dist/utils/rdap.js +32 -0
- package/dist/utils/request-pool.d.ts.map +1 -1
- package/dist/utils/request-pool.js +4 -3
- package/dist/utils/sse.d.ts.map +1 -1
- package/dist/utils/sse.js +8 -2
- package/dist/utils/status-codes.d.ts +84 -0
- package/dist/utils/status-codes.d.ts.map +1 -0
- package/dist/utils/status-codes.js +204 -0
- package/dist/utils/streaming.d.ts.map +1 -1
- package/dist/utils/streaming.js +1 -0
- package/dist/utils/tls-inspector.d.ts +21 -0
- package/dist/utils/tls-inspector.d.ts.map +1 -0
- package/dist/utils/tls-inspector.js +39 -0
- package/dist/utils/try-fn.d.ts.map +1 -1
- package/dist/utils/try-fn.js +11 -5
- package/dist/utils/upload.d.ts +1 -0
- package/dist/utils/upload.d.ts.map +1 -1
- package/dist/utils/upload.js +20 -3
- package/dist/utils/user-agent.d.ts +9 -9
- package/dist/utils/user-agent.js +9 -9
- package/dist/utils/whois.d.ts.map +1 -1
- package/dist/utils/whois.js +11 -2
- package/dist/websocket/client.d.ts +29 -1
- package/dist/websocket/client.d.ts.map +1 -1
- package/dist/websocket/client.js +145 -13
- package/package.json +45 -8
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
import { load } from 'cheerio';
|
|
2
|
+
import { ScrapeElement } from './element.js';
|
|
3
|
+
import { extractLinks, extractImages, extractMeta, extractOpenGraph, extractTwitterCard, extractJsonLd, extractForms, extractTables, extractScripts, extractStyles, } from './extractors.js';
|
|
4
|
+
/**
 * Query facade over an HTML string parsed with cheerio's `load`.
 *
 * Selection methods return `ScrapeElement` wrappers; the `text`/`attr`
 * family returns plain strings; the structured extractors (`links`, `meta`,
 * `forms`, ...) delegate to the functions imported from `./extractors.js`.
 */
export class ScrapeDocument {
    /** Cheerio API produced by `load(html)`; every query goes through it. */
    $;
    /** Options given to the constructor (`{}` when omitted). Only `baseUrl` is read here. */
    options;
    /**
     * @param html Markup handed to cheerio's `load`.
     * @param options Optional settings object; `baseUrl` is forwarded to the link and image extractors.
     */
    constructor(html, options) {
        this.$ = load(html);
        this.options = options || {};
    }
    /** Wraps every node matching `selector` in a single ScrapeElement. */
    select(selector) {
        return new ScrapeElement(this.$(selector), this.$);
    }
    /** Wraps only the first node matching `selector` (an empty wrapper when nothing matches). */
    selectFirst(selector) {
        return new ScrapeElement(this.$(selector).first(), this.$);
    }
    /** Returns one ScrapeElement per node matching `selector`, in cheerio's iteration order. */
    selectAll(selector) {
        const elements = [];
        this.$(selector).each((_, element) => {
            elements.push(new ScrapeElement(this.$(element), this.$));
        });
        return elements;
    }
    /** Alias of `select`. */
    query(selector) {
        return this.select(selector);
    }
    /** Alias of `selectAll`. */
    queryAll(selector) {
        return this.selectAll(selector);
    }
    /** Trimmed text of the first match; `''` when nothing matches. */
    text(selector) {
        return this.$(selector).first().text().trim();
    }
    /** Trimmed text of every match, skipping matches whose trimmed text is empty. */
    texts(selector) {
        const texts = [];
        this.$(selector).each((_, element) => {
            const text = this.$(element).text().trim();
            if (text) {
                texts.push(text);
            }
        });
        return texts;
    }
    /** Value of `attribute` on the first match, or `undefined`. */
    attr(selector, attribute) {
        return this.$(selector).first().attr(attribute);
    }
    /** Values of `attribute` across all matches; matches lacking the attribute are skipped, empty strings are kept. */
    attrs(selector, attribute) {
        const attrs = [];
        this.$(selector).each((_, element) => {
            const value = this.$(element).attr(attribute);
            if (value !== undefined) {
                attrs.push(value);
            }
        });
        return attrs;
    }
    /** Inner HTML of the first match, exactly as cheerio's `.html()` returns it (may be `null`). */
    innerHtml(selector) {
        return this.$(selector).first().html();
    }
    /** Serialised markup of the first match including its own tag; `''` when serialisation yields nothing. */
    outerHtml(selector) {
        const el = this.$(selector).first();
        return this.$.html(el) || '';
    }
    /** Delegates to `extractLinks`; the document's `baseUrl` always replaces any `baseUrl` in `options`. */
    links(options) {
        return extractLinks(this.$, {
            ...options,
            baseUrl: this.options.baseUrl,
        });
    }
    /** Delegates to `extractImages`; the document's `baseUrl` always replaces any `baseUrl` in `options`. */
    images(options) {
        return extractImages(this.$, {
            ...options,
            baseUrl: this.options.baseUrl,
        });
    }
    /** Delegates to `extractMeta`. */
    meta() {
        return extractMeta(this.$);
    }
    /** Delegates to `extractOpenGraph`. */
    openGraph() {
        return extractOpenGraph(this.$);
    }
    /** Delegates to `extractTwitterCard`. */
    twitterCard() {
        return extractTwitterCard(this.$);
    }
    /** Delegates to `extractJsonLd`. */
    jsonLd() {
        return extractJsonLd(this.$);
    }
    /** Delegates to `extractForms`, passing `selector` through unchanged. */
    forms(selector) {
        return extractForms(this.$, selector);
    }
    /** Delegates to `extractTables`, passing `selector` through unchanged. */
    tables(selector) {
        return extractTables(this.$, selector);
    }
    /** Delegates to `extractScripts`. */
    scripts() {
        return extractScripts(this.$);
    }
    /** Delegates to `extractStyles`. */
    styles() {
        return extractStyles(this.$);
    }
    /**
     * Builds an object with the same keys as `schema`, each resolved through
     * `extractField`.
     */
    extract(schema) {
        const result = {};
        for (const [key, fieldConfig] of Object.entries(schema)) {
            result[key] = this.extractField(fieldConfig);
        }
        return result;
    }
    /**
     * Resolves one schema field.
     *
     * - A string is treated as a selector: trimmed text of the first match,
     *   or `undefined` when that text is empty.
     * - An object `{ selector, attribute, multiple, transform }` reads the
     *   attribute (when `attribute` is set) or the trimmed text. Empty values
     *   are dropped; `transform`, when provided, is applied to each kept value.
     *   With `multiple` the result is an array (possibly empty), otherwise a
     *   single value or `undefined`.
     */
    extractField(field) {
        if (typeof field === 'string') {
            return this.text(field) || undefined;
        }
        const { selector, attribute, multiple, transform } = field;
        // Shared by both modes so single and multiple extraction cannot drift apart.
        const readValue = ($el) => (attribute ? ($el.attr(attribute) || '') : $el.text().trim());
        const applyTransform = (value) => (transform ? transform(value) : value);
        if (!multiple) {
            const value = readValue(this.$(selector).first());
            return value ? applyTransform(value) : undefined;
        }
        const values = [];
        this.$(selector).each((_, element) => {
            const value = readValue(this.$(element));
            if (value) {
                values.push(applyTransform(value));
            }
        });
        return values;
    }
    /** Trimmed text of the first `<title>`, or `undefined` when absent or empty. */
    title() {
        const title = this.$('title').first().text().trim();
        return title || undefined;
    }
    /** Wrapper around the first `body` element. */
    body() {
        return new ScrapeElement(this.$('body').first(), this.$);
    }
    /** Wrapper around the first `head` element. */
    head() {
        return new ScrapeElement(this.$('head').first(), this.$);
    }
    /** Serialised markup of the whole document; `''` when serialisation yields nothing. */
    html() {
        return this.$.html() || '';
    }
    /** Wrapper around cheerio's root selection. */
    root() {
        return new ScrapeElement(this.$.root(), this.$);
    }
    /** True when at least one node matches `selector`. */
    exists(selector) {
        return this.$(selector).length > 0;
    }
    /** Number of nodes matching `selector`. */
    count(selector) {
        return this.$(selector).length;
    }
    /**
     * Elements (matching `selector`, default `*`) whose text contains `text`.
     * The text checked is cheerio's `.text()`, which includes descendants, so
     * ancestors of a matching node are returned as well.
     */
    findByText(text, selector) {
        const baseSelector = selector || '*';
        const elements = [];
        this.$(baseSelector).each((_, element) => {
            const $el = this.$(element);
            if ($el.text().includes(text)) {
                elements.push(new ScrapeElement($el, this.$));
            }
        });
        return elements;
    }
    /** Like `findByText`, but the trimmed text must equal `text` exactly. */
    findByExactText(text, selector) {
        const baseSelector = selector || '*';
        const elements = [];
        this.$(baseSelector).each((_, element) => {
            const $el = this.$(element);
            if ($el.text().trim() === text) {
                elements.push(new ScrapeElement($el, this.$));
            }
        });
        return elements;
    }
    /**
     * Elements carrying a `data-<name>` attribute, optionally restricted to an
     * exact `value`.
     *
     * `value` is embedded in a double-quoted CSS attribute selector, so any
     * backslash or double quote in it is backslash-escaped first; without
     * that, such a value would terminate the quoted string early and yield an
     * invalid or unintended selector. `name` is inserted as-is.
     */
    findByData(name, value) {
        if (value === undefined) {
            return this.selectAll(`[data-${name}]`);
        }
        const escaped = String(value).replace(/["\\]/g, '\\$&');
        return this.selectAll(`[data-${name}="${escaped}"]`);
    }
    /** The underlying cheerio API. */
    get raw() {
        return this.$;
    }
    /** `baseUrl` from the constructor options, if any. */
    get baseUrl() {
        return this.options.baseUrl;
    }
}
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import type { Cheerio, CheerioAPI } from 'cheerio';
|
|
2
|
+
import type { Element } from 'domhandler';
|
|
3
|
+
/**
 * Wrapper around a cheerio selection. Traversal and filter methods return a
 * new `ScrapeElement`; readers return plain values.
 */
export declare class ScrapeElement {
    private $el;
    private $;
    /**
     * @param $el Cheerio selection to wrap.
     * @param $ Cheerio API used to re-wrap nodes and serialise outer HTML.
     */
    constructor($el: Cheerio<Element>, $: CheerioAPI);

    // --- Traversal: each call forwards to the cheerio method of the same name. ---
    find(selector: string): ScrapeElement;
    parent(selector?: string): ScrapeElement;
    children(selector?: string): ScrapeElement;
    siblings(selector?: string): ScrapeElement;
    next(selector?: string): ScrapeElement;
    prev(selector?: string): ScrapeElement;
    nextAll(selector?: string): ScrapeElement;
    prevAll(selector?: string): ScrapeElement;
    closest(selector: string): ScrapeElement;

    // --- Narrowing / combining the current selection. ---
    first(): ScrapeElement;
    last(): ScrapeElement;
    eq(index: number): ScrapeElement;
    filter(selector: string): ScrapeElement;
    not(selector: string): ScrapeElement;
    has(selector: string): ScrapeElement;
    add(selector: string): ScrapeElement;
    parents(selector?: string): ScrapeElement;
    contents(): ScrapeElement;

    // --- Readers. ---
    /** Text of the selection with surrounding whitespace trimmed. */
    text(): string;
    /** Inner HTML as returned by cheerio's `.html()`. */
    html(): string | null;
    /** Serialised markup including the selection's own tag; `''` when empty. */
    outerHtml(): string;
    attr(name: string): string | undefined;
    /** Copy of the first node's attribute map; `{}` when there is no such node. */
    attrs(): Record<string, string>;
    /** With `name`: that data entry. Without: the whole data object. */
    data(name?: string): unknown;
    val(): string | string[] | undefined;
    prop(name: string): unknown;

    // --- Predicates and counts. ---
    /** True when the selection holds at least one node. */
    exists(): boolean;
    get length(): number;
    is(selector: string): boolean;
    hasClass(className: string): boolean;
    index(selector?: string): number;

    // --- Iteration: callbacks receive each node wrapped individually. ---
    /** Visits every node; the callback's return value is ignored. */
    each(callback: (el: ScrapeElement, index: number) => void): this;
    map<T>(callback: (el: ScrapeElement, index: number) => T): T[];
    toArray(): ScrapeElement[];
    reduce<T>(callback: (acc: T, el: ScrapeElement, index: number) => T, initialValue: T): T;
    /** Stops at the first node for which the callback is truthy. */
    some(callback: (el: ScrapeElement, index: number) => boolean): boolean;
    /** Stops at the first node for which the callback is falsy. */
    every(callback: (el: ScrapeElement, index: number) => boolean): boolean;

    // --- Miscellaneous. ---
    /** Lower-cased tag name of the first node, if any. */
    tagName(): string | undefined;
    clone(): ScrapeElement;
    /** Same as `outerHtml()`. */
    toString(): string;
    /** The wrapped cheerio selection. */
    get raw(): Cheerio<Element>;
    /** Raw node at `index` (the first node when omitted). */
    get(index?: number): Element | undefined;
}
|
|
50
|
+
//# sourceMappingURL=element.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"element.d.ts","sourceRoot":"","sources":["../../src/scrape/element.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,EAAE,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AACnD,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AAE1C,qBAAa,aAAa;IACxB,OAAO,CAAC,GAAG,CAAmB;IAC9B,OAAO,CAAC,CAAC,CAAa;gBAEV,GAAG,EAAE,OAAO,CAAC,OAAO,CAAC,EAAE,CAAC,EAAE,UAAU;IAUhD,IAAI,CAAC,QAAQ,EAAE,MAAM,GAAG,aAAa;IAOrC,MAAM,CAAC,QAAQ,CAAC,EAAE,MAAM,GAAG,aAAa;IAQxC,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,GAAG,aAAa;IAQ1C,QAAQ,CAAC,QAAQ,CAAC,EAAE,MAAM,GAAG,aAAa;IAQ1C,IAAI,CAAC,QAAQ,CAAC,EAAE,MAAM,GAAG,aAAa;IAQtC,IAAI,CAAC,QAAQ,CAAC,EAAE,MAAM,GAAG,aAAa;IAQtC,OAAO,CAAC,QAAQ,CAAC,EAAE,MAAM,GAAG,aAAa;IAQzC,OAAO,CAAC,QAAQ,CAAC,EAAE,MAAM,GAAG,aAAa;IAQzC,OAAO,CAAC,QAAQ,EAAE,MAAM,GAAG,aAAa;IAOxC,KAAK,IAAI,aAAa;IAOtB,IAAI,IAAI,aAAa;IAOrB,EAAE,CAAC,KAAK,EAAE,MAAM,GAAG,aAAa;IAOhC,MAAM,CAAC,QAAQ,EAAE,MAAM,GAAG,aAAa;IAOvC,GAAG,CAAC,QAAQ,EAAE,MAAM,GAAG,aAAa;IAOpC,GAAG,CAAC,QAAQ,EAAE,MAAM,GAAG,aAAa;IAOpC,GAAG,CAAC,QAAQ,EAAE,MAAM,GAAG,aAAa;IAOpC,OAAO,CAAC,QAAQ,CAAC,EAAE,MAAM,GAAG,aAAa;IAQzC,QAAQ,IAAI,aAAa;IASzB,IAAI,IAAI,MAAM;IAOd,IAAI,IAAI,MAAM,GAAG,IAAI;IAOrB,SAAS,IAAI,MAAM;IAOnB,IAAI,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,SAAS;IAOtC,KAAK,IAAI,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC;IAY/B,IAAI,CAAC,IAAI,CAAC,EAAE,MAAM,GAAG,OAAO;IAU5B,GAAG,IAAI,MAAM,GAAG,MAAM,EAAE,GAAG,SAAS;IAOpC,IAAI,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO;IAS3B,MAAM,IAAI,OAAO;IAOjB,IAAI,MAAM,IAAI,MAAM,CAEnB;IAKD,EAAE,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO;IAO7B,QAAQ,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO;IAOpC,KAAK,CAAC,QAAQ,CAAC,EAAE,MAAM,GAAG,MAAM;IAShC,IAAI,CAAC,QAAQ,EAAE,CAAC,EAAE,EAAE,aAAa,EAAE,KAAK,EAAE,MAAM,KAAK,IAAI,GAAG,IAAI;IAUhE,GAAG,CAAC,CAAC,EAAE,QAAQ,EAAE,CAAC,EAAE,EAAE,aAAa,EAAE,KAAK,EAAE,MAAM,KAAK,CAAC,GAAG,CAAC,EAAE;IAW9D,OAAO,IAAI,aAAa,EAAE;IAS1B,MAAM,CAAC,CAAC,EAAE,QAAQ,EAAE,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,EAAE,aAAa,EAAE,KAAK,EAAE,MAAM,KAAK,CAAC,EAAE,YAAY,EAAE,CAAC,GAAG,CAAC;IAexF,IAAI,CAAC,QAAQ,EAAE,CAAC,EAAE,EAAE,aAAa,EAAE,KAAK,EAAE,MAAM,KAAK,OAAO,GAAG,OAAO;IActE,KAAK,CAAC,QAAQ,E
AAE,CAAC,EAAE,EAAE,aAAa,EAAE,KAAK,EAAE,MAAM,KAAK,OAAO,GAAG,OAAO;IAgBvE,OAAO,IAAI,MAAM,GAAG,SAAS;IAQ7B,KAAK,IAAI,aAAa;IAOtB,QAAQ,IAAI,MAAM;IASlB,IAAI,GAAG,IAAI,OAAO,CAAC,OAAO,CAAC,CAE1B;IAKD,GAAG,CAAC,KAAK,GAAE,MAAU,GAAG,OAAO,GAAG,SAAS;CAG5C"}
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
/**
 * Wrapper around a cheerio selection.
 *
 * Traversal and filtering methods forward to the cheerio method of the same
 * name and wrap the result in a new ScrapeElement; readers return plain
 * values. Iteration helpers hand each node to the callback as its own
 * single-node ScrapeElement.
 */
export class ScrapeElement {
    /** Wrapped cheerio selection. */
    $el;
    /** Cheerio API, used to re-wrap raw nodes and to serialise outer HTML. */
    $;
    constructor($el, $) {
        this.$ = $;
        this.$el = $el;
    }
    /** Descendants matching `selector`. */
    find(selector) {
        const found = this.$el.find(selector);
        return new ScrapeElement(found, this.$);
    }
    /** Parent(s), filtered by `selector` when one is given. */
    parent(selector) {
        if (selector) {
            return new ScrapeElement(this.$el.parent(selector), this.$);
        }
        return new ScrapeElement(this.$el.parent(), this.$);
    }
    /** Children, filtered by `selector` when one is given. */
    children(selector) {
        if (selector) {
            return new ScrapeElement(this.$el.children(selector), this.$);
        }
        return new ScrapeElement(this.$el.children(), this.$);
    }
    /** Siblings, filtered by `selector` when one is given. */
    siblings(selector) {
        if (selector) {
            return new ScrapeElement(this.$el.siblings(selector), this.$);
        }
        return new ScrapeElement(this.$el.siblings(), this.$);
    }
    /** Immediately following sibling, filtered by `selector` when one is given. */
    next(selector) {
        if (selector) {
            return new ScrapeElement(this.$el.next(selector), this.$);
        }
        return new ScrapeElement(this.$el.next(), this.$);
    }
    /** Immediately preceding sibling, filtered by `selector` when one is given. */
    prev(selector) {
        if (selector) {
            return new ScrapeElement(this.$el.prev(selector), this.$);
        }
        return new ScrapeElement(this.$el.prev(), this.$);
    }
    /** All following siblings, filtered by `selector` when one is given. */
    nextAll(selector) {
        if (selector) {
            return new ScrapeElement(this.$el.nextAll(selector), this.$);
        }
        return new ScrapeElement(this.$el.nextAll(), this.$);
    }
    /** All preceding siblings, filtered by `selector` when one is given. */
    prevAll(selector) {
        if (selector) {
            return new ScrapeElement(this.$el.prevAll(selector), this.$);
        }
        return new ScrapeElement(this.$el.prevAll(), this.$);
    }
    /** Result of cheerio's `closest(selector)`. */
    closest(selector) {
        const nearest = this.$el.closest(selector);
        return new ScrapeElement(nearest, this.$);
    }
    /** First node of the selection. */
    first() {
        const head = this.$el.first();
        return new ScrapeElement(head, this.$);
    }
    /** Last node of the selection. */
    last() {
        const tail = this.$el.last();
        return new ScrapeElement(tail, this.$);
    }
    /** Node at position `index`, as resolved by cheerio's `eq`. */
    eq(index) {
        const picked = this.$el.eq(index);
        return new ScrapeElement(picked, this.$);
    }
    /** Result of cheerio's `filter(selector)`. */
    filter(selector) {
        const kept = this.$el.filter(selector);
        return new ScrapeElement(kept, this.$);
    }
    /** Result of cheerio's `not(selector)`. */
    not(selector) {
        const remaining = this.$el.not(selector);
        return new ScrapeElement(remaining, this.$);
    }
    /** Result of cheerio's `has(selector)`. */
    has(selector) {
        const containing = this.$el.has(selector);
        return new ScrapeElement(containing, this.$);
    }
    /** Result of cheerio's `add(selector)`. */
    add(selector) {
        const combined = this.$el.add(selector);
        return new ScrapeElement(combined, this.$);
    }
    /** Ancestors, filtered by `selector` when one is given. */
    parents(selector) {
        if (selector) {
            return new ScrapeElement(this.$el.parents(selector), this.$);
        }
        return new ScrapeElement(this.$el.parents(), this.$);
    }
    /** Result of cheerio's `contents()`. */
    contents() {
        const inner = this.$el.contents();
        return new ScrapeElement(inner, this.$);
    }
    /** Text of the selection with surrounding whitespace trimmed. */
    text() {
        const rawText = this.$el.text();
        return rawText.trim();
    }
    /** Inner HTML exactly as cheerio's `.html()` returns it. */
    html() {
        return this.$el.html();
    }
    /** Serialised markup of the selection; `''` when serialisation yields nothing. */
    outerHtml() {
        const markup = this.$.html(this.$el);
        return markup || '';
    }
    /** Value of attribute `name`, as returned by cheerio's `attr`. */
    attr(name) {
        return this.$el.attr(name);
    }
    /** Copy of the first node's `attribs` map; `{}` when there is no first node or it has no `attribs`. */
    attrs() {
        const node = this.$el.get(0);
        const copy = {};
        if (!node || !('attribs' in node)) {
            return copy;
        }
        return Object.assign(copy, node.attribs);
    }
    /** With a truthy `name`, that data entry; otherwise the whole data object. */
    data(name) {
        return name ? this.$el.data(name) : this.$el.data();
    }
    /** Result of cheerio's `val()`. */
    val() {
        return this.$el.val();
    }
    /** Property `name`, as returned by cheerio's `prop`. */
    prop(name) {
        return this.$el.prop(name);
    }
    /** True when the selection holds at least one node. */
    exists() {
        return this.length > 0;
    }
    /** Number of nodes in the selection. */
    get length() {
        return this.$el.length;
    }
    /** Result of cheerio's `is(selector)`. */
    is(selector) {
        return this.$el.is(selector);
    }
    /** Result of cheerio's `hasClass(className)`. */
    hasClass(className) {
        return this.$el.hasClass(className);
    }
    /** Result of cheerio's `index`, called with `selector` only when it is truthy. */
    index(selector) {
        if (selector) {
            return this.$el.index(selector);
        }
        return this.$el.index();
    }
    /** Calls `callback(wrapper, index)` for every node, ignoring its return value; returns `this`. */
    each(callback) {
        this.$el.each((position, node) => {
            const wrapped = new ScrapeElement(this.$(node), this.$);
            callback(wrapped, position);
        });
        return this;
    }
    /** Collects `callback(wrapper, index)` for every node into an array. */
    map(callback) {
        return this.$el
            .toArray()
            .map((node, position) => callback(new ScrapeElement(this.$(node), this.$), position));
    }
    /** One single-node ScrapeElement per node in the selection. */
    toArray() {
        const wrappers = [];
        for (const node of this.$el.toArray()) {
            wrappers.push(new ScrapeElement(this.$(node), this.$));
        }
        return wrappers;
    }
    /** Folds the nodes left to right, starting from `initialValue`. */
    reduce(callback, initialValue) {
        return this.$el
            .toArray()
            .reduce((acc, node, position) => callback(acc, new ScrapeElement(this.$(node), this.$), position), initialValue);
    }
    /** True as soon as `callback` is truthy for some node; later nodes are not visited. */
    some(callback) {
        return this.$el
            .toArray()
            .some((node, position) => callback(new ScrapeElement(this.$(node), this.$), position));
    }
    /** False as soon as `callback` is falsy for some node; later nodes are not visited. */
    every(callback) {
        return this.$el
            .toArray()
            .every((node, position) => callback(new ScrapeElement(this.$(node), this.$), position));
    }
    /** Lower-cased `tagName` of the first node; `undefined` when there is no node or no tag name. */
    tagName() {
        const node = this.$el.get(0);
        if (!node) {
            return undefined;
        }
        return node.tagName?.toLowerCase();
    }
    /** Wrapper around cheerio's `clone()` of the selection. */
    clone() {
        const copy = this.$el.clone();
        return new ScrapeElement(copy, this.$);
    }
    /** Same as `outerHtml()`. */
    toString() {
        return this.outerHtml();
    }
    /** The wrapped cheerio selection. */
    get raw() {
        return this.$el;
    }
    /** Raw node at `index` (defaults to 0), as returned by cheerio's `get`. */
    get(index = 0) {
        return this.$el.get(index);
    }
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
import type { CheerioAPI } from 'cheerio';
|
|
2
|
+
import type { ExtractedLink, ExtractedImage, ExtractedMeta, OpenGraphData, TwitterCardData, JsonLdData, ExtractedForm, ExtractedTable, ExtractedScript, ExtractedStyle, LinkExtractionOptions, ImageExtractionOptions } from './types.js';
|
|
3
|
+
// Declarations for the extractor helpers. Their implementation is not part of
// this declaration file, so the notes below restate only what the signatures show.

/** Returns `ExtractedLink` records for the loaded document; `options` may carry a `baseUrl`. */
export declare function extractLinks($: CheerioAPI, options?: LinkExtractionOptions & { baseUrl?: string }): ExtractedLink[];

/** Returns `ExtractedImage` records for the loaded document; `options` may carry a `baseUrl`. */
export declare function extractImages($: CheerioAPI, options?: ImageExtractionOptions & { baseUrl?: string }): ExtractedImage[];

/** Returns a single `ExtractedMeta` object for the loaded document. */
export declare function extractMeta($: CheerioAPI): ExtractedMeta;

/** Returns a single `OpenGraphData` object for the loaded document. */
export declare function extractOpenGraph($: CheerioAPI): OpenGraphData;

/** Returns a single `TwitterCardData` object for the loaded document. */
export declare function extractTwitterCard($: CheerioAPI): TwitterCardData;

/** Returns zero or more `JsonLdData` entries for the loaded document. */
export declare function extractJsonLd($: CheerioAPI): JsonLdData[];

/** Returns `ExtractedForm` records; `selector` is optional (presumably narrows which forms are read — confirm in extractors.js). */
export declare function extractForms($: CheerioAPI, selector?: string): ExtractedForm[];

/** Returns `ExtractedTable` records; `selector` is optional (presumably narrows which tables are read — confirm in extractors.js). */
export declare function extractTables($: CheerioAPI, selector?: string): ExtractedTable[];

/** Returns `ExtractedScript` records for the loaded document. */
export declare function extractScripts($: CheerioAPI): ExtractedScript[];

/** Returns `ExtractedStyle` records for the loaded document. */
export declare function extractStyles($: CheerioAPI): ExtractedStyle[];
|
|
17
|
+
//# sourceMappingURL=extractors.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"extractors.d.ts","sourceRoot":"","sources":["../../src/scrape/extractors.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,EAAE,UAAU,EAAE,MAAM,SAAS,CAAC;AAC1C,OAAO,KAAK,EACV,aAAa,EACb,cAAc,EACd,aAAa,EACb,aAAa,EACb,eAAe,EACf,UAAU,EACV,aAAa,EAEb,cAAc,EACd,eAAe,EACf,cAAc,EACd,qBAAqB,EACrB,sBAAsB,EACvB,MAAM,YAAY,CAAC;AAoDpB,wBAAgB,YAAY,CAC1B,CAAC,EAAE,UAAU,EACb,OAAO,CAAC,EAAE,qBAAqB,GAAG;IAAE,OAAO,CAAC,EAAE,MAAM,CAAA;CAAE,GACrD,aAAa,EAAE,CAwBjB;AAKD,wBAAgB,aAAa,CAC3B,CAAC,EAAE,UAAU,EACb,OAAO,CAAC,EAAE,sBAAsB,GAAG;IAAE,OAAO,CAAC,EAAE,MAAM,CAAA;CAAE,GACtD,cAAc,EAAE,CA4BlB;AAKD,wBAAgB,WAAW,CAAC,CAAC,EAAE,UAAU,GAAG,aAAa,CA0DxD;AAKD,wBAAgB,gBAAgB,CAAC,CAAC,EAAE,UAAU,GAAG,aAAa,CAiD7D;AAKD,wBAAgB,kBAAkB,CAAC,CAAC,EAAE,UAAU,GAAG,eAAe,CAqCjE;AAKD,wBAAgB,aAAa,CAAC,CAAC,EAAE,UAAU,GAAG,UAAU,EAAE,CAwBzD;AAKD,wBAAgB,YAAY,CAAC,CAAC,EAAE,UAAU,EAAE,QAAQ,CAAC,EAAE,MAAM,GAAG,aAAa,EAAE,CA+C9E;AAKD,wBAAgB,aAAa,CAAC,CAAC,EAAE,UAAU,EAAE,QAAQ,CAAC,EAAE,MAAM,GAAG,cAAc,EAAE,CA2DhF;AAKD,wBAAgB,cAAc,CAAC,CAAC,EAAE,UAAU,GAAG,eAAe,EAAE,CAqB/D;AAKD,wBAAgB,aAAa,CAAC,CAAC,EAAE,UAAU,GAAG,cAAc,EAAE,CAyB7D"}
|