@ioodev/nodescraper 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,534 @@
1
+ 'use strict';
2
+
3
+ const axios = require('axios');
4
+ const cheerio = require('cheerio');
5
+
6
+ const {
7
+ DEFAULT_USER_AGENT,
8
+ DEFAULT_TIMEOUT,
9
+ DEFAULT_MAX_REDIRECTS,
10
+ ALLOWED_PROTOCOLS,
11
+ OPEN_GRAPH_PROPERTIES,
12
+ TWITTER_CARD_PROPERTIES
13
+ } = require('./constants');
14
+
15
+ const {
16
+ isValidUrl,
17
+ splitList,
18
+ splitRel,
19
+ normalizeWhitespace,
20
+ resolveUrl,
21
+ escapeAttributeValue
22
+ } = require('./utils');
23
+
24
+ /**
25
+ * @typedef {Object} NodeScraperOptions
26
+ * @property {number} [timeout=10000] Request timeout in milliseconds.
27
+ * @property {string} [userAgent] User-Agent header sent with the request.
28
+ * @property {Object<string,string>} [headers] Extra headers merged into the request.
29
+ * @property {number} [maxRedirects=5] Maximum number of redirects to follow.
30
+ * @property {string[]} [allowedProtocols=['http:','https:']] Protocols accepted by the URL validator.
31
+ * @property {boolean} [throwOnError=false] If true, `init()` rejects instead of swallowing errors.
32
+ */
33
+
34
/**
 * Loads an HTML document (over HTTP via `init()`, or from a string via
 * `loadHTML()`) and exposes read-only accessors for common page data:
 * metadata, Open Graph / Twitter Card tags, headings, lists, images and links.
 *
 * Every accessor returns `null` while no document is loaded.
 */
class NodeScraper {
  /**
   * @param {string} url The page URL to scrape.
   * @param {NodeScraperOptions} [options]
   */
  constructor(url, options = {}) {
    // The parameter default only covers `undefined`; tolerate an explicit `null` too.
    const opts = options ?? {};

    this.url = url;

    /** Parsed cheerio document, or `null` until `init()`/`loadHTML()` succeeds. */
    this.soup = null;
    /** Raw HTML of the last successful load. */
    this.rawHtml = null;
    /** HTTP status code of the last request, if any. */
    this.statusCode = null;
    /** Error from the last failed `init()` call, if any. */
    this.error = null;

    this.options = {
      timeout: opts.timeout ?? DEFAULT_TIMEOUT,
      userAgent: opts.userAgent ?? DEFAULT_USER_AGENT,
      headers: opts.headers ?? {},
      maxRedirects: opts.maxRedirects ?? DEFAULT_MAX_REDIRECTS,
      allowedProtocols: opts.allowedProtocols ?? ALLOWED_PROTOCOLS,
      throwOnError: opts.throwOnError ?? false
    };
  }

  // ---------------------------------------------------------------------
  // Loading
  // ---------------------------------------------------------------------

  /**
   * Fetch `this.url` and parse the response with cheerio.
   *
   * Failures are recorded rather than hidden: `this.error` holds the reason
   * (invalid URL, network failure, non-2xx response, parse failure) and
   * `this.statusCode` holds the status of *this* call's response, or `null`
   * when no response was received. Pass `{ throwOnError: true }` to the
   * constructor to have this method reject instead of resolving.
   *
   * @returns {Promise<NodeScraper>} `this`, for chaining.
   * @throws {Error} Only when `options.throwOnError` is true.
   */
  async init() {
    this.error = null;
    // Reset up front so a failure below never reports the status code of an
    // earlier, unrelated request.
    this.statusCode = null;

    if (!this._isValidUrl(this.url)) {
      this.error = new Error(`Invalid or unsupported URL: ${this.url}`);
      this.soup = null;
      if (this.options.throwOnError) throw this.error;
      return this;
    }

    try {
      const response = await axios.get(this.url, {
        timeout: this.options.timeout,
        maxRedirects: this.options.maxRedirects,
        // Resolve regardless of status so we can report *why* a scrape
        // failed instead of throwing away the status code.
        validateStatus: () => true,
        headers: {
          'User-Agent': this.options.userAgent,
          Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          // Caller headers come last so they can override the defaults above.
          ...this.options.headers
        }
      });

      this.statusCode = response.status;

      if (response.status < 200 || response.status >= 300) {
        // Handled by the catch below, exactly like a network failure.
        throw new Error(`Request failed with status code ${response.status}`);
      }

      this.rawHtml = typeof response.data === 'string' ? response.data : String(response.data);
      this.soup = cheerio.load(this.rawHtml);
    } catch (err) {
      this.error = err;
      this.soup = null;
      if (this.options.throwOnError) throw err;
    }

    return this;
  }

  /**
   * Parse a raw HTML string directly, without making an HTTP request.
   * Handy for tests or when the HTML was obtained some other way.
   * Clears any previous error and status code.
   *
   * @param {string} html
   * @returns {NodeScraper} `this`, for chaining.
   * @throws {TypeError} If `html` is not a string.
   */
  loadHTML(html) {
    if (typeof html !== 'string') {
      throw new TypeError('loadHTML() expects an HTML string');
    }
    this.rawHtml = html;
    this.soup = cheerio.load(html);
    this.error = null;
    this.statusCode = null;
    return this;
  }

  /** @returns {boolean} Whether a document is currently loaded. */
  isLoaded() {
    return this.soup !== null;
  }

  /** @returns {Error|null} The error from the last failed load, if any. */
  getError() {
    return this.error;
  }

  /** @returns {number|null} The HTTP status code of the last request, if any. */
  getStatusCode() {
    return this.statusCode;
  }

  /**
   * Validate a URL against this instance's `allowedProtocols`.
   *
   * @param {string} url
   * @returns {boolean}
   */
  _isValidUrl(url) {
    return isValidUrl(url, this.options.allowedProtocols);
  }

  /**
   * Read one attribute from the first element matching `selector`.
   *
   * @param {string} selector CSS selector.
   * @param {string} attribute Attribute name to read.
   * @returns {string|null} The attribute value; `null` when nothing is loaded,
   *   nothing matches, or the attribute is missing/empty.
   */
  _attr(selector, attribute) {
    if (!this.soup) return null;
    return this.soup(selector).attr(attribute) || null;
  }

  // ---------------------------------------------------------------------
  // Page metadata
  // ---------------------------------------------------------------------

  /** @returns {string|null} Text of the `<title>` element. */
  title() {
    return this.soup ? this.soup('title').text() || null : null;
  }

  /** @returns {string|null} Declared charset, from `<meta charset>` or the Content-Type meta tag. */
  charset() {
    if (!this.soup) return null;
    const direct = this.soup('meta[charset]').attr('charset');
    if (direct) return direct;
    // Fallback: some pages only declare charset via the legacy http-equiv form.
    const contentType = this.content_type();
    const match = contentType && contentType.match(/charset=([^;]+)/i);
    return match ? match[1].trim() : null;
  }

  /** @returns {string[]|null} Trimmed viewport directives, e.g. `["width=device-width", "initial-scale=1"]`. */
  viewport() {
    return splitList(this.viewport_string());
  }

  /** @returns {string|null} Raw `content` of the viewport meta tag. */
  viewport_string() {
    return this._attr('meta[name="viewport"]', 'content');
  }

  /** @returns {Object<string,string>|null} Viewport directives parsed into key/value pairs. */
  viewport_object() {
    const items = this.viewport();
    if (!items) return null;
    const result = {};
    for (const item of items) {
      const [key, value] = item.split('=').map((part) => part && part.trim());
      // A directive without `=` is kept with an empty-string value.
      if (key) result[key] = value ?? '';
    }
    return result;
  }

  /** @returns {string|null} `href` of `<link rel="canonical">`. */
  canonical() {
    return this._attr('link[rel="canonical"]', 'href');
  }

  /** @returns {string|null} `content` of `<meta http-equiv="Content-Type">`. */
  content_type() {
    return this._attr('meta[http-equiv="Content-Type"]', 'content');
  }

  /** @returns {string|null} CSRF token from a `csrf-token` meta tag, falling back to an input of that name. */
  csrf_token() {
    if (!this.soup) return null;
    let tag = this.soup('meta[name="csrf-token"]');
    if (tag.length === 0) tag = this.soup('input[name="csrf-token"]');
    return tag.attr('content') || tag.attr('value') || null;
  }

  /** @returns {string|null} `content` of the `author` meta tag. */
  author() {
    return this._attr('meta[name="author"]', 'content');
  }

  /** @returns {string|null} `content` of the `description` meta tag. */
  description() {
    return this._attr('meta[name="description"]', 'content');
  }

  /** @returns {string|null} `content` of the `og:image` meta tag. */
  image() {
    return this._attr('meta[property="og:image"]', 'content');
  }

  /** @returns {string|null} The page's `<html lang="...">` attribute. */
  lang() {
    return this._attr('html', 'lang');
  }

  /** @returns {string|null} The `robots` meta directive, e.g. `"index, follow"`. */
  robots() {
    return this.meta('robots');
  }

  /** @returns {string|null} Absolute URL of the page favicon, if declared. */
  favicon() {
    const href = this._attr('link[rel="icon"], link[rel="shortcut icon"]', 'href');
    if (!href) return null;
    // Fall back to the raw href when it cannot be resolved against `this.url`.
    return resolveUrl(href, this.url) ?? href;
  }

  /** @returns {string[]|null} Trimmed keyword list from the `keywords` meta tag. */
  keywords() {
    return splitList(this.keyword_string());
  }

  /** @returns {string|null} Raw `content` of the `keywords` meta tag. */
  keyword_string() {
    return this._attr('meta[name="keywords"]', 'content');
  }

  /**
   * Generic meta tag reader.
   *
   * @param {string} name The `name`/`property` value to look up.
   * @param {'name'|'property'} [attr='name'] Which attribute to match on.
   * @returns {string|null}
   */
  meta(name, attr = 'name') {
    if (!this.soup || !name) return null;
    const safe = escapeAttributeValue(name);
    return this.soup(`meta[${attr}="${safe}"]`).attr('content') || null;
  }

  // ---------------------------------------------------------------------
  // Open Graph / Twitter Card
  // ---------------------------------------------------------------------

  /**
   * @param {string|null} [prop=null] A single Open Graph property (e.g. `"og:title"`).
   * @returns {string|Object<string,string|null>|null} The property's value when
   *   `prop` is given; otherwise an object keyed by `OPEN_GRAPH_PROPERTIES`.
   */
  open_graph(prop = null) {
    if (!this.soup) return null;
    if (prop) return this.meta(prop, 'property');

    const result = {};
    for (const p of OPEN_GRAPH_PROPERTIES) {
      result[p] = this.meta(p, 'property');
    }
    return result;
  }

  /**
   * @param {string|null} [prop=null] A single Twitter Card property (e.g. `"twitter:card"`).
   * @returns {string|Object<string,string|null>|null} The property's value when
   *   `prop` is given; otherwise an object keyed by `TWITTER_CARD_PROPERTIES`.
   */
  twitter_card(prop = null) {
    if (!this.soup) return null;
    if (prop) return this.meta(prop, 'name');

    const result = {};
    for (const p of TWITTER_CARD_PROPERTIES) {
      result[p] = this.meta(p, 'name');
    }
    return result;
  }

  /** @returns {Object[]|null} Parsed `application/ld+json` blocks found on the page. */
  jsonLd() {
    if (!this.soup) return null;
    const results = [];
    this.soup('script[type="application/ld+json"]').each((_, el) => {
      const raw = this.soup(el).contents().text();
      try {
        results.push(JSON.parse(raw));
      } catch {
        // Skip malformed JSON-LD blocks rather than failing the whole scrape.
      }
    });
    return results;
  }

  // ---------------------------------------------------------------------
  // Headings & text
  // ---------------------------------------------------------------------

  /**
   * @param {string} tagName Selector whose matches are read.
   * @returns {string[]|null} Trimmed text of every match, in document order.
   */
  _tagList(tagName) {
    if (!this.soup) return null;
    return this.soup(tagName)
      .map((_, el) => this.soup(el).text().trim())
      .get();
  }

  /** @returns {string[]|null} Trimmed text of every `<h1>`. */
  h1() { return this._tagList('h1'); }
  /** @returns {string[]|null} Trimmed text of every `<h2>`. */
  h2() { return this._tagList('h2'); }
  /** @returns {string[]|null} Trimmed text of every `<h3>`. */
  h3() { return this._tagList('h3'); }
  /** @returns {string[]|null} Trimmed text of every `<h4>`. */
  h4() { return this._tagList('h4'); }
  /** @returns {string[]|null} Trimmed text of every `<h5>`. */
  h5() { return this._tagList('h5'); }
  /** @returns {string[]|null} Trimmed text of every `<h6>`. */
  h6() { return this._tagList('h6'); }
  /** @returns {string[]|null} Trimmed text of every `<p>`. */
  p() { return this._tagList('p'); }

  /** @returns {string|null} Normalized, whitespace-collapsed visible body text. */
  text() {
    if (!this.soup) return null;
    return normalizeWhitespace(this.soup('body').text());
  }

  /** @returns {string|null} The raw HTML of the last successful load. */
  html() {
    return this.rawHtml;
  }

  // ---------------------------------------------------------------------
  // Lists
  // ---------------------------------------------------------------------

  /**
   * Collect the trimmed text of every `<li>` found under each `listTag`
   * element. Items of nested lists are visited once per enclosing list, so
   * they can appear more than once in the result.
   *
   * @param {string} listTag `"ul"` or `"ol"`.
   * @returns {string[]|null}
   */
  _listItems(listTag) {
    if (!this.soup) return null;
    const result = [];
    this.soup(listTag).each((_, list) => {
      this.soup(list).find('li').each((__, li) => {
        result.push(this.soup(li).text().trim());
      });
    });
    return result;
  }

  /** @returns {string[]|null} Trimmed text of the `<li>` items under every `<ul>`. */
  ul() {
    return this._listItems('ul');
  }

  /** @returns {string[]|null} Trimmed text of the `<li>` items under every `<ol>`. */
  ol() {
    return this._listItems('ol');
  }

  // ---------------------------------------------------------------------
  // Images
  // ---------------------------------------------------------------------

  /** @returns {string[]|null} The `src` of every `<img>` that declares one. */
  images() {
    if (!this.soup) return null;
    return this.soup('img').map((_, el) => this.soup(el).attr('src')).get();
  }

  /**
   * @returns {Array<{url: (string|undefined), absolute_url: (string|null), alt_text: (string|null), title: (string|null)}>|null}
   *   One entry per `<img>`; `absolute_url` is `src` resolved against `this.url`.
   */
  image_details() {
    if (!this.soup) return null;
    return this.soup('img').map((_, el) => {
      const $img = this.soup(el);
      const src = $img.attr('src');
      return {
        url: src,
        absolute_url: resolveUrl(src, this.url),
        alt_text: $img.attr('alt') || null,
        title: $img.attr('title') || null
      };
    }).get();
  }

  // ---------------------------------------------------------------------
  // Links
  // ---------------------------------------------------------------------

  /** @returns {string[]|null} Every non-empty `href` found on an `<a>`. */
  links() {
    if (!this.soup) return null;
    return this.soup('a').map((_, el) => this.soup(el).attr('href')).get().filter(Boolean);
  }

  /**
   * @returns {Object[]|null} One entry per `<a>` with its raw and resolved URL,
   *   URL scheme, text, `title`, `target`, `rel` tokens and `is_*` rel flags.
   */
  link_details() {
    if (!this.soup) return null;
    const result = [];
    this.soup('a').each((_, el) => {
      const $el = this.soup(el);
      const href = $el.attr('href') || '';
      const rel = splitRel($el.attr('rel'));
      // Only a leading `scheme:` counts as a protocol. A colon further along
      // (e.g. in a query string such as `/search?t=12:30`) does not.
      const scheme = href.trim().match(/^([a-z][a-z0-9+.-]*):/i);
      result.push({
        url: href,
        absolute_url: resolveUrl(href, this.url),
        protocol: scheme ? scheme[1] : '',
        text: $el.text().trim(),
        title: $el.attr('title') || '',
        target: $el.attr('target') || '',
        rel,
        is_nofollow: rel.includes('nofollow'),
        is_ugc: rel.includes('ugc'),
        is_noopener: rel.includes('noopener'),
        is_noreferrer: rel.includes('noreferrer')
      });
    });
    return result;
  }

  // ---------------------------------------------------------------------
  // Custom filtering
  // ---------------------------------------------------------------------

  /**
   * @param {Object} params
   * @param {string} params.element Tag name to match, e.g. `"div"`.
   * @param {Object<string,string>} [params.attributes] Exact attribute values to match.
   * @param {boolean} [params.multiple=false] Return all matches instead of just the first.
   * @param {string[]} [params.extract=[]] Tag/class/id selectors to extract from each match.
   * @param {boolean} [params.returnHtml=true] Return the element's HTML instead of trimmed text.
   * @returns {*} Depends on `multiple`/`extract`; `null` on no match or invalid input.
   */
  filter({ element, attributes = {}, multiple = false, extract = [], returnHtml = true } = {}) {
    if (!this.soup || typeof element !== 'string' || typeof attributes !== 'object') return null;

    try {
      const match = this.soup(element).filter((_, el) => {
        return Object.entries(attributes).every(([key, value]) => this.soup(el).attr(key) === value);
      });

      // Result keys: `.foo` -> `class__foo`, `#bar` -> `id__bar`, anything
      // else is used verbatim. The lookup itself is the selector as given.
      const extractContentFromTag = (el, selectors) => {
        const result = {};
        for (const sel of selectors) {
          let key = sel;
          if (sel.startsWith('.')) {
            key = `class__${sel.slice(1)}`;
          } else if (sel.startsWith('#')) {
            key = `id__${sel.slice(1)}`;
          }
          result[key] = this.soup(el).find(sel).text().trim() || null;
        }
        return result;
      };

      const wantsExtract = Array.isArray(extract) && extract.length > 0;
      const render = (el) => {
        if (wantsExtract) return extractContentFromTag(el, extract);
        return returnHtml ? this.soup.html(el) : this.soup(el).text().trim();
      };

      if (multiple) {
        return match.map((_, el) => render(el)).get();
      }

      const first = match.get(0);
      return first ? render(first) : null;
    } catch {
      // Malformed selector or unexpected DOM shape: fail soft instead of throwing.
      return null;
    }
  }

  // ---------------------------------------------------------------------
  // Convenience
  // ---------------------------------------------------------------------

  /** @returns {Object|null} A snapshot of the most commonly used fields. */
  toJSON() {
    if (!this.soup) return null;
    return {
      url: this.url,
      statusCode: this.statusCode,
      title: this.title(),
      description: this.description(),
      canonical: this.canonical(),
      lang: this.lang(),
      charset: this.charset(),
      robots: this.robots(),
      keywords: this.keywords(),
      author: this.author(),
      image: this.image(),
      favicon: this.favicon(),
      openGraph: this.open_graph(),
      twitterCard: this.twitter_card(),
      headings: { h1: this.h1(), h2: this.h2(), h3: this.h3() },
      linkCount: (this.links() || []).length,
      imageCount: (this.images() || []).length
    };
  }

  /**
   * Create and load a NodeScraper in one call.
   *
   * @param {string} url
   * @param {NodeScraperOptions} [options]
   * @returns {Promise<NodeScraper>}
   */
  static async scrape(url, options) {
    const instance = new NodeScraper(url, options);
    await instance.init();
    return instance;
  }

  /**
   * Scrape multiple URLs concurrently.
   *
   * With `throwOnError: true` the returned promise rejects as soon as any
   * single scrape fails (`Promise.all` semantics).
   *
   * @param {string[]} urls
   * @param {NodeScraperOptions} [options]
   * @returns {Promise<NodeScraper[]>}
   */
  static async scrapeAll(urls, options) {
    return Promise.all(urls.map((url) => NodeScraper.scrape(url, options)));
  }
}
533
+
534
+ module.exports = NodeScraper;
@@ -0,0 +1,46 @@
1
+ 'use strict';
2
+
3
/**
 * Default User-Agent header value: a Mozilla-compatible string that
 * identifies the scraper and links to its repository.
 * Many sites return 403/empty bodies for the default axios UA, so
 * NodeScraper sends this one unless the caller overrides it.
 */
const DEFAULT_USER_AGENT =
  'Mozilla/5.0 (compatible; NodeScraper/1.1; +https://github.com/ioodev/nodescraper)';

/** Default request timeout, in milliseconds. */
const DEFAULT_TIMEOUT = 10000;

/** Default maximum number of redirects axios will follow. */
const DEFAULT_MAX_REDIRECTS = 5;

// The arrays below are shared by every consumer of this module (and are
// handed out as option defaults), so they are frozen to stop one caller's
// mutation from silently changing the defaults for everyone else.

/** Protocols allowed by default when validating a target URL. */
const ALLOWED_PROTOCOLS = Object.freeze(['http:', 'https:']);

/** Open Graph properties read by `open_graph()` when called with no argument. */
const OPEN_GRAPH_PROPERTIES = Object.freeze([
  'og:site_name',
  'og:type',
  'og:title',
  'og:description',
  'og:url',
  'og:image'
]);

/** Twitter Card properties read by `twitter_card()` when called with no argument. */
const TWITTER_CARD_PROPERTIES = Object.freeze([
  'twitter:card',
  'twitter:title',
  'twitter:description',
  'twitter:url',
  'twitter:image'
]);
38
+
39
+ module.exports = {
40
+ DEFAULT_USER_AGENT,
41
+ DEFAULT_TIMEOUT,
42
+ DEFAULT_MAX_REDIRECTS,
43
+ ALLOWED_PROTOCOLS,
44
+ OPEN_GRAPH_PROPERTIES,
45
+ TWITTER_CARD_PROPERTIES
46
+ };
package/src/utils.js ADDED
@@ -0,0 +1,105 @@
1
+ 'use strict';
2
+
3
+ const { URL } = require('url');
4
+ const { ALLOWED_PROTOCOLS } = require('./constants');
5
+
6
/**
 * Report whether `url` parses as a URL and uses one of `allowedProtocols`.
 *
 * Rejecting other schemes (e.g. `file:`) here lets callers fail fast with
 * a clear error rather than attempting a request that cannot succeed.
 * Any failure while parsing or checking the protocol yields `false`.
 *
 * @param {string} url
 * @param {string[]} [allowedProtocols] Protocols in `URL#protocol` form, e.g. `'https:'`.
 * @returns {boolean}
 */
function isValidUrl(url, allowedProtocols = ALLOWED_PROTOCOLS) {
  const isNonBlankString = typeof url === 'string' && url.trim() !== '';
  if (!isNonBlankString) {
    return false;
  }

  try {
    const { protocol } = new URL(url);
    return allowedProtocols.includes(protocol);
  } catch {
    return false;
  }
}
25
+
26
/**
 * Turn a comma-separated meta tag value (e.g. `keywords`, `viewport`)
 * into an array of trimmed, non-empty entries.
 *
 * @param {string|null|undefined} value
 * @returns {string[]|null} The entries, or `null` when `value` is not a
 *   string or contains no non-empty entry.
 */
function splitList(value) {
  if (typeof value !== 'string' || value === '') {
    return null;
  }

  const entries = [];
  for (const piece of value.split(',')) {
    const trimmed = piece.trim();
    if (trimmed !== '') {
      entries.push(trimmed);
    }
  }

  return entries.length === 0 ? null : entries;
}
41
+
42
/**
 * Break a `rel` attribute into its whitespace-separated tokens.
 * Empty tokens are discarded, so a missing or blank `rel` yields `[]`
 * rather than `['']`.
 *
 * @param {string|null|undefined} value
 * @returns {string[]}
 */
function splitRel(value) {
  if (typeof value !== 'string') {
    return [];
  }

  const tokens = [];
  for (const token of value.split(/\s+/)) {
    // Leading/trailing whitespace produces empty tokens at the edges.
    if (token !== '') {
      tokens.push(token);
    }
  }
  return tokens;
}
56
+
57
/**
 * Trim `text` and squeeze every run of whitespace down to one space,
 * which makes raw `.text()` output readable.
 *
 * @param {string|null|undefined} text
 * @returns {string|null} The cleaned string, or `null` when `text` is not
 *   a string or nothing but whitespace remains.
 */
function normalizeWhitespace(text) {
  if (typeof text !== 'string') {
    return null;
  }

  // After trimming, splitting on whitespace runs leaves only the words.
  const collapsed = text.trim().split(/\s+/).join(' ');
  return collapsed === '' ? null : collapsed;
}
69
+
70
/**
 * Resolve a possibly-relative URL against a base URL, without throwing.
 *
 * A missing base (`undefined`, `null` or `''`) is treated as "no base":
 * an absolute `value` still resolves to itself, while a relative `value`
 * cannot be resolved and yields `null`.
 *
 * @param {string|null|undefined} value URL to resolve.
 * @param {string|null|undefined} base Base URL for relative values.
 * @returns {string|null} The absolute URL, or `null` when `value` is
 *   missing/not a string or the combination cannot be parsed.
 */
function resolveUrl(value, base) {
  if (!value || typeof value !== 'string') return null;
  try {
    // `new URL` stringifies any base other than `undefined`, so a `null`
    // base would become the invalid URL "null" and reject even an
    // absolute `value`. Pass `undefined` for every missing base instead.
    return new URL(value, base || undefined).toString();
  } catch {
    return null;
  }
}
86
+
87
/**
 * Escape a value so it can be safely interpolated into a double-quoted
 * cheerio/CSS attribute selector, e.g. `meta[name="${escaped}"]`.
 *
 * Both `"` and `\` are prefixed with a backslash. Escaping only the quote
 * is not enough: a trailing `\` would escape the selector's closing quote,
 * and an input `\"` would become `\\"`, ending the string early.
 *
 * @param {string} value Coerced with `String()` if not already a string.
 * @returns {string}
 */
function escapeAttributeValue(value) {
  return String(value).replace(/[\\"]/g, '\\$&');
}
97
+
98
+ module.exports = {
99
+ isValidUrl,
100
+ splitList,
101
+ splitRel,
102
+ normalizeWhitespace,
103
+ resolveUrl,
104
+ escapeAttributeValue
105
+ };