apify 2.3.1-beta.4 → 3.0.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -5
- package/package.json +69 -128
- package/build/actor.d.ts +0 -113
- package/build/actor.d.ts.map +0 -1
- package/build/actor.js +0 -582
- package/build/actor.js.map +0 -1
- package/build/apify.d.ts +0 -752
- package/build/apify.d.ts.map +0 -1
- package/build/apify.js +0 -877
- package/build/apify.js.map +0 -1
- package/build/autoscaling/autoscaled_pool.d.ts +0 -384
- package/build/autoscaling/autoscaled_pool.d.ts.map +0 -1
- package/build/autoscaling/autoscaled_pool.js +0 -557
- package/build/autoscaling/autoscaled_pool.js.map +0 -1
- package/build/autoscaling/snapshotter.d.ts +0 -278
- package/build/autoscaling/snapshotter.d.ts.map +0 -1
- package/build/autoscaling/snapshotter.js +0 -447
- package/build/autoscaling/snapshotter.js.map +0 -1
- package/build/autoscaling/system_status.d.ts +0 -224
- package/build/autoscaling/system_status.d.ts.map +0 -1
- package/build/autoscaling/system_status.js +0 -228
- package/build/autoscaling/system_status.js.map +0 -1
- package/build/browser_launchers/browser_launcher.d.ts +0 -154
- package/build/browser_launchers/browser_launcher.d.ts.map +0 -1
- package/build/browser_launchers/browser_launcher.js +0 -160
- package/build/browser_launchers/browser_launcher.js.map +0 -1
- package/build/browser_launchers/browser_plugin.d.ts +0 -23
- package/build/browser_launchers/browser_plugin.d.ts.map +0 -1
- package/build/browser_launchers/browser_plugin.js +0 -25
- package/build/browser_launchers/browser_plugin.js.map +0 -1
- package/build/browser_launchers/playwright_launcher.d.ts +0 -131
- package/build/browser_launchers/playwright_launcher.d.ts.map +0 -1
- package/build/browser_launchers/playwright_launcher.js +0 -150
- package/build/browser_launchers/playwright_launcher.js.map +0 -1
- package/build/browser_launchers/puppeteer_launcher.d.ts +0 -153
- package/build/browser_launchers/puppeteer_launcher.d.ts.map +0 -1
- package/build/browser_launchers/puppeteer_launcher.js +0 -197
- package/build/browser_launchers/puppeteer_launcher.js.map +0 -1
- package/build/cache_container.d.ts +0 -31
- package/build/cache_container.d.ts.map +0 -1
- package/build/cache_container.js +0 -48
- package/build/cache_container.js.map +0 -1
- package/build/configuration.d.ts +0 -226
- package/build/configuration.d.ts.map +0 -1
- package/build/configuration.js +0 -325
- package/build/configuration.js.map +0 -1
- package/build/constants.d.ts +0 -37
- package/build/constants.d.ts.map +0 -1
- package/build/constants.js +0 -41
- package/build/constants.js.map +0 -1
- package/build/crawlers/basic_crawler.d.ts +0 -443
- package/build/crawlers/basic_crawler.d.ts.map +0 -1
- package/build/crawlers/basic_crawler.js +0 -664
- package/build/crawlers/basic_crawler.js.map +0 -1
- package/build/crawlers/browser_crawler.d.ts +0 -512
- package/build/crawlers/browser_crawler.d.ts.map +0 -1
- package/build/crawlers/browser_crawler.js +0 -540
- package/build/crawlers/browser_crawler.js.map +0 -1
- package/build/crawlers/cheerio_crawler.d.ts +0 -931
- package/build/crawlers/cheerio_crawler.d.ts.map +0 -1
- package/build/crawlers/cheerio_crawler.js +0 -913
- package/build/crawlers/cheerio_crawler.js.map +0 -1
- package/build/crawlers/crawler_extension.d.ts +0 -10
- package/build/crawlers/crawler_extension.d.ts.map +0 -1
- package/build/crawlers/crawler_extension.js +0 -19
- package/build/crawlers/crawler_extension.js.map +0 -1
- package/build/crawlers/crawler_utils.d.ts +0 -34
- package/build/crawlers/crawler_utils.d.ts.map +0 -1
- package/build/crawlers/crawler_utils.js +0 -87
- package/build/crawlers/crawler_utils.js.map +0 -1
- package/build/crawlers/playwright_crawler.d.ts +0 -448
- package/build/crawlers/playwright_crawler.d.ts.map +0 -1
- package/build/crawlers/playwright_crawler.js +0 -299
- package/build/crawlers/playwright_crawler.js.map +0 -1
- package/build/crawlers/puppeteer_crawler.d.ts +0 -425
- package/build/crawlers/puppeteer_crawler.d.ts.map +0 -1
- package/build/crawlers/puppeteer_crawler.js +0 -299
- package/build/crawlers/puppeteer_crawler.js.map +0 -1
- package/build/crawlers/statistics.d.ts +0 -185
- package/build/crawlers/statistics.d.ts.map +0 -1
- package/build/crawlers/statistics.js +0 -331
- package/build/crawlers/statistics.js.map +0 -1
- package/build/enqueue_links/click_elements.d.ts +0 -179
- package/build/enqueue_links/click_elements.d.ts.map +0 -1
- package/build/enqueue_links/click_elements.js +0 -434
- package/build/enqueue_links/click_elements.js.map +0 -1
- package/build/enqueue_links/enqueue_links.d.ts +0 -117
- package/build/enqueue_links/enqueue_links.d.ts.map +0 -1
- package/build/enqueue_links/enqueue_links.js +0 -163
- package/build/enqueue_links/enqueue_links.js.map +0 -1
- package/build/enqueue_links/shared.d.ts +0 -42
- package/build/enqueue_links/shared.d.ts.map +0 -1
- package/build/enqueue_links/shared.js +0 -121
- package/build/enqueue_links/shared.js.map +0 -1
- package/build/errors.d.ts +0 -29
- package/build/errors.d.ts.map +0 -1
- package/build/errors.js +0 -38
- package/build/errors.js.map +0 -1
- package/build/events.d.ts +0 -11
- package/build/events.d.ts.map +0 -1
- package/build/events.js +0 -147
- package/build/events.js.map +0 -1
- package/build/index.d.ts +0 -4
- package/build/index.d.ts.map +0 -1
- package/build/index.js +0 -7
- package/build/index.js.map +0 -1
- package/build/main.d.ts +0 -179
- package/build/main.d.ts.map +0 -1
- package/build/main.js +0 -81
- package/build/main.js.map +0 -1
- package/build/playwright_utils.d.ts +0 -9
- package/build/playwright_utils.d.ts.map +0 -1
- package/build/playwright_utils.js +0 -90
- package/build/playwright_utils.js.map +0 -1
- package/build/proxy_configuration.d.ts +0 -411
- package/build/proxy_configuration.d.ts.map +0 -1
- package/build/proxy_configuration.js +0 -517
- package/build/proxy_configuration.js.map +0 -1
- package/build/pseudo_url.d.ts +0 -86
- package/build/pseudo_url.d.ts.map +0 -1
- package/build/pseudo_url.js +0 -153
- package/build/pseudo_url.js.map +0 -1
- package/build/puppeteer_request_interception.d.ts +0 -8
- package/build/puppeteer_request_interception.d.ts.map +0 -1
- package/build/puppeteer_request_interception.js +0 -235
- package/build/puppeteer_request_interception.js.map +0 -1
- package/build/puppeteer_utils.d.ts +0 -250
- package/build/puppeteer_utils.d.ts.map +0 -1
- package/build/puppeteer_utils.js +0 -551
- package/build/puppeteer_utils.js.map +0 -1
- package/build/request.d.ts +0 -180
- package/build/request.d.ts.map +0 -1
- package/build/request.js +0 -261
- package/build/request.js.map +0 -1
- package/build/request_list.d.ts +0 -581
- package/build/request_list.d.ts.map +0 -1
- package/build/request_list.js +0 -826
- package/build/request_list.js.map +0 -1
- package/build/serialization.d.ts +0 -5
- package/build/serialization.d.ts.map +0 -1
- package/build/serialization.js +0 -139
- package/build/serialization.js.map +0 -1
- package/build/session_pool/errors.d.ts +0 -11
- package/build/session_pool/errors.d.ts.map +0 -1
- package/build/session_pool/errors.js +0 -18
- package/build/session_pool/errors.js.map +0 -1
- package/build/session_pool/events.d.ts +0 -5
- package/build/session_pool/events.d.ts.map +0 -1
- package/build/session_pool/events.js +0 -6
- package/build/session_pool/events.js.map +0 -1
- package/build/session_pool/session.d.ts +0 -286
- package/build/session_pool/session.d.ts.map +0 -1
- package/build/session_pool/session.js +0 -355
- package/build/session_pool/session.js.map +0 -1
- package/build/session_pool/session_pool.d.ts +0 -280
- package/build/session_pool/session_pool.d.ts.map +0 -1
- package/build/session_pool/session_pool.js +0 -393
- package/build/session_pool/session_pool.js.map +0 -1
- package/build/session_pool/session_utils.d.ts +0 -4
- package/build/session_pool/session_utils.d.ts.map +0 -1
- package/build/session_pool/session_utils.js +0 -24
- package/build/session_pool/session_utils.js.map +0 -1
- package/build/stealth/hiding_tricks.d.ts +0 -22
- package/build/stealth/hiding_tricks.d.ts.map +0 -1
- package/build/stealth/hiding_tricks.js +0 -308
- package/build/stealth/hiding_tricks.js.map +0 -1
- package/build/stealth/stealth.d.ts +0 -56
- package/build/stealth/stealth.d.ts.map +0 -1
- package/build/stealth/stealth.js +0 -125
- package/build/stealth/stealth.js.map +0 -1
- package/build/storages/dataset.d.ts +0 -288
- package/build/storages/dataset.d.ts.map +0 -1
- package/build/storages/dataset.js +0 -480
- package/build/storages/dataset.js.map +0 -1
- package/build/storages/key_value_store.d.ts +0 -243
- package/build/storages/key_value_store.d.ts.map +0 -1
- package/build/storages/key_value_store.js +0 -462
- package/build/storages/key_value_store.js.map +0 -1
- package/build/storages/request_queue.d.ts +0 -318
- package/build/storages/request_queue.d.ts.map +0 -1
- package/build/storages/request_queue.js +0 -636
- package/build/storages/request_queue.js.map +0 -1
- package/build/storages/storage_manager.d.ts +0 -87
- package/build/storages/storage_manager.d.ts.map +0 -1
- package/build/storages/storage_manager.js +0 -150
- package/build/storages/storage_manager.js.map +0 -1
- package/build/tsconfig.tsbuildinfo +0 -1
- package/build/typedefs.d.ts +0 -146
- package/build/typedefs.d.ts.map +0 -1
- package/build/typedefs.js +0 -88
- package/build/typedefs.js.map +0 -1
- package/build/utils.d.ts +0 -175
- package/build/utils.d.ts.map +0 -1
- package/build/utils.js +0 -731
- package/build/utils.js.map +0 -1
- package/build/utils_log.d.ts +0 -41
- package/build/utils_log.d.ts.map +0 -1
- package/build/utils_log.js +0 -192
- package/build/utils_log.js.map +0 -1
- package/build/utils_request.d.ts +0 -77
- package/build/utils_request.d.ts.map +0 -1
- package/build/utils_request.js +0 -385
- package/build/utils_request.js.map +0 -1
- package/build/utils_social.d.ts +0 -210
- package/build/utils_social.d.ts.map +0 -1
- package/build/utils_social.js +0 -787
- package/build/utils_social.js.map +0 -1
- package/build/validators.d.ts +0 -23
- package/build/validators.d.ts.map +0 -1
- package/build/validators.js +0 -29
- package/build/validators.js.map +0 -1
package/build/utils_social.js
DELETED
|
@@ -1,787 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.socialUtils = void 0;
|
|
4
|
-
const tslib_1 = require("tslib");
|
|
5
|
-
/* eslint-disable no-continue */
|
|
6
|
-
const underscore_1 = (0, tslib_1.__importDefault)(require("underscore"));
|
|
7
|
-
const cheerio_1 = (0, tslib_1.__importDefault)(require("cheerio"));
|
|
8
|
-
const utils_log_1 = (0, tslib_1.__importDefault)(require("./utils_log"));
|
|
9
|
-
const utils_1 = require("./utils");
|
|
10
|
-
// Regex inspired by https://zapier.com/blog/extract-links-email-phone-regex/
|
|
11
|
-
// eslint-disable-next-line max-len
|
|
12
|
-
const EMAIL_REGEX_STRING = '(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\\])';
|
|
13
|
-
/**
|
|
14
|
-
* Regular expression to exactly match a single email address.
|
|
15
|
-
* It has the following form: `/^...$/i`.
|
|
16
|
-
* @type {RegExp}
|
|
17
|
-
* @memberOf social
|
|
18
|
-
*/
|
|
19
|
-
const EMAIL_REGEX = new RegExp(`^${EMAIL_REGEX_STRING}$`, 'i');
|
|
20
|
-
/**
|
|
21
|
-
* Regular expression to find multiple email addresses in a text.
|
|
22
|
-
* It has the following form: `/.../ig`.
|
|
23
|
-
* @type {RegExp}
|
|
24
|
-
* @memberOf social
|
|
25
|
-
*/
|
|
26
|
-
const EMAIL_REGEX_GLOBAL = new RegExp(EMAIL_REGEX_STRING, 'ig');
|
|
27
|
-
const EMAIL_URL_PREFIX_REGEX = /^mailto:/i;
|
|
28
|
-
/**
 * Finds every email address contained in a plain-text string.
 * Matches are returned in the order they occur and duplicates are kept.
 * @param {string} text Text to search in.
 * @return {string[]} Array of email addresses found.
 * If the input is not a string or contains no emails, an empty array is returned.
 * @memberOf social
 */
const emailsFromText = (text) => {
    const isTextual = underscore_1.default.isString(text);
    if (!isTextual) return [];
    // `match` with a global regex yields `null` when nothing is found.
    const matches = text.match(EMAIL_REGEX_GLOBAL);
    return matches === null ? [] : matches;
};
|
|
41
|
-
/**
 * Collects email addresses from a list of URLs.
 * Only `mailto:` URLs are considered; the prefix is stripped, the remainder is trimmed
 * and kept only if it is a valid email address.
 * The order of the emails is preserved and duplicates are kept.
 * @param {string[]} urls Array of URLs.
 * @return {string[]} Array of email addresses found.
 * If no emails are found, the function returns an empty array.
 * @throws {Error} If `urls` is not an array.
 * @memberOf social
 */
const emailsFromUrls = (urls) => {
    if (!Array.isArray(urls)) {
        throw new Error('The "urls" parameter must be an array');
    }
    const collected = [];
    urls.forEach((link) => {
        // Ignore empty entries and anything that is not a mailto: link.
        if (!link || !EMAIL_URL_PREFIX_REGEX.test(link)) return;
        const candidate = link.replace(EMAIL_URL_PREFIX_REGEX, '').trim();
        if (EMAIL_REGEX.test(candidate)) collected.push(candidate);
    });
    return collected;
};
|
|
65
|
-
// Supports URLs starting with `tel://`, `tel:/` and `tel:`, and similarly `phone`, `telephone` and `callto`
|
|
66
|
-
const PHONE_URL_PREFIX_REGEX = /^(tel|phone|telephone|callto):(\/)?(\/)?/i;
|
|
67
|
-
// It's pretty much impossible (and unmaintainable) to have just one large regular expression for all possible phone numbers.
|
|
68
|
-
// So here we define various regular expressions for typical phone number patterns, which are then used to compile
|
|
69
|
-
// a single large regular expression. Add more patterns as needed.
|
|
70
|
-
// NOTE: The patterns are tested in the order as written below, so the longer ones should be before the shorter ones!
|
|
71
|
-
// Source strings of the individual phone number patterns. They are joined with `|`
// into a single regular expression, so for any given start position the FIRST
// alternative that matches wins - longer patterns must precede their shorter prefixes.
const PHONE_REGEXS_STRINGS = [
    // 775123456
    '[0-9]{6,15}',
    // 1(262) 955-95-79 or 1(262)955.95.79
    // Three digit groups after the area code. Both separators are mandatory so that this
    // pattern never swallows unrelated digits following a two-group number, and it is listed
    // before the two-group variant below, which would otherwise cut the number short.
    '([0-9]{1,4}( )?)?\\([0-9]{2,4}\\)( )?[0-9]{2,4}( )?(-|\\.)( )?[0-9]{2,4}( )?(-|\\.)( )?[0-9]{2,6}',
    // 1(413)555-2378 or 1(413)555.2378 or 1 (413) 555-2378 or 1 (413) 555 2378 or (303) 494-2320
    // NOTE(review): the unescaped `.` in `(-|.)` matches ANY character, not just a dot.
    // Left untouched here to keep existing matches stable - confirm whether that is intended.
    '([0-9]{1,4}( )?)?\\([0-9]{2,4}\\)( )?[0-9]{2,4}(( )?(-|.))?( )?[0-9]{2,6}',
    // (51) 5667-9987 or (19)94138-9398
    '\\([0-9]{2}\\)( )?[0-9]{4,5}-[0-9]{4}',
    // 413-577-1234-564
    '[0-9]{2,4}-[0-9]{2,4}-[0-9]{2,4}-[0-9]{2,6}',
    // 413-577-1234
    '[0-9]{2,4}-[0-9]{2,4}-[0-9]{2,6}',
    // 413-577
    '[0-9]{2,4}-[0-9]{2,6}',
    // 413.577.1234.564
    '[0-9]{2,4}\\.[0-9]{2,4}\\.[0-9]{2,4}\\.[0-9]{2,6}',
    // 413.577.1234
    '[0-9]{2,4}\\.[0-9]{2,4}\\.[0-9]{2,6}',
    // 413.577
    '[0-9]{2,4}\\.[0-9]{2,6}',
    // 413 577 1234 564
    '[0-9]{2,4} [0-9]{2,4} [0-9]{2,4} [0-9]{2,6}',
    // 413 577 1234
    '[0-9]{2,4} [0-9]{2,4} [0-9]{2,6}',
    // 123 4567
    '[0-9]{2,4} [0-9]{3,8}',
];
// All phones might be prefixed with '+' or '00'
for (let i = 0; i < PHONE_REGEXS_STRINGS.length; i++) {
    PHONE_REGEXS_STRINGS[i] = `(00|\\+)?${PHONE_REGEXS_STRINGS[i]}`;
}
|
|
103
|
-
// The minimum number of digits a phone number can contain.
|
|
104
|
-
// That's because the PHONE_REGEXS_STRINGS patterns are quite wide and report a lot of false positives.
|
|
105
|
-
const PHONE_MIN_DIGITS = 7;
|
|
106
|
-
// These are patterns that might be matched by PHONE_REGEXS_STRINGS,
|
|
107
|
-
// but which are most likely not phone numbers. Add more patterns as needed.
|
|
108
|
-
const SKIP_PHONE_REGEXS = [
|
|
109
|
-
// 2018-11-10
|
|
110
|
-
'^[0-9]{4}-[0-9]{2}-[0-9]{2}$',
|
|
111
|
-
];
|
|
112
|
-
const PHONE_REGEX_GLOBAL = new RegExp(`(${PHONE_REGEXS_STRINGS.join('|')})`, 'ig');
|
|
113
|
-
const PHONE_REGEX = new RegExp(`^(${PHONE_REGEXS_STRINGS.join('|')})$`, 'i');
|
|
114
|
-
const SKIP_PHONE_REGEX = new RegExp(`^(${SKIP_PHONE_REGEXS.join('|')})$`, 'i');
|
|
115
|
-
/**
 * Tries to find phone numbers in a plain-text string. Phone numbers come in a wide variety
 * of formats and conventions, so the results might not be accurate.
 * If you encounter some problems, please [file an issue](https://github.com/apify/apify-js/issues).
 *
 * Candidates with too few digits, or matching one of the known
 * non-phone patterns, are discarded.
 * @param {string} text Text to search the phone numbers in.
 * @return {string[]} Array of phone numbers found.
 * If the input is not a string or no phone numbers are found, an empty array is returned.
 * @memberOf social
 */
const phonesFromText = (text) => {
    if (!underscore_1.default.isString(text)) return [];
    const candidates = text.match(PHONE_REGEX_GLOBAL) || [];
    return candidates.filter((candidate) => {
        if (!candidate) return false;
        // Too short candidates are most likely not phone numbers at all.
        const digitCount = candidate.match(/[0-9]/g).length;
        if (digitCount < PHONE_MIN_DIGITS) return false;
        // Drop look-alikes matching the skip patterns.
        return !SKIP_PHONE_REGEX.test(candidate);
    });
};
|
|
141
|
-
/**
 * Picks phone number links out of an array of URLs and returns the phone numbers they contain.
 * Phone number links look like `tel://123456789`, `tel:/123456789` or `tel:123456789`.
 * The link prefix is stripped, the remainder is trimmed and kept only if it
 * looks like a phone number.
 * @param {string[]} urls Array of URLs.
 * @return {string[]} Array of phone numbers found.
 * If no phone numbers are found, the function returns an empty array.
 * @throws {Error} If `urls` is not an array.
 * @memberOf social
 */
const phonesFromUrls = (urls) => {
    if (!Array.isArray(urls)) {
        throw new Error('The "urls" parameter must be an array');
    }
    const collected = [];
    urls.forEach((link) => {
        // Ignore empty entries and anything that is not a phone link.
        if (!link || !PHONE_URL_PREFIX_REGEX.test(link)) return;
        const candidate = link.replace(PHONE_URL_PREFIX_REGEX, '').trim();
        if (PHONE_REGEX.test(candidate)) collected.push(candidate);
    });
    return collected;
};
|
|
164
|
-
// NOTEs about the regular expressions
|
|
165
|
-
// - They have just a single matching group for the profile username, all other groups are non-matching
|
|
166
|
-
// - They use a negative lookbehind and lookahead assertions, which are only supported in Node 8+.
|
|
167
|
-
// They are used to prevent matching URLs in strings like "blahttps://www.example.com"
|
|
168
|
-
// eslint-disable-next-line max-len
|
|
169
|
-
const LINKEDIN_REGEX_STRING = '(?<!\\w)(?:(?:http(?:s)?:\\/\\/)?(?:(?:(?:[a-z]+\\.)?linkedin\\.com\\/(?:in|company)\\/)([a-z0-9\\-_%=]{2,60})(?![a-z0-9\\-_%=])))(?:\\/)?';
|
|
170
|
-
// eslint-disable-next-line max-len
|
|
171
|
-
const INSTAGRAM_REGEX_STRING = '(?<!\\w)(?:http(?:s)?:\\/\\/)?(?:(?:www\\.)?(?:instagram\\.com|instagr\\.am)\\/)(?!explore|_n|_u)([a-z0-9_.]{2,30})(?![a-z0-9_.])(?:/)?';
|
|
172
|
-
// First path segments on twitter.com that are site pages, not profile names.
const TWITTER_RESERVED_PATHS = 'oauth|account|tos|privacy|signup|home|hashtag|search|login|widgets|i|settings|start|share|intent|oct';
// Source of the Twitter profile URL pattern; the single capturing group is the username.
// The dots in the host name are escaped so that only a literal "." is accepted there
// (an unescaped dot would also match look-alikes such as "wwwxtwitterxcom").
// eslint-disable-next-line max-len, quotes
const TWITTER_REGEX_STRING = `(?<!\\w)(?:http(?:s)?:\\/\\/)?(?:www\\.)?(?:twitter\\.com)\\/(?!(?:${TWITTER_RESERVED_PATHS})(?:[\\'\\"\\?\\.\\/]|$))([a-z0-9_]{1,15})(?![a-z0-9_])(?:/)?`;
|
|
175
|
-
// First path segments on facebook.com that are site pages, not profile names.
// Dots are escaped so that e.g. only the literal "photo.php" is treated as reserved.
// eslint-disable-next-line max-len, quotes
const FACEBOOK_RESERVED_PATHS = 'rsrc\\.php|apps|groups|events|l\\.php|friends|images|photo\\.php|chat|ajax|dyi|common|policies|login|recover|reg|help|security|messages|marketplace|pages|live|bookmarks|games|fundraisers|saved|gaming|salesgroups|jobs|people|ads|ad_campaign|weather|offers|recommendations|crisisresponse|onthisday|developers|settings|connect|business|plugins|intern|sharer';
// Source of the Facebook profile URL pattern; the single capturing group is either
// the profile name or the `profile.php?id=...` form.
// The dots in the host names are escaped so that only a literal "." is accepted there.
// eslint-disable-next-line max-len, quotes
const FACEBOOK_REGEX_STRING = `(?<!\\w)(?:http(?:s)?:\\/\\/)?(?:www\\.)?(?:facebook\\.com|fb\\.com)\\/(?!(?:${FACEBOOK_RESERVED_PATHS})(?:[\\'\\"\\?\\.\\/]|$))(profile\\.php\\?id\\=[0-9]{3,20}|(?!profile\\.php)[a-z0-9\\.]{5,51})(?![a-z0-9\\.])(?:/)?`;
|
|
179
|
-
// eslint-disable-next-line max-len, quotes
|
|
180
|
-
const YOUTUBE_REGEX_STRING = '(?<!\\w)(?:https?:\\/\\/)?(?:youtu\\.be\\/|(?:www\\.|m\\.)?youtube\\.com(?:\\/(?:watch|v|embed|user|c(?:hannel)?)(?:\\.php)?)?(?:\\?[^ ]*v=|\\/))([a-zA-Z0-9\\-_]{2,100})';
|
|
181
|
-
// eslint-disable-next-line max-len, quotes
|
|
182
|
-
const TIKTOK_REGEX_STRING = '(?<!\\w)(?:http(?:s)?:\\/\\/)?(?:(?:www|m)\\.)?(?:tiktok\\.com)\\/(((?:(?:v|embed|trending)(?:\\?shareId=|\\/))[0-9]{2,50}(?![0-9]))|(?:@)[a-z0-9\\-_\\.]+((?:\\/video\\/)[0-9]{2,50}(?![0-9]))?)(?:\\/)?';
|
|
183
|
-
// eslint-disable-next-line max-len, quotes
|
|
184
|
-
const PINTEREST_REGEX_STRING = '(?<!\\w)(?:http(?:s)?:\\/\\/)?(?:(?:(?:(?:www\\.)?pinterest(?:\\.com|(?:\\.[a-z]{2}){1,2}))|(?:[a-z]{2})\\.pinterest\\.com)(?:\\/))((pin\\/[0-9]{2,50})|((?!pin)[a-z0-9\\-_\\.]+(\\/[a-z0-9\\-_\\.]+)?))(?:\\/)?';
|
|
185
|
-
// Source of the Discord URL pattern (channel links and invite links).
// The dot after the optional "canary"/"ptb" subdomain is escaped so that only a
// literal "." is accepted there (an unescaped dot would also match "canaryxdiscord.gg").
// eslint-disable-next-line max-len, quotes
const DISCORD_REGEX_STRING = '(?<!\\w)(?:https?:\\/\\/)?(?:www\\.)?((?:(?:(?:canary|ptb)\\.)?(?:discord|discordapp)\\.com\\/channels(?:\\/)[0-9]{2,50}(\\/[0-9]{2,50})*)|(?:(?:(?:canary|ptb)\\.)?(?:discord\\.(?:com|me|li|gg|io)|discordapp\\.com)(?:\\/invite)?)\\/(?!channels)[a-z0-9\\-_]{2,50})(?:\\/)?';
|
|
187
|
-
/** @type RegExp */
|
|
188
|
-
let LINKEDIN_REGEX;
|
|
189
|
-
/** @type RegExp */
|
|
190
|
-
let LINKEDIN_REGEX_GLOBAL;
|
|
191
|
-
/** @type RegExp */
|
|
192
|
-
let INSTAGRAM_REGEX;
|
|
193
|
-
/** @type RegExp */
|
|
194
|
-
let INSTAGRAM_REGEX_GLOBAL;
|
|
195
|
-
/** @type RegExp */
|
|
196
|
-
let TWITTER_REGEX;
|
|
197
|
-
/** @type RegExp */
|
|
198
|
-
let TWITTER_REGEX_GLOBAL;
|
|
199
|
-
/** @type RegExp */
|
|
200
|
-
let FACEBOOK_REGEX;
|
|
201
|
-
/** @type RegExp */
|
|
202
|
-
let FACEBOOK_REGEX_GLOBAL;
|
|
203
|
-
/** @type RegExp */
|
|
204
|
-
let YOUTUBE_REGEX;
|
|
205
|
-
/** @type RegExp */
|
|
206
|
-
let YOUTUBE_REGEX_GLOBAL;
|
|
207
|
-
/** @type RegExp */
|
|
208
|
-
let TIKTOK_REGEX;
|
|
209
|
-
/** @type RegExp */
|
|
210
|
-
let TIKTOK_REGEX_GLOBAL;
|
|
211
|
-
/** @type RegExp */
|
|
212
|
-
let PINTEREST_REGEX;
|
|
213
|
-
/** @type RegExp */
|
|
214
|
-
let PINTEREST_REGEX_GLOBAL;
|
|
215
|
-
/** @type RegExp */
let DISCORD_REGEX;
|
|
216
|
-
/** @type RegExp */
|
|
217
|
-
let DISCORD_REGEX_GLOBAL;
|
|
218
|
-
try {
|
|
219
|
-
/**
|
|
220
|
-
* Regular expression to exactly match a single LinkedIn profile URL.
|
|
221
|
-
* It has the following form: `/^...$/i` and matches URLs such as:
|
|
222
|
-
* ```
|
|
223
|
-
* https://www.linkedin.com/in/alan-turing
|
|
224
|
-
* en.linkedin.com/in/alan-turing
|
|
225
|
-
* linkedin.com/in/alan-turing
|
|
226
|
-
* https://www.linkedin.com/company/linkedin/
|
|
227
|
-
* ```
|
|
228
|
-
*
|
|
229
|
-
* The regular expression does NOT match URLs with additional
|
|
230
|
-
* subdirectories or query parameters, such as:
|
|
231
|
-
* ```
|
|
232
|
-
* https://www.linkedin.com/in/linus-torvalds/latest-activity
|
|
233
|
-
* ```
|
|
234
|
-
*
|
|
235
|
-
* Example usage:
|
|
236
|
-
* ```
|
|
237
|
-
* if (Apify.utils.social.LINKEDIN_REGEX.test('https://www.linkedin.com/in/alan-turing')) {
|
|
238
|
-
* console.log('Match!');
|
|
239
|
-
* }
|
|
240
|
-
* ```
|
|
241
|
-
* @type {RegExp}
|
|
242
|
-
* @memberOf social
|
|
243
|
-
*/
|
|
244
|
-
LINKEDIN_REGEX = new RegExp(`^${LINKEDIN_REGEX_STRING}$`, 'i');
|
|
245
|
-
/**
|
|
246
|
-
* Regular expression to find multiple LinkedIn profile URLs in a text or HTML.
|
|
247
|
-
* It has the following form: `/.../ig` and matches URLs such as:
|
|
248
|
-
* ```
|
|
249
|
-
* https://www.linkedin.com/in/alan-turing
|
|
250
|
-
* en.linkedin.com/in/alan-turing
|
|
251
|
-
* linkedin.com/in/alan-turing
|
|
252
|
-
* https://www.linkedin.com/company/linkedin/
|
|
253
|
-
* ```
|
|
254
|
-
*
|
|
255
|
-
* If the profile URL contains subdirectories or query parameters, the regular expression
|
|
256
|
-
* extracts just the base part of the profile URL. For example, from text such as:
|
|
257
|
-
* ```
|
|
258
|
-
* https://www.linkedin.com/in/linus-torvalds/latest-activity
|
|
259
|
-
* ```
|
|
260
|
-
* the expression extracts just the following base URL:
|
|
261
|
-
* ```
|
|
262
|
-
* https://www.linkedin.com/in/linus-torvalds
|
|
263
|
-
* ```
|
|
264
|
-
*
|
|
265
|
-
* Example usage:
|
|
266
|
-
* ```
|
|
267
|
-
* const matches = text.match(Apify.utils.social.LINKEDIN_REGEX_GLOBAL);
|
|
268
|
-
* if (matches) console.log(`${matches.length} LinkedIn profiles found!`);
|
|
269
|
-
* ```
|
|
270
|
-
* @type {RegExp}
|
|
271
|
-
* @memberOf social
|
|
272
|
-
*/
|
|
273
|
-
LINKEDIN_REGEX_GLOBAL = new RegExp(LINKEDIN_REGEX_STRING, 'ig');
|
|
274
|
-
/**
|
|
275
|
-
* Regular expression to exactly match a single Instagram profile URL.
|
|
276
|
-
* It has the following form: `/^...$/i` and matches URLs such as:
|
|
277
|
-
* ```
|
|
278
|
-
* https://www.instagram.com/old_prague
|
|
279
|
-
* www.instagram.com/old_prague/
|
|
280
|
-
* instagr.am/old_prague
|
|
281
|
-
* ```
|
|
282
|
-
*
|
|
283
|
-
* The regular expression does NOT match URLs with additional
|
|
284
|
-
* subdirectories or query parameters, such as:
|
|
285
|
-
* ```
|
|
286
|
-
* https://www.instagram.com/cristiano/followers
|
|
287
|
-
* ```
|
|
288
|
-
*
|
|
289
|
-
* It also does NOT match the following URLs:
|
|
290
|
-
* ```
|
|
291
|
-
* https://www.instagram.com/explore/
|
|
292
|
-
* https://www.instagram.com/_n/
|
|
293
|
-
* https://www.instagram.com/_u/
|
|
294
|
-
* ```
|
|
295
|
-
*
|
|
296
|
-
* Example usage:
|
|
297
|
-
* ```
|
|
298
|
-
* if (Apify.utils.social.INSTAGRAM_REGEX.test('https://www.instagram.com/old_prague')) {
|
|
299
|
-
* console.log('Match!');
|
|
300
|
-
* }
|
|
301
|
-
* ```
|
|
302
|
-
* @type {RegExp}
|
|
303
|
-
* @memberOf social
|
|
304
|
-
*/
|
|
305
|
-
INSTAGRAM_REGEX = new RegExp(`^${INSTAGRAM_REGEX_STRING}$`, 'i');
|
|
306
|
-
/**
|
|
307
|
-
* Regular expression to find multiple Instagram profile URLs in a text or HTML.
|
|
308
|
-
* It has the following form: `/.../ig` and matches URLs such as:
|
|
309
|
-
* ```
|
|
310
|
-
* https://www.instagram.com/old_prague
|
|
311
|
-
* www.instagram.com/old_prague/
|
|
312
|
-
* instagr.am/old_prague
|
|
313
|
-
* ```
|
|
314
|
-
*
|
|
315
|
-
* If the profile URL contains subdirectories or query parameters, the regular expression
|
|
316
|
-
* extracts just the base part of the profile URL. For example, from text such as:
|
|
317
|
-
* ```
|
|
318
|
-
* https://www.instagram.com/cristiano/followers
|
|
319
|
-
* ```
|
|
320
|
-
* the expression extracts just the following base URL:
|
|
321
|
-
* ```
|
|
322
|
-
* https://www.instagram.com/cristiano
|
|
323
|
-
* ```
|
|
324
|
-
*
|
|
325
|
-
* The regular expression does NOT match the following URLs:
|
|
326
|
-
* ```
|
|
327
|
-
* https://www.instagram.com/explore/
|
|
328
|
-
* https://www.instagram.com/_n/
|
|
329
|
-
* https://www.instagram.com/_u/
|
|
330
|
-
* ```
|
|
331
|
-
*
|
|
332
|
-
* Example usage:
|
|
333
|
-
* ```
|
|
334
|
-
* const matches = text.match(Apify.utils.social.INSTAGRAM_REGEX_GLOBAL);
|
|
335
|
-
* if (matches) console.log(`${matches.length} Instagram profiles found!`);
|
|
336
|
-
* ```
|
|
337
|
-
* @type {RegExp}
|
|
338
|
-
* @memberOf social
|
|
339
|
-
*/
|
|
340
|
-
INSTAGRAM_REGEX_GLOBAL = new RegExp(INSTAGRAM_REGEX_STRING, 'ig');
|
|
341
|
-
/**
|
|
342
|
-
* Regular expression to exactly match a single Twitter profile URL.
|
|
343
|
-
* It has the following form: `/^...$/i` and matches URLs such as:
|
|
344
|
-
* ```
|
|
345
|
-
* https://www.twitter.com/apify
|
|
346
|
-
* twitter.com/apify
|
|
347
|
-
* ```
|
|
348
|
-
*
|
|
349
|
-
* The regular expression does NOT match URLs with additional
|
|
350
|
-
* subdirectories or query parameters, such as:
|
|
351
|
-
* ```
|
|
352
|
-
* https://www.twitter.com/realdonaldtrump/following
|
|
353
|
-
* ```
|
|
354
|
-
*
|
|
355
|
-
* Example usage:
|
|
356
|
-
* ```
|
|
357
|
-
* if (Apify.utils.social.TWITTER_REGEX.test('https://www.twitter.com/apify')) {
|
|
358
|
-
* console.log('Match!');
|
|
359
|
-
* }
|
|
360
|
-
* ```
|
|
361
|
-
* @type {RegExp}
|
|
362
|
-
* @memberOf social
|
|
363
|
-
*/
|
|
364
|
-
TWITTER_REGEX = new RegExp(`^${TWITTER_REGEX_STRING}$`, 'i');
|
|
365
|
-
/**
|
|
366
|
-
* Regular expression to find multiple Twitter profile URLs in a text or HTML.
|
|
367
|
-
* It has the following form: `/.../ig` and matches URLs such as:
|
|
368
|
-
* ```
|
|
369
|
-
* https://www.twitter.com/apify
|
|
370
|
-
* twitter.com/apify
|
|
371
|
-
* ```
|
|
372
|
-
*
|
|
373
|
-
* If the profile URL contains subdirectories or query parameters, the regular expression
|
|
374
|
-
* extracts just the base part of the profile URL. For example, from text such as:
|
|
375
|
-
* ```
|
|
376
|
-
* https://www.twitter.com/realdonaldtrump/following
|
|
377
|
-
* ```
|
|
378
|
-
* the expression extracts only the following base URL:
|
|
379
|
-
* ```
|
|
380
|
-
* https://www.twitter.com/realdonaldtrump
|
|
381
|
-
* ```
|
|
382
|
-
*
|
|
383
|
-
* Example usage:
|
|
384
|
-
* ```
|
|
385
|
-
* const matches = text.match(Apify.utils.social.TWITTER_REGEX_GLOBAL);
|
|
386
|
-
* if (matches) console.log(`${matches.length} Twitter profiles found!`);
|
|
387
|
-
* ```
|
|
388
|
-
* @type {RegExp}
|
|
389
|
-
* @memberOf social
|
|
390
|
-
*/
|
|
391
|
-
TWITTER_REGEX_GLOBAL = new RegExp(TWITTER_REGEX_STRING, 'ig');
|
|
392
|
-
/**
|
|
393
|
-
* Regular expression to exactly match a single Facebook profile URL.
|
|
394
|
-
* It has the following form: `/^...$/i` and matches URLs such as:
|
|
395
|
-
* ```
|
|
396
|
-
* https://www.facebook.com/apifytech
|
|
397
|
-
* facebook.com/apifytech
|
|
398
|
-
* fb.com/apifytech
|
|
399
|
-
* https://www.facebook.com/profile.php?id=123456789
|
|
400
|
-
* ```
|
|
401
|
-
*
|
|
402
|
-
* The regular expression does NOT match URLs with additional
|
|
403
|
-
* subdirectories or query parameters, such as:
|
|
404
|
-
* ```
|
|
405
|
-
* https://www.facebook.com/apifytech/photos
|
|
406
|
-
* ```
|
|
407
|
-
*
|
|
408
|
-
* Example usage:
|
|
409
|
-
* ```
|
|
410
|
-
* if (Apify.utils.social.FACEBOOK_REGEX.test('https://www.facebook.com/apifytech')) {
|
|
411
|
-
* console.log('Match!');
|
|
412
|
-
* }
|
|
413
|
-
* ```
|
|
414
|
-
* @type {RegExp}
|
|
415
|
-
* @memberOf social
|
|
416
|
-
*/
|
|
417
|
-
FACEBOOK_REGEX = new RegExp(`^${FACEBOOK_REGEX_STRING}$`, 'i');
|
|
418
|
-
/**
|
|
419
|
-
* Regular expression to find multiple Facebook profile URLs in a text or HTML.
|
|
420
|
-
* It has the following form: `/.../ig` and matches URLs such as:
|
|
421
|
-
* ```
|
|
422
|
-
* https://www.facebook.com/apifytech
|
|
423
|
-
* facebook.com/apifytech
|
|
424
|
-
* fb.com/apifytech
|
|
425
|
-
* ```
|
|
426
|
-
*
|
|
427
|
-
* If the profile URL contains subdirectories or query parameters, the regular expression
|
|
428
|
-
* extracts just the base part of the profile URL. For example, from text such as:
|
|
429
|
-
* ```
|
|
430
|
-
* https://www.facebook.com/apifytech/photos
|
|
431
|
-
* ```
|
|
432
|
-
* the expression extracts only the following base URL:
|
|
433
|
-
* ```
|
|
434
|
-
* https://www.facebook.com/apifytech
|
|
435
|
-
* ```
|
|
436
|
-
*
|
|
437
|
-
* Example usage:
|
|
438
|
-
* ```
|
|
439
|
-
* const matches = text.match(Apify.utils.social.FACEBOOK_REGEX_GLOBAL);
|
|
440
|
-
* if (matches) console.log(`${matches.length} Facebook profiles found!`);
|
|
441
|
-
* ```
|
|
442
|
-
* @type {RegExp}
|
|
443
|
-
* @memberOf social
|
|
444
|
-
*/
|
|
445
|
-
FACEBOOK_REGEX_GLOBAL = new RegExp(FACEBOOK_REGEX_STRING, 'ig');
|
|
446
|
-
/**
|
|
447
|
-
* Regular expression to exactly match a single Youtube channel, user or video URL.
|
|
448
|
-
* It has the following form: `/^...$/i` and matches URLs such as:
|
|
449
|
-
* ```
|
|
450
|
-
* https://www.youtube.com/watch?v=kM7YfhfkiEE
|
|
451
|
-
* https://youtu.be/kM7YfhfkiEE
|
|
452
|
-
* https://www.youtube.com/c/TrapNation
|
|
453
|
-
* https://www.youtube.com/channel/UCklie6BM0fhFvzWYqQVoCTA
|
|
454
|
-
* https://www.youtube.com/user/pewdiepie
|
|
455
|
-
* ```
|
|
456
|
-
*
|
|
457
|
-
* Please note that this won't match URLs like https://www.youtube.com/pewdiepie that redirect to /user or /channel.
|
|
458
|
-
*
|
|
459
|
-
* Example usage:
|
|
460
|
-
* ```
|
|
461
|
-
* if (Apify.utils.social.YOUTUBE_REGEX.test('https://www.youtube.com/watch?v=kM7YfhfkiEE')) {
|
|
462
|
-
* console.log('Match!');
|
|
463
|
-
* }
|
|
464
|
-
* ```
|
|
465
|
-
* @type {RegExp}
|
|
466
|
-
* @memberOf social
|
|
467
|
-
*/
|
|
468
|
-
YOUTUBE_REGEX = new RegExp(`^${YOUTUBE_REGEX_STRING}$`, 'i');
|
|
469
|
-
/**
|
|
470
|
-
* Regular expression to find multiple Youtube channel, user or video URLs in a text or HTML.
|
|
471
|
-
* It has the following form: `/.../ig` and matches URLs such as:
|
|
472
|
-
* ```
|
|
473
|
-
* https://www.youtube.com/watch?v=kM7YfhfkiEE
|
|
474
|
-
* https://youtu.be/kM7YfhfkiEE
|
|
475
|
-
* https://www.youtube.com/c/TrapNation
|
|
476
|
-
* https://www.youtube.com/channel/UCklie6BM0fhFvzWYqQVoCTA
|
|
477
|
-
* https://www.youtube.com/user/pewdiepie
|
|
478
|
-
* ```
|
|
479
|
-
*
|
|
480
|
-
* Please note that this won't match URLs like https://www.youtube.com/pewdiepie that redirect to /user or /channel.
|
|
481
|
-
*
|
|
482
|
-
* Example usage:
|
|
483
|
-
* ```
|
|
484
|
-
* const matches = text.match(Apify.utils.social.YOUTUBE_REGEX_GLOBAL);
|
|
485
|
-
* if (matches) console.log(`${matches.length} Youtube videos found!`);
|
|
486
|
-
* ```
|
|
487
|
-
* @type {RegExp}
|
|
488
|
-
* @memberOf social
|
|
489
|
-
*/
|
|
490
|
-
YOUTUBE_REGEX_GLOBAL = new RegExp(YOUTUBE_REGEX_STRING, 'ig');
|
|
491
|
-
/**
|
|
492
|
-
* Regular expression to exactly match a Tiktok video or user account.
|
|
493
|
-
* It has the following form: `/^...$/i` and matches URLs such as:
|
|
494
|
-
* ```
|
|
495
|
-
* https://www.tiktok.com/trending?shareId=123456789
|
|
496
|
-
* https://www.tiktok.com/embed/123456789
|
|
497
|
-
* https://m.tiktok.com/v/123456789
|
|
498
|
-
* https://www.tiktok.com/@user
|
|
499
|
-
* https://www.tiktok.com/@user-account.pro
|
|
500
|
-
* https://www.tiktok.com/@user/video/123456789
|
|
501
|
-
* ```
|
|
502
|
-
*
|
|
503
|
-
* Example usage:
|
|
504
|
-
* ```
|
|
505
|
-
* if (Apify.utils.social.TIKTOK_REGEX.test('https://www.tiktok.com/@user')) {
|
|
506
|
-
* console.log('Match!');
|
|
507
|
-
* }
|
|
508
|
-
* ```
|
|
509
|
-
* @type {RegExp}
|
|
510
|
-
* @memberOf social
|
|
511
|
-
*/
|
|
512
|
-
TIKTOK_REGEX = new RegExp(`^${TIKTOK_REGEX_STRING}$`, 'i');
|
|
513
|
-
/**
|
|
514
|
-
* Regular expression to find multiple Tiktok videos or user accounts in a text or HTML.
|
|
515
|
-
* It has the following form: `/.../ig` and matches URLs such as:
|
|
516
|
-
* ```
|
|
517
|
-
* https://www.tiktok.com/trending?shareId=123456789
|
|
518
|
-
* https://www.tiktok.com/embed/123456789
|
|
519
|
-
* https://m.tiktok.com/v/123456789
|
|
520
|
-
* https://www.tiktok.com/@user
|
|
521
|
-
* https://www.tiktok.com/@user-account.pro
|
|
522
|
-
* https://www.tiktok.com/@user/video/123456789
|
|
523
|
-
* ```
|
|
524
|
-
*
|
|
525
|
-
* Example usage:
|
|
526
|
-
* ```
|
|
527
|
-
* const matches = text.match(Apify.utils.social.TIKTOK_REGEX_GLOBAL);
|
|
528
|
-
* if (matches) console.log(`${matches.length} TikTok videos and users found!`);
|
|
529
|
-
* ```
|
|
530
|
-
* @type {RegExp}
|
|
531
|
-
* @memberOf social
|
|
532
|
-
*/
|
|
533
|
-
TIKTOK_REGEX_GLOBAL = new RegExp(TIKTOK_REGEX_STRING, 'ig');
|
|
534
|
-
/**
|
|
535
|
-
* Regular expression to exactly match a Pinterest pin, user or user's board.
|
|
536
|
-
* It has the following form: `/^...$/i` and matches URLs such as:
|
|
537
|
-
* ```
|
|
538
|
-
* https://pinterest.com/pin/123456789
|
|
539
|
-
* https://www.pinterest.cz/pin/123456789
|
|
540
|
-
* https://www.pinterest.com/user
|
|
541
|
-
* https://uk.pinterest.com/user
|
|
542
|
-
* https://www.pinterest.co.uk/user
|
|
543
|
-
* pinterest.com/user_name.gold
|
|
544
|
-
* https://cz.pinterest.com/user/board
|
|
545
|
-
* ```
|
|
546
|
-
*
|
|
547
|
-
* Example usage:
|
|
548
|
-
* ```
|
|
549
|
-
* if (Apify.utils.social.PINTEREST_REGEX.test('https://www.pinterest.com/user')) {
|
|
550
|
-
* console.log('Match!');
|
|
551
|
-
* }
|
|
552
|
-
* ```
|
|
553
|
-
* @type {RegExp}
|
|
554
|
-
* @memberOf social
|
|
555
|
-
*/
|
|
556
|
-
PINTEREST_REGEX = new RegExp(`^${PINTEREST_REGEX_STRING}$`, 'i');
|
|
557
|
-
/**
|
|
558
|
-
* Regular expression to find multiple Pinterest pins, users or boards in a text or HTML.
|
|
559
|
-
* It has the following form: `/.../ig` and matches URLs such as:
|
|
560
|
-
* ```
|
|
561
|
-
* https://pinterest.com/pin/123456789
|
|
562
|
-
* https://www.pinterest.cz/pin/123456789
|
|
563
|
-
* https://www.pinterest.com/user
|
|
564
|
-
* https://uk.pinterest.com/user
|
|
565
|
-
* https://www.pinterest.co.uk/user
|
|
566
|
-
* pinterest.com/user_name.gold
|
|
567
|
-
* https://cz.pinterest.com/user/board
|
|
568
|
-
* ```
|
|
569
|
-
*
|
|
570
|
-
* Example usage:
|
|
571
|
-
* ```
|
|
572
|
-
* const matches = text.match(Apify.utils.social.PINTEREST_REGEX_GLOBAL);
|
|
573
|
-
* if (matches) console.log(`${matches.length} Pinterest pins, users and boards found!`);
|
|
574
|
-
* ```
|
|
575
|
-
* @type {RegExp}
|
|
576
|
-
* @memberOf social
|
|
577
|
-
*/
|
|
578
|
-
PINTEREST_REGEX_GLOBAL = new RegExp(PINTEREST_REGEX_STRING, 'ig');
|
|
579
|
-
/**
|
|
580
|
-
* Regular expression to exactly match a Discord invite or channel.
|
|
581
|
-
* It has the following form: `/^...$/i` and matches URLs such as:
|
|
582
|
-
* ```
|
|
583
|
-
* https://discord.gg/discord-developers
|
|
584
|
-
* https://discord.com/invite/jyEM2PRvMU
|
|
585
|
-
* https://discordapp.com/channels/1234
|
|
586
|
-
* https://discord.com/channels/1234/1234
|
|
587
|
-
* discord.gg/discord-developers
|
|
588
|
-
* ```
|
|
589
|
-
*
|
|
590
|
-
* Example usage:
|
|
591
|
-
* ```
|
|
592
|
-
* if (Apify.utils.social.DISCORD_REGEX.test('https://discord.gg/discord-developers')) {
|
|
593
|
-
* console.log('Match!');
|
|
594
|
-
* }
|
|
595
|
-
* ```
|
|
596
|
-
* @type {RegExp}
|
|
597
|
-
* @memberOf social
|
|
598
|
-
*/
|
|
599
|
-
DISCORD_REGEX = new RegExp(`^${DISCORD_REGEX_STRING}$`, 'i');
|
|
600
|
-
/**
|
|
601
|
-
* Regular expression to find multiple Discord channels or invites in a text or HTML.
|
|
602
|
-
* It has the following form: `/.../ig` and matches URLs such as:
|
|
603
|
-
* ```
|
|
604
|
-
* https://discord.gg/discord-developers
|
|
605
|
-
* https://discord.com/invite/jyEM2PRvMU
|
|
606
|
-
* https://discordapp.com/channels/1234
|
|
607
|
-
* https://discord.com/channels/1234/1234
|
|
608
|
-
* discord.gg/discord-developers
|
|
609
|
-
* ```
|
|
610
|
-
*
|
|
611
|
-
* Example usage:
|
|
612
|
-
* ```
|
|
613
|
-
* const matches = text.match(Apify.utils.social.DISCORD_REGEX_GLOBAL);
|
|
614
|
-
* if (matches) console.log(`${matches.length} Discord channels found!`);
|
|
615
|
-
* ```
|
|
616
|
-
* @type {RegExp}
|
|
617
|
-
* @memberOf social
|
|
618
|
-
*/
|
|
619
|
-
DISCORD_REGEX_GLOBAL = new RegExp(DISCORD_REGEX_STRING, 'ig');
|
|
620
|
-
}
|
|
621
|
-
catch (e) {
|
|
622
|
-
// Older versions of Node don't support negative lookbehind and lookahead expressions.
|
|
623
|
-
// Show warning instead of failing.
|
|
624
|
-
if (e && e.message && e.message.includes('Invalid group')) {
|
|
625
|
-
// eslint-disable-next-line max-len
|
|
626
|
-
utils_log_1.default.warning(`Your version of Node.js (${process.version}) doesn't support the regular expression syntax used by Apify.utils.social tools. The tools will not work. Please upgrade your Node.js to the latest version.`);
|
|
627
|
-
}
|
|
628
|
-
else {
|
|
629
|
-
throw e;
|
|
630
|
-
}
|
|
631
|
-
}
|
|
632
|
-
/**
|
|
633
|
-
* Representation of social handles parsed from a HTML page.
|
|
634
|
-
*
|
|
635
|
-
* The object has the following structure:
|
|
636
|
-
*
|
|
637
|
-
* ```
|
|
638
|
-
* {
|
|
639
|
-
* emails: String[],
|
|
640
|
-
* phones: String[],
|
|
641
|
-
* phonesUncertain: String[],
|
|
642
|
-
* linkedIns: String[],
|
|
643
|
-
* twitters: String[],
|
|
644
|
-
* instagrams: String[],
|
|
645
|
-
* facebooks: String[],
|
|
646
|
-
* youtubes: String[],
|
|
647
|
-
* tiktoks: String[],
|
|
648
|
-
* pinterests: String[],
|
|
649
|
-
* discords: String[],
|
|
650
|
-
* }
|
|
651
|
-
* ```
|
|
652
|
-
* @typedef SocialHandles
|
|
653
|
-
* @property {string[]} emails
|
|
654
|
-
* @property {string[]} phones
|
|
655
|
-
* @property {string[]} phonesUncertain
|
|
656
|
-
* @property {string[]} linkedIns
|
|
657
|
-
* @property {string[]} twitters
|
|
658
|
-
* @property {string[]} instagrams
|
|
659
|
-
* @property {string[]} facebooks
|
|
660
|
-
* @property {string[]} youtubes
|
|
661
|
-
* @property {string[]} tiktoks
|
|
662
|
-
* @property {string[]} pinterests
|
|
663
|
-
* @property {string[]} discords
|
|
664
|
-
*/
|
|
665
|
-
/**
|
|
666
|
-
* The function attempts to extract emails, phone numbers and social profile URLs from a HTML document,
|
|
667
|
-
* specifically LinkedIn, Twitter, Instagram, Facebook, Youtube, TikTok, Pinterest and Discord URLs.
|
|
668
|
-
* The function removes duplicates from the resulting arrays and sorts the items alphabetically.
|
|
669
|
-
*
|
|
670
|
-
* Note that the `phones` field contains phone numbers extracted from the special phone links
|
|
671
|
-
* such as `[call us](tel:+1234556789)` (see {@link social#phonesFromUrls})
|
|
672
|
-
* and potentially other sources with high certainty, while `phonesUncertain` contains phone numbers
|
|
673
|
-
* extracted from the plain text, which might be very inaccurate.
|
|
674
|
-
*
|
|
675
|
-
* **Example usage:**
|
|
676
|
-
* ```javascript
|
|
677
|
-
* const Apify = require('apify');
|
|
678
|
-
*
|
|
679
|
-
* const browser = await Apify.launchPuppeteer();
|
|
680
|
-
* const page = await browser.newPage();
|
|
681
|
-
* await page.goto('http://www.example.com');
|
|
682
|
-
* const html = await page.content();
|
|
683
|
-
*
|
|
684
|
-
* const result = Apify.utils.social.parseHandlesFromHtml(html);
|
|
685
|
-
* console.log('Social handles:');
|
|
686
|
-
* console.dir(result);
|
|
687
|
-
* ```
|
|
688
|
-
*
|
|
689
|
-
* @param {string} html HTML text
|
|
690
|
-
* @param {*|null} [data] Optional object which will receive the `text` and `$` properties
|
|
691
|
-
* that contain text content of the HTML and `cheerio` object, respectively. This is an optimization
|
|
692
|
-
* so that the caller doesn't need to parse the HTML document again, if needed.
|
|
693
|
-
* @return {SocialHandles} An object with the social handles.
|
|
694
|
-
*
|
|
695
|
-
* @memberOf social
|
|
696
|
-
*/
|
|
697
|
-
const parseHandlesFromHtml = (html, data = null) => {
|
|
698
|
-
const result = {
|
|
699
|
-
emails: [],
|
|
700
|
-
phones: [],
|
|
701
|
-
phonesUncertain: [],
|
|
702
|
-
linkedIns: [],
|
|
703
|
-
twitters: [],
|
|
704
|
-
instagrams: [],
|
|
705
|
-
facebooks: [],
|
|
706
|
-
youtubes: [],
|
|
707
|
-
tiktoks: [],
|
|
708
|
-
pinterests: [],
|
|
709
|
-
discords: [],
|
|
710
|
-
};
|
|
711
|
-
// TODO: maybe extract phone numbers from JSON+LD
|
|
712
|
-
if (!underscore_1.default.isString(html))
|
|
713
|
-
return result;
|
|
714
|
-
const $ = cheerio_1.default.load(html, { decodeEntities: true });
|
|
715
|
-
if (data)
|
|
716
|
-
data.$ = $;
|
|
717
|
-
const text = utils_1.publicUtils.htmlToText($);
|
|
718
|
-
if (data)
|
|
719
|
-
data.text = text;
|
|
720
|
-
// Find all <a> links with href tag
|
|
721
|
-
const linkUrls = [];
|
|
722
|
-
$('a[href]').each((index, elem) => {
|
|
723
|
-
if (elem)
|
|
724
|
-
linkUrls.push($(elem).attr('href'));
|
|
725
|
-
});
|
|
726
|
-
result.emails = emailsFromUrls(linkUrls).concat(emailsFromText(text));
|
|
727
|
-
result.phones = phonesFromUrls(linkUrls);
|
|
728
|
-
result.phonesUncertain = phonesFromText(text);
|
|
729
|
-
// Note that these regexps extract just the base profile path. For example for
|
|
730
|
-
// https://www.linkedin.com/in/carl-newman-123456a/detail/recent-activity/
|
|
731
|
-
// they match just:
|
|
732
|
-
// https://www.linkedin.com/in/carl-newman-123456a
|
|
733
|
-
result.linkedIns = html.match(LINKEDIN_REGEX_GLOBAL) || [];
|
|
734
|
-
result.twitters = html.match(TWITTER_REGEX_GLOBAL) || [];
|
|
735
|
-
result.instagrams = html.match(INSTAGRAM_REGEX_GLOBAL) || [];
|
|
736
|
-
result.facebooks = html.match(FACEBOOK_REGEX_GLOBAL) || [];
|
|
737
|
-
result.youtubes = html.match(YOUTUBE_REGEX_GLOBAL) || [];
|
|
738
|
-
result.tiktoks = html.match(TIKTOK_REGEX_GLOBAL) || [];
|
|
739
|
-
result.pinterests = html.match(PINTEREST_REGEX_GLOBAL) || [];
|
|
740
|
-
result.discords = html.match(DISCORD_REGEX_GLOBAL) || [];
|
|
741
|
-
// Sort and deduplicate handles
|
|
742
|
-
// eslint-disable-next-line guard-for-in, no-restricted-syntax
|
|
743
|
-
for (const key in result) {
|
|
744
|
-
result[key].sort();
|
|
745
|
-
result[key] = underscore_1.default.uniq(result[key], true);
|
|
746
|
-
}
|
|
747
|
-
return result;
|
|
748
|
-
};
|
|
749
|
-
/**
|
|
750
|
-
* A namespace that contains various utilities to help you extract social handles
|
|
751
|
-
* from text, URLs and HTML documents.
|
|
752
|
-
*
|
|
753
|
-
* **Example usage:**
|
|
754
|
-
*
|
|
755
|
-
* ```javascript
|
|
756
|
-
* const Apify = require('apify');
|
|
757
|
-
*
|
|
758
|
-
* const emails = Apify.utils.social.emailsFromText('alice@example.com bob@example.com');
|
|
759
|
-
* ```
|
|
760
|
-
* @namespace social
|
|
761
|
-
*/
|
|
762
|
-
exports.socialUtils = {
|
|
763
|
-
emailsFromText,
|
|
764
|
-
emailsFromUrls,
|
|
765
|
-
phonesFromText,
|
|
766
|
-
phonesFromUrls,
|
|
767
|
-
parseHandlesFromHtml,
|
|
768
|
-
EMAIL_REGEX,
|
|
769
|
-
EMAIL_REGEX_GLOBAL,
|
|
770
|
-
LINKEDIN_REGEX,
|
|
771
|
-
LINKEDIN_REGEX_GLOBAL,
|
|
772
|
-
INSTAGRAM_REGEX,
|
|
773
|
-
INSTAGRAM_REGEX_GLOBAL,
|
|
774
|
-
TWITTER_REGEX,
|
|
775
|
-
TWITTER_REGEX_GLOBAL,
|
|
776
|
-
FACEBOOK_REGEX,
|
|
777
|
-
FACEBOOK_REGEX_GLOBAL,
|
|
778
|
-
YOUTUBE_REGEX,
|
|
779
|
-
YOUTUBE_REGEX_GLOBAL,
|
|
780
|
-
TIKTOK_REGEX,
|
|
781
|
-
TIKTOK_REGEX_GLOBAL,
|
|
782
|
-
PINTEREST_REGEX,
|
|
783
|
-
PINTEREST_REGEX_GLOBAL,
|
|
784
|
-
DISCORD_REGEX,
|
|
785
|
-
DISCORD_REGEX_GLOBAL,
|
|
786
|
-
};
|
|
787
|
-
//# sourceMappingURL=utils_social.js.map
|