@mvegter/scrapedin 1.0.33 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -4
- package/src/company/company.js +19 -20
- package/src/company/companyScraperTemplate.js +24 -25
- package/src/login.js +156 -50
- package/src/openPage.js +4 -14
- package/src/profile/cleanProfileData.js +28 -29
- package/src/profile/contactInfo.js +17 -19
- package/src/profile/profile.js +248 -53
- package/src/profile/profileScraperTemplate.js +71 -74
- package/src/profile/scrapAccomplishmentPanel.js +7 -14
- package/src/profile/scrollToPageBottom.js +5 -6
- package/src/profile/seeMoreButtons.js +11 -35
- package/src/scrapedin.js +21 -2
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mvegter/scrapedin",
|
|
3
|
-
"version": "1.0
|
|
4
|
-
"description": "linkedin scraper for
|
|
3
|
+
"version": "1.2.0",
|
|
4
|
+
"description": "linkedin scraper updated for 2025+ website",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"linkedin",
|
|
7
7
|
"scraper",
|
|
@@ -21,14 +21,14 @@
|
|
|
21
21
|
"author": "Wagner Leonardi <leonardiwagner@gmail.com>",
|
|
22
22
|
"license": "Apache-2.0",
|
|
23
23
|
"dependencies": {
|
|
24
|
-
"puppeteer": "
|
|
24
|
+
"puppeteer": "24.36.0",
|
|
25
25
|
"winston": "3.7.2"
|
|
26
26
|
},
|
|
27
27
|
"devDependencies": {
|
|
28
28
|
"standard": "17.0.0"
|
|
29
29
|
},
|
|
30
30
|
"engines": {
|
|
31
|
-
"node": ">=
|
|
31
|
+
"node": ">= 16.0.0"
|
|
32
32
|
},
|
|
33
33
|
"homepage": "https://github.com/linkedtales/scrapedin#readme"
|
|
34
34
|
}
|
package/src/company/company.js
CHANGED
|
@@ -5,31 +5,30 @@ const template = require('./companyScraperTemplate')
|
|
|
5
5
|
const logger = require('../logger')(__filename)
|
|
6
6
|
|
|
7
7
|
module.exports = async (browser, cookies, url, waitTimeToScrapMs = 500, puppeteerAuthenticate = undefined) => {
|
|
8
|
-
logger.info(`starting scraping url: ${url}`)
|
|
8
|
+
logger.info(`starting scraping url: ${url}`)
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
const company = {}
|
|
11
11
|
|
|
12
|
-
let page
|
|
13
|
-
if(url.includes('legacySchoolId=')){
|
|
14
|
-
|
|
12
|
+
let page
|
|
13
|
+
if (url.includes('legacySchoolId=')) {
|
|
14
|
+
page = await openPage({ browser, cookies, url, puppeteerAuthenticate })
|
|
15
15
|
|
|
16
|
-
|
|
16
|
+
const aboutSelector = 'a[href$="/about/"]'
|
|
17
17
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
} else{
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
18
|
+
company.url = page.url()
|
|
19
|
+
|
|
20
|
+
await page.$eval(aboutSelector, async about => await about.click())
|
|
21
|
+
await page.waitForNavigation()
|
|
22
|
+
} else {
|
|
23
|
+
company.url = url
|
|
24
|
+
url = url + '/about'
|
|
25
|
+
page = await openPage({ browser, cookies, url, puppeteerAuthenticate })
|
|
26
26
|
}
|
|
27
|
-
company.about = (await scrapSection(page, template.about))[0]
|
|
28
|
-
company.profile = (await scrapSection(page, template.profile))[0]
|
|
27
|
+
company.about = (await scrapSection(page, template.about))[0]
|
|
28
|
+
company.profile = (await scrapSection(page, template.profile))[0]
|
|
29
|
+
|
|
30
|
+
await page.close()
|
|
31
|
+
logger.info(`finished scraping url: ${url}`)
|
|
29
32
|
|
|
30
|
-
await page.close();
|
|
31
|
-
logger.info(`finished scraping url: ${url}`);
|
|
32
|
-
|
|
33
33
|
return company
|
|
34
|
-
|
|
35
34
|
}
|
|
@@ -1,30 +1,29 @@
|
|
|
1
1
|
// Selector template for company pages. It is handed section by section
// (`template.profile`, `template.about`) to the shared scrapSection helper.
// Each section has a container `selector` and a `fields` map; a field is either
// a plain selector string or an object with `selector` plus `attribute` /
// `isMultipleFields`.
// NOTE(review): how scrapSection interprets `attribute` and `isMultipleFields`
// is not visible here - presumably "read this attribute instead of the text"
// and "collect every match"; confirm against scrapSection.
const template = {
  // Top card: company name, tagline and logo.
  profile: {
    // Comma-separated alternatives; presumably older and newer page layouts - confirm.
    selector: '.org-top-card, .top-card-layout',
    fields: {
      name: 'h1',
      // The bare `p` at the end is a broad last-resort alternative.
      headline: '.org-top-card-summary__tagline, .top-card-layout__headline, p',
      imageurl: {
        selector: 'img.org-top-card-primary-content__logo, img.top-card-layout__entity-image',
        attribute: 'src'
      }
    }
  },
  // "About" section: overview paragraph plus a definition list of details.
  about: {
    selector: '.org-grid__core-rail--no-margin-left, .org-about-us-organization-description, .core-section-container',
    fields: {
      overview: 'p',
      // Definition terms (labels) ...
      types: {
        selector: 'dl dt, .org-page-details__definition-term',
        isMultipleFields: true
      },
      // ... and their definitions. The employees-on-LinkedIn count <dd> is excluded;
      // presumably so `types` and `values` stay index-aligned - verify.
      values: {
        selector: 'dl dd:not(.org-page-details__employees-on-linkedin-count), .org-page-details__definition-text',
        isMultipleFields: true
      }
    }
  }
}
|
|
28
28
|
|
|
29
|
-
|
|
30
29
|
module.exports = template
|
package/src/login.js
CHANGED
|
@@ -2,67 +2,173 @@ const openPage = require('./openPage')
|
|
|
2
2
|
const logger = require('./logger')(__filename)
|
|
3
3
|
const pkg = require('./package')
|
|
4
4
|
|
|
5
|
+
// CSS selectors for known cookie-consent "accept" buttons.
// Every entry must be plain CSS: `page.$` forwards it to the browser's selector
// engine, so Playwright-only pseudo-classes such as `:has-text()` throw there
// and would never match. Text-based matching lives in ACCEPT_COOKIES_LABELS.
const ACCEPT_COOKIES_SELECTORS = [
  'button[action-type="ACCEPT"]',
  '.cookie-consent-v2__button--accept'
]

// Button captions (lower-case, trimmed) that count as "accept all cookies".
const ACCEPT_COOKIES_LABELS = ['accept', 'alle accepteren']

// Time given to the consent banner to disappear after the click.
const COOKIE_BANNER_SETTLE_MS = 1000

/**
 * Best-effort dismissal of the cookie-consent banner.
 *
 * Tries the known CSS selectors first, then falls back to any <button> whose
 * trimmed, lower-cased text equals one of ACCEPT_COOKIES_LABELS. Failures are
 * deliberately swallowed: a missing banner must never abort the caller.
 *
 * @param {object} page - Puppeteer page (only `$` and `evaluate` are used).
 * @returns {Promise<boolean>} true when a button was clicked, false otherwise.
 */
const acceptCookies = async (page) => {
  const settle = () => new Promise((resolve) => setTimeout(resolve, COOKIE_BANNER_SETTLE_MS))

  for (const selector of ACCEPT_COOKIES_SELECTORS) {
    try {
      const btn = await page.$(selector)
      if (btn) {
        await btn.click()
        await settle()
        return true
      }
    } catch {
      // Element missing or not clickable: try the next candidate.
    }
  }

  // Fallback: match by visible caption inside the page.
  try {
    const clicked = await page.evaluate((labels) => {
      const btn = Array.from(document.querySelectorAll('button')).find(
        (b) => labels.includes((b.textContent || '').trim().toLowerCase())
      )
      if (!btn) return false
      btn.click()
      return true
    }, ACCEPT_COOKIES_LABELS)

    // Only pay the settle delay when something was actually clicked.
    if (clicked) {
      await settle()
    }
    return Boolean(clicked)
  } catch {
    // Best effort only: no banner, or the page is navigating.
    return false
  }
}
|
|
39
|
+
|
|
40
|
+
const fillField = async (page, fieldValue) => {
|
|
41
|
+
// LinkedIn renders two sets of inputs: hidden (CSS-only, not visible) and visible.
|
|
42
|
+
// We find all <input> elements matching the autocomplete attribute, then pick
|
|
43
|
+
// the first one that is actually visible (has non-zero dimensions).
|
|
44
|
+
const autocomplete = fieldValue === 'username webauthn' ? 'username' : 'current-password'
|
|
45
|
+
|
|
46
|
+
const visibleInput = await page.evaluate((auto, val) => {
|
|
47
|
+
const inputs = Array.from(document.querySelectorAll(`input[autocomplete="${auto}"]`))
|
|
48
|
+
for (const input of inputs) {
|
|
49
|
+
const rect = input.getBoundingClientRect()
|
|
50
|
+
if (rect.width > 0 && rect.height > 0) {
|
|
51
|
+
const style = window.getComputedStyle(input)
|
|
52
|
+
if (style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0') {
|
|
53
|
+
input.focus()
|
|
54
|
+
input.value = ''
|
|
55
|
+
return true
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
return false
|
|
60
|
+
}, autocomplete, fieldValue)
|
|
61
|
+
|
|
62
|
+
if (visibleInput) {
|
|
63
|
+
await page.keyboard.type(fieldValue, { delay: 50 })
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
const clickSignIn = async (page) => {
|
|
68
|
+
// Find the "Sign in" button, excluding "Sign in with Apple" etc.
|
|
69
|
+
await page.evaluate(() => {
|
|
70
|
+
const buttons = Array.from(document.querySelectorAll('button'))
|
|
71
|
+
const signInBtn = buttons.find(
|
|
72
|
+
(b) => {
|
|
73
|
+
const text = b.textContent.trim().toLowerCase()
|
|
74
|
+
return (text === 'sign in' || text === 'inloggen' || text === 'aanmelden') &&
|
|
75
|
+
!text.includes('apple')
|
|
76
|
+
}
|
|
77
|
+
)
|
|
78
|
+
if (signInBtn) {
|
|
79
|
+
signInBtn.click()
|
|
80
|
+
return true
|
|
81
|
+
}
|
|
82
|
+
return false
|
|
83
|
+
})
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
const LOGGED_IN_PATHS = ['/feed', '/mynetwork', '/in/']
|
|
87
|
+
|
|
5
88
|
module.exports = async (browser, email, password) => {
|
|
6
89
|
const url = 'https://www.linkedin.com/login'
|
|
7
90
|
const page = await openPage({ browser, url })
|
|
8
91
|
logger.info(`logging at: ${url}`)
|
|
9
92
|
|
|
10
|
-
|
|
93
|
+
// Accept cookie consent if present
|
|
94
|
+
await acceptCookies(page)
|
|
95
|
+
await new Promise((r) => setTimeout(r, 1000))
|
|
11
96
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
await
|
|
15
|
-
.then((passwordElement) => passwordElement.type(password))
|
|
97
|
+
// Fill in email field
|
|
98
|
+
await fillField(page, 'username webauthn')
|
|
99
|
+
await new Promise((r) => setTimeout(r, 500))
|
|
16
100
|
|
|
17
|
-
|
|
18
|
-
|
|
101
|
+
// Fill in password field
|
|
102
|
+
await fillField(page, 'current-password')
|
|
103
|
+
await new Promise((r) => setTimeout(r, 500))
|
|
19
104
|
|
|
20
|
-
|
|
21
|
-
timeout: 15000
|
|
22
|
-
})
|
|
23
|
-
.then(async () => {
|
|
24
|
-
logger.info('logged feed page selector found')
|
|
25
|
-
await page.close()
|
|
26
|
-
})
|
|
27
|
-
.catch(async () => {
|
|
28
|
-
logger.warn('successful login element was not found')
|
|
29
|
-
const emailError = await page.evaluate(() => {
|
|
30
|
-
const e = document.querySelector('div[error-for=username]')
|
|
31
|
-
if (!e) { return false }
|
|
32
|
-
const style = window.getComputedStyle(e)
|
|
33
|
-
return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
|
|
34
|
-
})
|
|
35
|
-
|
|
36
|
-
const passwordError = await page.evaluate(() => {
|
|
37
|
-
const e = document.querySelector('div[error-for=password]')
|
|
38
|
-
if (!e) { return false }
|
|
39
|
-
const style = window.getComputedStyle(e)
|
|
40
|
-
return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
|
|
41
|
-
})
|
|
42
|
-
|
|
43
|
-
const manualChallengeRequested = await page.evaluate(() => {
|
|
44
|
-
const e = document.querySelector('.flow-challenge-content')
|
|
45
|
-
if (!e) { return false }
|
|
46
|
-
const style = window.getComputedStyle(e)
|
|
47
|
-
return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
|
|
48
|
-
})
|
|
49
|
-
|
|
50
|
-
if (emailError) {
|
|
51
|
-
logger.info('wrong username element found')
|
|
52
|
-
return Promise.reject(new Error(`linkedin: invalid username: ${email}`))
|
|
53
|
-
}
|
|
105
|
+
await clickSignIn(page)
|
|
54
106
|
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
107
|
+
let hadChallenge = false
|
|
108
|
+
|
|
109
|
+
try {
|
|
110
|
+
await page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 15000 })
|
|
111
|
+
} catch {
|
|
112
|
+
// Navigation timeout is expected — the page may not navigate if already on login
|
|
113
|
+
}
|
|
59
114
|
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
115
|
+
// Wait for either the feed/mynetwork or challenge page
|
|
116
|
+
const maxWaitMs = 120000
|
|
117
|
+
const start = Date.now()
|
|
118
|
+
let resolved = false
|
|
119
|
+
while (Date.now() - start < maxWaitMs && !resolved) {
|
|
120
|
+
const currentUrl = page.url()
|
|
121
|
+
if (LOGGED_IN_PATHS.some((p) => currentUrl.includes(p))) {
|
|
122
|
+
logger.info('logged in, redirected to: ' + currentUrl)
|
|
123
|
+
resolved = true
|
|
124
|
+
break
|
|
125
|
+
}
|
|
126
|
+
if (currentUrl.includes('/checkpoint')) {
|
|
127
|
+
if (!hadChallenge) {
|
|
128
|
+
logger.warn('2FA challenge detected, please complete the verification in the browser window (waiting up to 2 minutes)...')
|
|
129
|
+
hadChallenge = true
|
|
63
130
|
}
|
|
131
|
+
await new Promise((r) => setTimeout(r, 2000))
|
|
132
|
+
continue
|
|
133
|
+
}
|
|
134
|
+
await new Promise((r) => setTimeout(r, 500))
|
|
135
|
+
}
|
|
64
136
|
|
|
65
|
-
|
|
66
|
-
|
|
137
|
+
if (!resolved) {
|
|
138
|
+
const finalUrl = page.url()
|
|
139
|
+
logger.warn('successful login element was not found, url: ' + finalUrl)
|
|
140
|
+
|
|
141
|
+
const emailError = await page.evaluate(() => {
|
|
142
|
+
const e = document.querySelector('div[error-for=username], #error-for-username')
|
|
143
|
+
if (!e) { return false }
|
|
144
|
+
const style = window.getComputedStyle(e)
|
|
145
|
+
return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
|
|
67
146
|
})
|
|
147
|
+
|
|
148
|
+
const passwordError = await page.evaluate(() => {
|
|
149
|
+
const e = document.querySelector('div[error-for=password], #error-for-password')
|
|
150
|
+
if (!e) { return false }
|
|
151
|
+
const style = window.getComputedStyle(e)
|
|
152
|
+
return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
|
|
153
|
+
})
|
|
154
|
+
|
|
155
|
+
if (emailError) {
|
|
156
|
+
logger.info('wrong username element found')
|
|
157
|
+
await page.close()
|
|
158
|
+
return Promise.reject(new Error(`linkedin: invalid username: ${email}`))
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
if (passwordError) {
|
|
162
|
+
logger.info('wrong password element found')
|
|
163
|
+
await page.close()
|
|
164
|
+
return Promise.reject(new Error('linkedin: invalid password'))
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
logger.error('could not find any element to retrieve a proper error')
|
|
168
|
+
await page.close()
|
|
169
|
+
return Promise.reject(new Error(`${pkg.name} ${pkg.version} login is not working, please report: ${pkg.bugs.url}`))
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
await page.close()
|
|
173
|
+
return { hadChallenge }
|
|
68
174
|
}
|
package/src/openPage.js
CHANGED
|
@@ -1,23 +1,13 @@
|
|
|
1
|
-
const
|
|
2
|
-
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
|
|
3
|
-
// "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
|
|
4
|
-
// "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
|
|
5
|
-
// "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:34.0) Gecko/20100101 Firefox/34.0",
|
|
6
|
-
// "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
|
|
7
|
-
// "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
|
|
8
|
-
// "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
|
|
9
|
-
// "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
|
|
10
|
-
// "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
|
|
11
|
-
]
|
|
1
|
+
const AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
|
|
12
2
|
|
|
13
3
|
module.exports = async ({ browser, cookies, url, puppeteerAuthenticate }) => {
|
|
14
4
|
const page = await browser.newPage()
|
|
15
|
-
await page.setDefaultNavigationTimeout(
|
|
5
|
+
await page.setDefaultNavigationTimeout(60000)
|
|
16
6
|
|
|
17
7
|
if (cookies) {
|
|
18
8
|
await page.setCookie(...cookies)
|
|
19
9
|
}
|
|
20
|
-
await page.setUserAgent(
|
|
10
|
+
await page.setUserAgent(AGENT)
|
|
21
11
|
await page.setExtraHTTPHeaders({ 'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8' })
|
|
22
12
|
await page.setViewport({
|
|
23
13
|
width: 1920,
|
|
@@ -28,6 +18,6 @@ module.exports = async ({ browser, cookies, url, puppeteerAuthenticate }) => {
|
|
|
28
18
|
await page.authenticate(puppeteerAuthenticate)
|
|
29
19
|
}
|
|
30
20
|
|
|
31
|
-
await page.goto(url, { waitUntil: '
|
|
21
|
+
await page.goto(url, { waitUntil: 'networkidle2' })
|
|
32
22
|
return page
|
|
33
23
|
}
|
|
@@ -2,7 +2,7 @@ const logger = require('../logger')(__filename)
|
|
|
2
2
|
const pkg = require('../package')
|
|
3
3
|
|
|
4
4
|
module.exports = (profile) => {
|
|
5
|
-
if(!profile?.profile?.name) {
|
|
5
|
+
if (!profile?.profile?.name) {
|
|
6
6
|
const messageError = `LinkedIn website changed and ${pkg.name} ${pkg.version} can't read basic data. Please report this issue at ${pkg.bugs.url}`
|
|
7
7
|
logger.error(messageError, '')
|
|
8
8
|
throw new Error(messageError)
|
|
@@ -11,17 +11,17 @@ module.exports = (profile) => {
|
|
|
11
11
|
profile.profile.summary = profile?.about?.text
|
|
12
12
|
|
|
13
13
|
profile.positions.forEach((position) => {
|
|
14
|
-
if(position.title){
|
|
15
|
-
|
|
14
|
+
if (position.title) {
|
|
15
|
+
position.title = position.title.replace('Company Name\n', '')
|
|
16
16
|
}
|
|
17
|
-
if(position.description) {
|
|
18
|
-
position.description = position.description.replace('See more', '')
|
|
19
|
-
position.description = position.description.replace('see more', '')
|
|
20
|
-
|
|
17
|
+
if (position.description) {
|
|
18
|
+
position.description = position.description.replace('See more', '')
|
|
19
|
+
position.description = position.description.replace('see more', '')
|
|
20
|
+
position.description = position.description.replace('See less', '')
|
|
21
21
|
}
|
|
22
|
-
if(position.roles) {
|
|
22
|
+
if (position.roles) {
|
|
23
23
|
position.roles.forEach((role) => {
|
|
24
|
-
if(role.title) {
|
|
24
|
+
if (role.title) {
|
|
25
25
|
role.title = role.title.replace('Title\n', '')
|
|
26
26
|
}
|
|
27
27
|
if (role.date) {
|
|
@@ -29,7 +29,7 @@ module.exports = (profile) => {
|
|
|
29
29
|
role.date2 = role.date.replace('·', '-').split('-')[1].trim()
|
|
30
30
|
delete role.date
|
|
31
31
|
}
|
|
32
|
-
if(role.description) {
|
|
32
|
+
if (role.description) {
|
|
33
33
|
role.description = role.description.replace('See more', '')
|
|
34
34
|
role.description = role.description.replace('see more', '')
|
|
35
35
|
}
|
|
@@ -37,63 +37,62 @@ module.exports = (profile) => {
|
|
|
37
37
|
}
|
|
38
38
|
})
|
|
39
39
|
|
|
40
|
-
if(profile.recommendations.receivedCount) {
|
|
40
|
+
if (profile.recommendations.receivedCount) {
|
|
41
41
|
profile.recommendations.receivedCount = profile.recommendations.receivedCount.replace(/[^\d]/g, '')
|
|
42
42
|
}
|
|
43
43
|
|
|
44
|
-
if(profile.recommendations.givenCount) {
|
|
44
|
+
if (profile.recommendations.givenCount) {
|
|
45
45
|
profile.recommendations.givenCount = profile.recommendations.givenCount.replace(/[^\d]/g, '')
|
|
46
46
|
}
|
|
47
47
|
|
|
48
|
-
if(profile.recommendations.received) {
|
|
48
|
+
if (profile.recommendations.received) {
|
|
49
49
|
profile.recommendations.received.forEach((recommendation) => {
|
|
50
|
-
if(recommendation.summary){
|
|
50
|
+
if (recommendation.summary) {
|
|
51
51
|
recommendation.summary = recommendation.summary.replace('See more', '')
|
|
52
52
|
recommendation.summary = recommendation.summary.replace('See less', '')
|
|
53
53
|
}
|
|
54
54
|
})
|
|
55
55
|
}
|
|
56
56
|
|
|
57
|
-
if(profile.recommendations.given) {
|
|
57
|
+
if (profile.recommendations.given) {
|
|
58
58
|
profile.recommendations.given.forEach((recommendation) => {
|
|
59
|
-
if(recommendation.summary){
|
|
59
|
+
if (recommendation.summary) {
|
|
60
60
|
recommendation.summary = recommendation.summary.replace('See more', '')
|
|
61
61
|
recommendation.summary = recommendation.summary.replace('See less', '')
|
|
62
62
|
}
|
|
63
63
|
})
|
|
64
64
|
}
|
|
65
65
|
|
|
66
|
-
if(profile.courses){
|
|
66
|
+
if (profile.courses) {
|
|
67
67
|
profile.courses = profile.courses.map(({ name, year }) => {
|
|
68
68
|
const coursesObj = {}
|
|
69
|
-
if(name) {
|
|
69
|
+
if (name) {
|
|
70
70
|
coursesObj.name = name.replace('Course name\n', '')
|
|
71
71
|
}
|
|
72
|
-
if(year) {
|
|
72
|
+
if (year) {
|
|
73
73
|
coursesObj.year = year.replace('Course number\n', '')
|
|
74
74
|
}
|
|
75
75
|
return coursesObj
|
|
76
|
-
}
|
|
77
|
-
);
|
|
76
|
+
})
|
|
78
77
|
}
|
|
79
78
|
|
|
80
|
-
if(profile.languages){
|
|
79
|
+
if (profile.languages) {
|
|
81
80
|
profile.languages = profile.languages.map(({ name, proficiency }) => ({
|
|
82
81
|
name: name ? name.replace('Language name\n', '') : undefined,
|
|
83
|
-
proficiency
|
|
84
|
-
}))
|
|
82
|
+
proficiency
|
|
83
|
+
}))
|
|
85
84
|
}
|
|
86
85
|
|
|
87
|
-
if(profile.projects){
|
|
86
|
+
if (profile.projects) {
|
|
88
87
|
profile.projects = profile.projects.map(
|
|
89
88
|
({ name, date, description, link }) => ({
|
|
90
89
|
name: name ? name.replace('Project name\n', '') : undefined,
|
|
91
90
|
date,
|
|
92
91
|
description: description ? description.replace('Project description\n', '') : undefined,
|
|
93
|
-
link
|
|
94
|
-
})
|
|
95
|
-
)
|
|
92
|
+
link
|
|
93
|
+
})
|
|
94
|
+
)
|
|
96
95
|
}
|
|
97
|
-
|
|
96
|
+
|
|
98
97
|
return profile
|
|
99
98
|
}
|
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
const logger = require('../logger')(__filename)
|
|
2
2
|
const scrapSection = require('../scrapSection')
|
|
3
3
|
|
|
4
|
-
const SEE_MORE_SELECTOR = '
|
|
5
|
-
const CLOSE_MODAL_SELECTOR = '.artdeco-modal__dismiss'
|
|
4
|
+
const SEE_MORE_SELECTOR = '#top-card-text-details-contact-info'
|
|
5
|
+
const CLOSE_MODAL_SELECTOR = '.artdeco-modal__dismiss'
|
|
6
6
|
|
|
7
7
|
const template = {
|
|
8
|
-
selector: '.pv-contact-info__contact-type',
|
|
8
|
+
selector: '.pv-contact-info__contact-type, .ci-vanity-url, .ci-email, .ci-phone, .ci-websites, .ci-birthday, .ci-ims, .ci-address',
|
|
9
9
|
fields: {
|
|
10
|
-
type: 'header',
|
|
10
|
+
type: 'header, h3',
|
|
11
11
|
values: {
|
|
12
|
-
selector: '.pv-contact-info__ci-container',
|
|
12
|
+
selector: '.pv-contact-info__ci-container, .t-14',
|
|
13
13
|
isMultipleFields: true
|
|
14
14
|
},
|
|
15
15
|
links: {
|
|
@@ -18,31 +18,29 @@ const template = {
|
|
|
18
18
|
isMultipleFields: true
|
|
19
19
|
}
|
|
20
20
|
}
|
|
21
|
-
}
|
|
22
|
-
const getContactInfo = async(page) => {
|
|
23
|
-
await page.
|
|
21
|
+
}
|
|
22
|
+
/**
 * Opens the profile's contact-info modal, scrapes it and closes it again.
 *
 * @param {object} page - Puppeteer page showing a profile.
 * @returns {Promise<*>} Result of scrapSection for the contact template, or
 *   undefined when the contact-info link is not on the page.
 */
const getContactInfo = async (page) => {
  // waitForSelector resolves with the element handle, so no second lookup is
  // needed; on timeout there is simply nothing to scrape.
  let element = null
  try {
    element = await page.waitForSelector(SEE_MORE_SELECTOR, { timeout: 2000 })
  } catch {
    logger.warn('contact-info selector not found')
  }
  if (!element) {
    return undefined
  }

  await element.click()

  const contactInfoIndicatorSelector = '.pv-profile-section__section-info, .artdeco-modal__content'
  try {
    await page.waitForSelector(contactInfoIndicatorSelector, { timeout: 5000 })
  } catch {
    // Scraping is still attempted; it just may find nothing.
    logger.warn('contact info was not found')
  }

  try {
    return await scrapSection(page, template)
  } finally {
    // Dismiss the modal even if scraping threw, so it does not cover the
    // profile for later scraping steps.
    const closeButton = await page.$(CLOSE_MODAL_SELECTOR)
    if (closeButton) { await closeButton.click() }
  }
}
|
|
47
45
|
|
|
48
46
|
module.exports = getContactInfo
|
package/src/profile/profile.js
CHANGED
|
@@ -1,81 +1,276 @@
|
|
|
1
1
|
const openPage = require('../openPage')
|
|
2
|
-
const scrapSection = require('../scrapSection')
|
|
3
|
-
const scrapAccomplishmentPanel = require('./scrapAccomplishmentPanel')
|
|
4
2
|
const scrollToPageBottom = require('./scrollToPageBottom')
|
|
5
3
|
const seeMoreButtons = require('./seeMoreButtons')
|
|
6
4
|
const contactInfo = require('./contactInfo')
|
|
7
|
-
const template = require('./profileScraperTemplate')
|
|
8
5
|
const cleanProfileData = require('./cleanProfileData')
|
|
9
6
|
|
|
10
7
|
const logger = require('../logger')(__filename)
|
|
11
8
|
|
|
9
|
+
/**
 * Extracts profile data from the rendered page in a single in-browser pass.
 *
 * Everything inside `page.evaluate` runs in the page context, so it can only
 * use DOM APIs and the helpers defined inside the callback. Extraction is
 * heuristic: sections are located by their <h2> heading text and fields are
 * read positionally from the non-empty <p> elements inside them.
 *
 * @param {object} page - Puppeteer page showing a profile.
 * @returns {Promise<object>} Object with keys profile, about, positions,
 *   educations, skills, recommendations, accomplishments, courses, languages,
 *   projects, peopleAlsoViewed, volunteerExperience and contact. Several of
 *   these are always returned empty by this function (see the return literal).
 */
const extractProfileData = async (page) => {
  return page.evaluate(() => {
    // Trimmed text of an element, or '' when the element is missing.
    const txt = (el) => el ? (el.textContent || '').trim() : ''

    // First <section> whose first <h2> text starts with `headingText`, else null.
    const findSection = (headingText) => {
      const sections = document.querySelectorAll('section')
      for (const s of sections) {
        const h2 = s.querySelector('h2')
        if (h2 && h2.textContent.trim().startsWith(headingText)) return s
      }
      return null
    }

    // Item containers of a section: the direct <div> children of the element
    // that follows the heading's parent, keeping only those containing a <p>.
    const getContentDivs = (section) => {
      if (!section) return []
      const h2 = section.querySelector('h2')
      if (!h2) return []
      const content = h2.parentElement?.nextElementSibling
      if (!content) return []
      return [...content.querySelectorAll(':scope > div')].filter(d => d.querySelector('p'))
    }

    // Profile top card
    // Taken to be the first section with more than 200 characters of text whose
    // heading is not one of the known non-profile headings below.
    const sections = document.querySelectorAll('section')
    let topSection = null
    for (const s of sections) {
      const h2 = s.querySelector('h2')
      if (h2 && ['0 notifications', 'Suggested for you', 'Analytics', 'Activity'].includes(h2.textContent.trim())) continue
      if (s.textContent.trim().length > 200) { topSection = s; break }
    }

    const nameH1 = document.querySelector('h1')
    // Without an <h1>, fall back to the document title minus the site suffix.
    const profileName = nameH1 ? txt(nameH1) : document.title.replace(' | LinkedIn', '').trim()
    let headline = ''
    let location = ''
    if (topSection) {
      const allP = [...topSection.querySelectorAll('p')]
      // Headline is assumed to be the first paragraph of the top card.
      headline = txt(allP[0])
      // Location: first paragraph containing a comma but none of 'at ',
      // '\u00B7' (middle dot) or 'follow'.
      // NOTE(review): this can also match a headline that contains a comma.
      for (const p of allP) {
        const t = txt(p)
        if (t.includes(',') && !t.includes('at ') && !t.includes('\u00B7') && !t.includes('follow')) {
          location = t
          break
        }
      }
    }
    const photoImg = document.querySelector('img[src*="profile-displayphoto"]')
    const profile = {
      name: profileName,
      headline,
      location,
      // Never populated by this extractor.
      connections: '',
      imageurl: photoImg ? photoImg.getAttribute('src') || '' : ''
    }

    // Experience
    const positions = []
    const expSection = findSection('Experience')
    if (expSection) {
      // One group per `componentkey^="entity-collection"` element; presumably one per company - confirm.
      const companyGroups = expSection.querySelectorAll('[componentkey^="entity-collection"]')
      companyGroups.forEach(group => {
        const allP = [...group.querySelectorAll('p')].filter(p => txt(p).length > 0)
        if (allP.length === 0) return
        // The first non-empty paragraph is used as the company name.
        const companyName = txt(allP[0])
        const companyLink = group.querySelector('a[href*="/company/"]')
        const companyUrl = companyLink ? companyLink.getAttribute('href') : ''

        const positionLis = group.querySelectorAll('ul > li')
        if (positionLis.length > 0) {
          // Group contains a list: every <li> with text becomes one position.
          positionLis.forEach(li => {
            const liPs = [...li.querySelectorAll('p')].filter(p => txt(p).length > 0)
            if (liPs.length === 0) return
            const descEl = li.querySelector('[data-testid="expandable-text-box"]')
            let dateStr = ''
            // Date line: first paragraph after the title containing a middle dot
            // or any four-digit run.
            for (let i = 1; i < liPs.length; i++) {
              const t = txt(liPs[i])
              if (t.includes('\u00B7') || /\d{4}/.test(t)) { dateStr = t; break }
            }
            // Keep the part before the middle dot, then split start/end on ' - '.
            const dateRange = dateStr.split('\u00B7')[0].trim()
            const dateParts = dateRange.split(' - ')
            positions.push({
              title: txt(liPs[0]),
              companyName,
              link: companyUrl,
              url: companyUrl,
              location: '',
              description: descEl ? txt(descEl) : '',
              date: dateRange,
              date1: dateParts[0] ? dateParts[0].trim() : '',
              date2: dateParts[1] ? dateParts[1].trim() : ''
            })
          })
        } else {
          // No list: the whole group is one position.
          const descEl = group.querySelector('[data-testid="expandable-text-box"]')
          let dateStr = ''
          // NOTE(review): unlike the list branch, the four-digit test here only
          // looks at the first space-separated word - confirm this is intended.
          for (let i = 1; i < allP.length; i++) {
            const t = txt(allP[i])
            if (t.includes('\u00B7') || /\d{4}/.test(t.split(' ')[0])) { dateStr = t; break }
          }
          const dateRange = dateStr.split('\u00B7')[0].trim()
          const dateParts = dateRange.split(' - ')
          positions.push({
            // Positional guess: 4th paragraph if present, else the 2nd.
            // NOTE(review): allP[0] is used as companyName above, so title and
            // company depend on the paragraph order of this layout - verify.
            title: txt(allP[3]) || txt(allP[1]),
            companyName,
            link: companyUrl,
            url: companyUrl,
            location: txt(allP[2]) || '',
            description: descEl ? txt(descEl) : '',
            date: dateRange,
            date1: dateParts[0] ? dateParts[0].trim() : '',
            date2: dateParts[1] ? dateParts[1].trim() : ''
          })
        }
      })
    }

    // Education
    const educations = []
    const eduSection = findSection('Education')
    const eduDivs = getContentDivs(eduSection)
    eduDivs.forEach(div => {
      const ps = [...div.querySelectorAll('p')].filter(p => txt(p).length > 0)
      if (ps.length === 0) return
      const schoolLink = div.querySelector('a[href*="/school/"]')
      const url = schoolLink ? schoolLink.getAttribute('href') : ''
      // Third paragraph is taken as the date range, split on an en dash (U+2013).
      const dateStr = txt(ps[2]) || ''
      const dateParts = dateStr.split('\u2013')
      educations.push({
        title: txt(ps[0]),
        // NOTE(review): degree and fieldOfStudy both read the same paragraph.
        degree: txt(ps[1]) || '',
        fieldOfStudy: txt(ps[1]) || '',
        url,
        date1: dateParts[0] ? dateParts[0].trim() : '',
        date2: dateParts[1] ? dateParts[1].trim() : '',
        description: ''
      })
    })

    // Skills
    const skills = []
    const skillsSection = findSection('Skills')
    if (skillsSection) {
      const allSectionP = [...skillsSection.querySelectorAll('p')].filter(p => txt(p).length > 0)
      // Reads every second paragraph; presumably names alternate with a detail
      // line - TODO confirm, an odd extra paragraph would shift the pairing.
      for (let i = 0; i < allSectionP.length; i += 2) {
        const name = txt(allSectionP[i])
        if (name && !name.includes('Show all') && !name.includes('Private')) {
          skills.push({ title: name, count: '' })
        }
      }
    }

    // Languages
    const languages = []
    const langSection = findSection('Languages')
    if (langSection) {
      const langDivs = getContentDivs(langSection)
      langDivs.forEach(div => {
        const ps = [...div.querySelectorAll('p')].filter(p => txt(p).length > 0)
        // Paragraphs are consumed in (name, proficiency) pairs.
        for (let i = 0; i < ps.length; i += 2) {
          const n = txt(ps[i])
          const pr = ps[i + 1] ? txt(ps[i + 1]) : ''
          if (n) languages.push({ name: n, proficiency: pr })
        }
      })
    }

    // Projects
    const projects = []
    const projSection = findSection('Projects')
    const projDivs = getContentDivs(projSection)
    projDivs.forEach(div => {
      const ps = [...div.querySelectorAll('p')].filter(p => txt(p).length > 0)
      if (ps.length === 0) return
      const descEl = div.querySelector('[data-testid="expandable-text-box"]')
      // First anchor whose href contains "http".
      const link = div.querySelector('a[href*="http"]')
      projects.push({
        name: txt(ps[0]),
        date: txt(ps[1]) || '',
        description: descEl ? txt(descEl) : '',
        link: link ? link.getAttribute('href') : ''
      })
    })

    // Certifications
    // Items of the section whose heading starts with 'Licenses' are reported
    // under `accomplishments`, title only.
    const accomplishments = []
    const certSection = findSection('Licenses')
    const certDivs = getContentDivs(certSection)
    certDivs.forEach(div => {
      const ps = [...div.querySelectorAll('p')].filter(p => txt(p).length > 0)
      if (ps.length > 0) accomplishments.push({ title: txt(ps[0]), count: '', items: [] })
    })

    // about, recommendations, courses, peopleAlsoViewed, volunteerExperience and
    // contact are not extracted here and are returned as empty placeholders.
    return {
      profile,
      about: { text: '' },
      positions,
      educations,
      skills,
      recommendations: { givenCount: '0', receivedCount: '0', given: [], received: [] },
      accomplishments,
      courses: [],
      languages,
      projects,
      peopleAlsoViewed: [],
      volunteerExperience: [],
      contact: []
    }
  })
}
|
|
218
|
+
|
|
12
219
|
module.exports = async (browser, cookies, url, waitTimeToScrapMs = 500, hasToGetContactInfo = false, puppeteerAuthenticate = undefined) => {
|
|
13
220
|
logger.info(`starting scraping url: ${url}`)
|
|
14
221
|
|
|
15
222
|
const page = await openPage({ browser, cookies, url, puppeteerAuthenticate })
|
|
16
|
-
|
|
17
|
-
|
|
223
|
+
|
|
224
|
+
// Check for authwall (expired session)
|
|
225
|
+
const isAuthwall = await page.evaluate(() =>
|
|
226
|
+
window.location.href.includes('/authwall') || document.title.toLowerCase().includes('inschrijven')
|
|
227
|
+
)
|
|
228
|
+
if (isAuthwall) {
|
|
229
|
+
await page.close()
|
|
230
|
+
throw new Error('authwall: LinkedIn session expired, re-authentication required')
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
// Accept cookie consent if present
|
|
234
|
+
await page.evaluate(() => {
|
|
235
|
+
const btn = Array.from(document.querySelectorAll('button')).find(
|
|
236
|
+
(b) => b.textContent.trim().toLowerCase() === 'accept'
|
|
237
|
+
)
|
|
238
|
+
if (btn) btn.click()
|
|
239
|
+
})
|
|
240
|
+
await new Promise((r) => setTimeout(r, 2000))
|
|
241
|
+
|
|
242
|
+
// Wait for sections to appear (profile content is SDUI rendered)
|
|
243
|
+
await page.waitForFunction(() => {
|
|
244
|
+
return document.querySelectorAll('section h2').length > 1
|
|
245
|
+
}, { timeout: 30000 })
|
|
18
246
|
.catch(() => {
|
|
19
|
-
|
|
20
|
-
//because it can be just a false negative meaning LinkedIn only changed that selector but everything else is fine :)
|
|
21
|
-
logger.warn('profile selector was not found')
|
|
247
|
+
logger.warn('profile content did not fully render in time')
|
|
22
248
|
})
|
|
23
249
|
|
|
24
250
|
logger.info('scrolling page to the bottom')
|
|
25
251
|
await scrollToPageBottom(page)
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
252
|
+
|
|
253
|
+
// Wait for lazy-loaded sections to render after scrolling
|
|
254
|
+
await new Promise((resolve) => { setTimeout(resolve, 2000) })
|
|
255
|
+
|
|
256
|
+
// Scroll again in case new content was loaded
|
|
257
|
+
await scrollToPageBottom(page)
|
|
258
|
+
await new Promise((resolve) => { setTimeout(resolve, 1000) })
|
|
31
259
|
|
|
32
260
|
await seeMoreButtons.clickAll(page)
|
|
33
261
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
await new Promise((resolve) => { setTimeout(() => { resolve() }, waitTimeToScrapMs / 2)})
|
|
37
|
-
}
|
|
262
|
+
// Final wait for content to settle
|
|
263
|
+
await new Promise((resolve) => { setTimeout(resolve, 1000) })
|
|
38
264
|
|
|
39
|
-
const
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
const recommendationsReceived = await scrapSection(page, template.recommendationsReceived)
|
|
45
|
-
const recommendationsGiven = await scrapSection(page, template.recommendationsGiven)
|
|
46
|
-
const skills = await scrapSection(page, template.skills)
|
|
47
|
-
const accomplishments = await scrapSection(page, template.accomplishments)
|
|
48
|
-
const courses = await scrapAccomplishmentPanel(page, 'courses')
|
|
49
|
-
const languages = await scrapAccomplishmentPanel(page, 'languages')
|
|
50
|
-
const projects = await scrapAccomplishmentPanel(page, 'projects')
|
|
51
|
-
const volunteerExperience = await scrapSection(page, template.volunteerExperience)
|
|
52
|
-
const peopleAlsoViewed = await scrapSection(page, template.peopleAlsoViewed)
|
|
53
|
-
const contact = hasToGetContactInfo ? await contactInfo(page) : []
|
|
265
|
+
const rawProfile = await extractProfileData(page)
|
|
266
|
+
|
|
267
|
+
if (hasToGetContactInfo) {
|
|
268
|
+
rawProfile.contact = await contactInfo(page) || []
|
|
269
|
+
}
|
|
54
270
|
|
|
55
271
|
await page.close()
|
|
56
272
|
logger.info(`finished scraping url: ${url}`)
|
|
57
273
|
|
|
58
|
-
const rawProfile = {
|
|
59
|
-
profile,
|
|
60
|
-
about,
|
|
61
|
-
positions,
|
|
62
|
-
educations,
|
|
63
|
-
skills,
|
|
64
|
-
recommendations: {
|
|
65
|
-
givenCount: recommendationsCount ? recommendationsCount.given : "0",
|
|
66
|
-
receivedCount: recommendationsCount ? recommendationsCount.received : "0",
|
|
67
|
-
given: recommendationsReceived,
|
|
68
|
-
received: recommendationsGiven
|
|
69
|
-
},
|
|
70
|
-
accomplishments,
|
|
71
|
-
courses,
|
|
72
|
-
languages,
|
|
73
|
-
projects,
|
|
74
|
-
peopleAlsoViewed,
|
|
75
|
-
volunteerExperience,
|
|
76
|
-
contact
|
|
77
|
-
}
|
|
78
|
-
|
|
79
274
|
const cleanedProfile = cleanProfileData(rawProfile)
|
|
80
275
|
return cleanedProfile
|
|
81
276
|
}
|
|
@@ -1,189 +1,186 @@
|
|
|
1
|
-
const profileSelector = '.core-rail > *:first-child section >'
|
|
2
|
-
|
|
3
1
|
const template = {
|
|
4
2
|
profile: {
|
|
5
|
-
selector: '.
|
|
3
|
+
selector: '.scaffold-layout__main',
|
|
6
4
|
fields: {
|
|
7
|
-
|
|
8
|
-
headline:
|
|
9
|
-
location:
|
|
10
|
-
connections:
|
|
5
|
+
name: '.text-heading-xlarge',
|
|
6
|
+
headline: '.text-body-medium',
|
|
7
|
+
location: '.text-body-small.inline.t-black--light.break-words',
|
|
8
|
+
connections: '.t-bold',
|
|
11
9
|
imageurl: {
|
|
12
|
-
|
|
10
|
+
selector: 'img.pv-top-card-profile-picture__image',
|
|
13
11
|
attribute: 'src'
|
|
14
12
|
}
|
|
15
13
|
}
|
|
16
14
|
},
|
|
17
15
|
about: {
|
|
18
|
-
selector: '.
|
|
16
|
+
selector: '#about ~ .display-flex .inline-show-more-text',
|
|
19
17
|
fields: {
|
|
20
|
-
text: '
|
|
18
|
+
text: 'span[aria-hidden="true"]'
|
|
21
19
|
}
|
|
22
20
|
},
|
|
23
21
|
positions: {
|
|
24
|
-
selector: '
|
|
22
|
+
selector: '#experience ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
25
23
|
fields: {
|
|
26
|
-
title: '
|
|
24
|
+
title: '.mr1.hoverable-link-text.t-bold > span',
|
|
27
25
|
link: {
|
|
28
|
-
selector: 'a',
|
|
29
|
-
attribute: 'href'
|
|
26
|
+
selector: 'a.optional-action-target-wrapper',
|
|
27
|
+
attribute: 'href'
|
|
30
28
|
},
|
|
31
29
|
url: {
|
|
32
|
-
selector: 'a',
|
|
30
|
+
selector: 'a.optional-action-target-wrapper',
|
|
33
31
|
attribute: 'href'
|
|
34
32
|
},
|
|
35
|
-
companyName: '
|
|
36
|
-
location: '.
|
|
37
|
-
description: '.
|
|
38
|
-
date1: '.
|
|
39
|
-
date2: '.
|
|
33
|
+
companyName: '.t-14.t-normal > span',
|
|
34
|
+
location: '.t-14.t-normal.t-black--light > span',
|
|
35
|
+
description: '.pvs-list__outer-container .inline-show-more-text span[aria-hidden="true"]',
|
|
36
|
+
date1: '.pvs-entity__caption-wrapper',
|
|
37
|
+
date2: '.pvs-entity__caption-wrapper',
|
|
40
38
|
roles: {
|
|
41
|
-
selector: '
|
|
39
|
+
selector: '.pvs-entity__sub-components li.pvs-list__paged-list-item',
|
|
42
40
|
hasChildrenFields: true,
|
|
43
41
|
fields: {
|
|
44
|
-
title: '
|
|
45
|
-
description: '
|
|
42
|
+
title: '.mr1.hoverable-link-text.t-bold > span',
|
|
43
|
+
description: '.inline-show-more-text span[aria-hidden="true"]',
|
|
46
44
|
date: '.pvs-entity__caption-wrapper',
|
|
47
|
-
location: '.
|
|
45
|
+
location: '.t-14.t-normal.t-black--light > span'
|
|
48
46
|
}
|
|
49
47
|
}
|
|
50
48
|
}
|
|
51
49
|
},
|
|
52
50
|
educations: {
|
|
53
|
-
selector: '#education-
|
|
51
|
+
selector: '#education ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
54
52
|
fields: {
|
|
55
|
-
title: '
|
|
56
|
-
degree: '
|
|
53
|
+
title: '.hoverable-link-text.t-bold > span',
|
|
54
|
+
degree: '.t-14.t-normal > span',
|
|
57
55
|
url: {
|
|
58
56
|
selector: 'a',
|
|
59
57
|
attribute: 'href'
|
|
60
58
|
},
|
|
61
|
-
|
|
62
|
-
date1: '.
|
|
63
|
-
date2: '.
|
|
64
|
-
description: '.
|
|
59
|
+
fieldOfStudy: '.t-14.t-normal > span',
|
|
60
|
+
date1: '.pvs-entity__caption-wrapper',
|
|
61
|
+
date2: '.pvs-entity__caption-wrapper',
|
|
62
|
+
description: '.inline-show-more-text span[aria-hidden="true"]'
|
|
65
63
|
}
|
|
66
64
|
},
|
|
67
65
|
skills: {
|
|
68
|
-
selector: '.
|
|
66
|
+
selector: '#skills ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
69
67
|
fields: {
|
|
70
|
-
title: '.
|
|
71
|
-
count: '.
|
|
68
|
+
title: '.mr1.hoverable-link-text.t-bold > span',
|
|
69
|
+
count: '.t-14.t-normal.t-black--light > span'
|
|
72
70
|
}
|
|
73
71
|
},
|
|
74
72
|
recommendationsCount: {
|
|
75
|
-
selector: '.
|
|
73
|
+
selector: '#recommendations ~ .pvs-list__outer-container',
|
|
76
74
|
fields: {
|
|
77
75
|
received: '.artdeco-tab:nth-child(1)',
|
|
78
76
|
given: '.artdeco-tab:nth-child(2)'
|
|
79
77
|
}
|
|
80
78
|
},
|
|
81
79
|
recommendationsReceived: {
|
|
82
|
-
selector: '.
|
|
80
|
+
selector: '#recommendations ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
83
81
|
fields: {
|
|
84
82
|
user: {
|
|
85
|
-
selector: '
|
|
83
|
+
selector: 'a',
|
|
86
84
|
attribute: 'href'
|
|
87
85
|
},
|
|
88
|
-
text: '
|
|
86
|
+
text: '.inline-show-more-text span[aria-hidden="true"]',
|
|
89
87
|
profileImage: {
|
|
90
|
-
selector: '
|
|
88
|
+
selector: 'img',
|
|
91
89
|
attribute: 'src'
|
|
92
90
|
},
|
|
93
91
|
name: {
|
|
94
|
-
selector: '
|
|
92
|
+
selector: '.t-bold > span'
|
|
95
93
|
},
|
|
96
94
|
userDescription: {
|
|
97
|
-
selector: '.
|
|
95
|
+
selector: '.t-14.t-normal > span'
|
|
98
96
|
}
|
|
99
97
|
}
|
|
100
98
|
},
|
|
101
99
|
recommendationsGiven: {
|
|
102
|
-
selector: '.
|
|
100
|
+
selector: '#recommendations ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
103
101
|
fields: {
|
|
104
102
|
user: {
|
|
105
|
-
selector: '
|
|
103
|
+
selector: 'a',
|
|
106
104
|
attribute: 'href'
|
|
107
105
|
},
|
|
108
|
-
text: '
|
|
106
|
+
text: '.inline-show-more-text span[aria-hidden="true"]',
|
|
109
107
|
profileImage: {
|
|
110
|
-
selector: '
|
|
108
|
+
selector: 'img',
|
|
111
109
|
attribute: 'src'
|
|
112
110
|
},
|
|
113
111
|
name: {
|
|
114
|
-
selector: '
|
|
112
|
+
selector: '.t-bold > span'
|
|
115
113
|
},
|
|
116
114
|
userDescription: {
|
|
117
|
-
selector: '.
|
|
115
|
+
selector: '.t-14.t-normal > span'
|
|
118
116
|
}
|
|
119
117
|
}
|
|
120
118
|
},
|
|
121
119
|
accomplishments: {
|
|
122
|
-
selector: '.
|
|
120
|
+
selector: '#honors_and_awards ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
123
121
|
fields: {
|
|
124
|
-
count: '
|
|
125
|
-
title: '.
|
|
122
|
+
count: '.t-14.t-normal.t-black--light > span',
|
|
123
|
+
title: '.mr1.hoverable-link-text.t-bold > span',
|
|
126
124
|
items: {
|
|
127
|
-
selector: 'li',
|
|
125
|
+
selector: '.pvs-list__outer-container li',
|
|
128
126
|
isMultipleFields: true
|
|
129
127
|
}
|
|
130
128
|
}
|
|
131
129
|
},
|
|
132
130
|
peopleAlsoViewed: {
|
|
133
|
-
selector: '
|
|
131
|
+
selector: '.pv-browsemap-section li',
|
|
134
132
|
fields: {
|
|
135
133
|
user: {
|
|
136
134
|
selector: 'a',
|
|
137
135
|
attribute: 'href'
|
|
138
136
|
},
|
|
139
|
-
text: '
|
|
137
|
+
text: '.t-14.t-normal',
|
|
140
138
|
profileImage: {
|
|
141
|
-
selector: '
|
|
139
|
+
selector: 'img',
|
|
142
140
|
attribute: 'src'
|
|
143
141
|
},
|
|
144
142
|
name: {
|
|
145
|
-
selector: '.
|
|
143
|
+
selector: '.t-bold'
|
|
146
144
|
}
|
|
147
145
|
}
|
|
148
146
|
},
|
|
149
147
|
volunteerExperience: {
|
|
150
|
-
selector: '
|
|
148
|
+
selector: '#volunteering_experience ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
151
149
|
fields: {
|
|
152
|
-
title: '
|
|
153
|
-
experience: '
|
|
154
|
-
location: '.
|
|
155
|
-
description: '.
|
|
156
|
-
date1: '.
|
|
157
|
-
date2: '.
|
|
150
|
+
title: '.mr1.hoverable-link-text.t-bold > span',
|
|
151
|
+
experience: '.t-14.t-normal > span',
|
|
152
|
+
location: '.t-14.t-normal.t-black--light > span',
|
|
153
|
+
description: '.inline-show-more-text span[aria-hidden="true"]',
|
|
154
|
+
date1: '.pvs-entity__caption-wrapper',
|
|
155
|
+
date2: '.pvs-entity__caption-wrapper'
|
|
158
156
|
}
|
|
159
157
|
},
|
|
160
158
|
courses: {
|
|
161
|
-
selector: '.
|
|
159
|
+
selector: '#courses ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
162
160
|
fields: {
|
|
163
|
-
name: '.
|
|
164
|
-
year: '.
|
|
161
|
+
name: '.mr1.hoverable-link-text.t-bold > span',
|
|
162
|
+
year: '.t-14.t-normal > span'
|
|
165
163
|
}
|
|
166
164
|
},
|
|
167
165
|
languages: {
|
|
168
|
-
selector: '.
|
|
166
|
+
selector: '#languages ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
169
167
|
fields: {
|
|
170
|
-
name: '.
|
|
171
|
-
proficiency: '.
|
|
168
|
+
name: '.mr1.t-bold > span',
|
|
169
|
+
proficiency: '.t-14.t-normal.t-black--light > span'
|
|
172
170
|
}
|
|
173
171
|
},
|
|
174
172
|
projects: {
|
|
175
|
-
selector: '.
|
|
173
|
+
selector: '#projects ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
176
174
|
fields: {
|
|
177
|
-
name: '.
|
|
178
|
-
date: '.
|
|
179
|
-
description: '.
|
|
175
|
+
name: '.mr1.hoverable-link-text.t-bold > span',
|
|
176
|
+
date: '.pvs-entity__caption-wrapper',
|
|
177
|
+
description: '.inline-show-more-text span[aria-hidden="true"]',
|
|
180
178
|
link: {
|
|
181
|
-
selector: '
|
|
179
|
+
selector: 'a',
|
|
182
180
|
attribute: 'href'
|
|
183
181
|
}
|
|
184
182
|
}
|
|
185
183
|
}
|
|
186
184
|
}
|
|
187
185
|
|
|
188
|
-
|
|
189
186
|
module.exports = template
|
|
@@ -1,18 +1,11 @@
|
|
|
1
|
-
const scrapSection = require('../scrapSection')
|
|
2
|
-
const template = require('./profileScraperTemplate')
|
|
1
|
+
const scrapSection = require('../scrapSection')
|
|
2
|
+
const template = require('./profileScraperTemplate')
|
|
3
3
|
|
|
4
4
|
/**
 * Scrapes one accomplishment panel of the profile page.
 *
 * @param {object} page - page handed to scrapSection
 * @param {string} section - key of the panel in the scraper template
 * @returns {Promise<*>} an empty array when the template has no entry for
 *   `section`, otherwise whatever scrapSection resolves to
 */
const scrapAccomplishmentPanel = async (page, section) => {
  const sectionTemplate = template[section]

  return sectionTemplate ? scrapSection(page, sectionTemplate) : []
}
|
|
17
10
|
|
|
18
|
-
module.exports = scrapAccomplishmentPanel
|
|
11
|
+
module.exports = scrapAccomplishmentPanel
|
|
@@ -3,21 +3,20 @@ const logger = require('../logger')(__filename)
|
|
|
3
3
|
module.exports = async (page) => {
|
|
4
4
|
const MAX_TIMES_TO_SCROLL = 25
|
|
5
5
|
const TIMEOUT_BETWEEN_SCROLLS = 500
|
|
6
|
-
const PAGE_BOTTOM_SELECTOR_STRING = '#expanded-footer'
|
|
7
6
|
|
|
8
7
|
for (let i = 0; i < MAX_TIMES_TO_SCROLL; i++) {
|
|
9
8
|
await page.evaluate(() => window.scrollBy(0, window.innerHeight))
|
|
10
9
|
|
|
11
|
-
const hasReachedEnd = await page.
|
|
12
|
-
|
|
13
|
-
timeout: TIMEOUT_BETWEEN_SCROLLS
|
|
14
|
-
}).catch(() => {
|
|
15
|
-
logger.info(`scrolling to page bottom (${i + 1})`)
|
|
10
|
+
const hasReachedEnd = await page.evaluate(() => {
|
|
11
|
+
return (window.innerHeight + window.scrollY) >= (document.body.scrollHeight - 200)
|
|
16
12
|
})
|
|
17
13
|
|
|
18
14
|
if (hasReachedEnd) {
|
|
19
15
|
return
|
|
20
16
|
}
|
|
17
|
+
|
|
18
|
+
await new Promise(resolve => setTimeout(resolve, TIMEOUT_BETWEEN_SCROLLS))
|
|
19
|
+
logger.info(`scrolling to page bottom (${i + 1})`)
|
|
21
20
|
}
|
|
22
21
|
|
|
23
22
|
logger.warn('page bottom not found')
|
|
@@ -1,42 +1,18 @@
|
|
|
1
1
|
const logger = require('../logger')(__filename)
|
|
2
|
-
const seeMoreButtons = [
|
|
3
|
-
{
|
|
4
|
-
id: 'SHOW_MORE_ABOUT',
|
|
5
|
-
selector: '#line-clamp-show-more-button'
|
|
6
|
-
},{
|
|
7
|
-
id: 'SHOW_MORE_EXPERIENCES',
|
|
8
|
-
selector: '#experience-section .pv-profile-section__see-more-inline'
|
|
9
|
-
},{
|
|
10
|
-
id: 'SEE_MORE_EXPERIENCES',
|
|
11
|
-
selector: '#experience-section .inline-show-more-text__button'
|
|
12
|
-
},{
|
|
13
|
-
id: 'SHOW_MORE_CERTIFICATIONS',
|
|
14
|
-
selector: '#certifications-section .pv-profile-section__see-more-inline'
|
|
15
|
-
},{
|
|
16
|
-
id: 'SHOW_MORE_SKILLS',
|
|
17
|
-
selector: '.pv-skills-section__additional-skills'
|
|
18
|
-
},{
|
|
19
|
-
id: 'SEE_MORE_RECOMMENDATIONS',
|
|
20
|
-
selector: '.recommendations-inlining #line-clamp-show-more-button'
|
|
21
|
-
}
|
|
22
|
-
]
|
|
23
|
-
|
|
24
2
|
|
|
25
|
-
const clickAll = async(page) => {
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
3
|
+
/**
 * Clicks every inline "show more" text button on the page so collapsed
 * text is expanded before scraping.
 *
 * Expansion is best-effort: if the in-page script fails, a warning is
 * logged and scraping continues with the collapsed text.
 *
 * @param {object} page - page exposing `evaluate`
 * @returns {Promise<void>}
 */
const clickAll = async (page) => {
  let clicked
  try {
    clicked = await page.evaluate(() => {
      // Only click expandable text buttons (inline expand, not navigation)
      const expandButtons = document.querySelectorAll('[data-testid="expandable-text-button"]')
      expandButtons.forEach((btn) => { btn.click() })
      return expandButtons.length
    })
  } catch (e) {
    logger.warn(`couldn't click show-more buttons: ${e.message}`)
    return
  }

  if (clicked > 0) {
    logger.info(`clicked ${clicked} show-more buttons`)
    // Pause after clicking, presumably so the expanded text can render
    await new Promise(resolve => setTimeout(resolve, 500))
  }
}
|
|
41
17
|
|
|
42
18
|
module.exports = { clickAll }
|
package/src/scrapedin.js
CHANGED
|
@@ -4,7 +4,22 @@ const profile = require('./profile/profile')
|
|
|
4
4
|
const company = require('./company/company')
|
|
5
5
|
const logger = require('./logger')(__filename)
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
/**
 * Writes the browser's cookies to `cookiesPath` as pretty-printed JSON.
 *
 * Best-effort: any failure is logged as a warning and never thrown.
 *
 * @param {object} browser - browser whose cookies are saved
 * @param {string} [cookiesPath] - destination file; nothing happens when falsy
 * @returns {Promise<void>}
 */
const saveBrowserCookies = async (browser, cookiesPath) => {
  if (!cookiesPath) return
  try {
    let cookies
    if (typeof browser.cookies === 'function') {
      // Browser-level API: not tied to the URL of whichever tab is first,
      // so the session cookies are found even if that tab is blank.
      cookies = await browser.cookies()
    } else {
      // Fallback when the browser object has no cookies(): the page-level
      // API only returns cookies for the first tab's current URL.
      const pages = await browser.pages()
      if (pages.length === 0) return
      cookies = await pages[0].cookies()
    }
    const fs = require('fs')
    fs.writeFileSync(cookiesPath, JSON.stringify(cookies, null, 2))
    logger.info('cookies saved to: ' + cookiesPath)
  } catch (e) {
    logger.warn('failed to save cookies: ' + e.message)
  }
}
|
|
21
|
+
|
|
22
|
+
module.exports = async ({ cookies, email, password, isHeadless, hasToLog, hasToGetContactInfo, cookiesPath, puppeteerArgs, puppeteerAuthenticate, endpoint } = { isHeadless: true, hasToLog: false }) => {
|
|
8
23
|
if (!hasToLog) {
|
|
9
24
|
logger.stopLogging()
|
|
10
25
|
}
|
|
@@ -26,7 +41,11 @@ module.exports = async ({ cookies, email, password, isHeadless, hasToLog, hasToG
|
|
|
26
41
|
logger.info('email and password was provided, we\'re going to login...')
|
|
27
42
|
|
|
28
43
|
try {
|
|
29
|
-
await login(browser, email, password, logger)
|
|
44
|
+
const loginResult = await login(browser, email, password, logger)
|
|
45
|
+
// Only save cookies if login fully completed (no 2FA challenge)
|
|
46
|
+
if (loginResult && !loginResult.hadChallenge && cookiesPath) {
|
|
47
|
+
await saveBrowserCookies(browser, cookiesPath)
|
|
48
|
+
}
|
|
30
49
|
} catch (e) {
|
|
31
50
|
if (!endpoint) {
|
|
32
51
|
await browser.close()
|