@mvegter/scrapedin 1.0.33 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +4 -4
- package/src/company/company.js +19 -20
- package/src/company/companyScraperTemplate.js +24 -25
- package/src/login.js +13 -11
- package/src/openPage.js +3 -13
- package/src/profile/cleanProfileData.js +28 -29
- package/src/profile/contactInfo.js +17 -19
- package/src/profile/profile.js +193 -53
- package/src/profile/profileScraperTemplate.js +71 -74
- package/src/profile/scrapAccomplishmentPanel.js +7 -14
- package/src/profile/scrollToPageBottom.js +5 -6
- package/src/profile/seeMoreButtons.js +11 -35
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mvegter/scrapedin",
|
|
3
|
-
"version": "1.0.33",
|
|
4
|
-
"description": "linkedin scraper for
|
|
3
|
+
"version": "1.1.0",
|
|
4
|
+
"description": "linkedin scraper updated for 2025+ website",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"linkedin",
|
|
7
7
|
"scraper",
|
|
@@ -21,14 +21,14 @@
|
|
|
21
21
|
"author": "Wagner Leonardi <leonardiwagner@gmail.com>",
|
|
22
22
|
"license": "Apache-2.0",
|
|
23
23
|
"dependencies": {
|
|
24
|
-
"puppeteer": "
|
|
24
|
+
"puppeteer": "24.36.0",
|
|
25
25
|
"winston": "3.7.2"
|
|
26
26
|
},
|
|
27
27
|
"devDependencies": {
|
|
28
28
|
"standard": "17.0.0"
|
|
29
29
|
},
|
|
30
30
|
"engines": {
|
|
31
|
-
"node": ">=
|
|
31
|
+
"node": ">= 16.0.0"
|
|
32
32
|
},
|
|
33
33
|
"homepage": "https://github.com/linkedtales/scrapedin#readme"
|
|
34
34
|
}
|
package/src/company/company.js
CHANGED
|
@@ -5,31 +5,30 @@ const template = require('./companyScraperTemplate')
|
|
|
5
5
|
const logger = require('../logger')(__filename)
|
|
6
6
|
|
|
7
7
|
module.exports = async (browser, cookies, url, waitTimeToScrapMs = 500, puppeteerAuthenticate = undefined) => {
|
|
8
|
-
logger.info(`starting scraping url: ${url}`)
|
|
8
|
+
logger.info(`starting scraping url: ${url}`)
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
const company = {}
|
|
11
11
|
|
|
12
|
-
let page
|
|
13
|
-
if(url.includes('legacySchoolId=')){
|
|
14
|
-
|
|
12
|
+
let page
|
|
13
|
+
if (url.includes('legacySchoolId=')) {
|
|
14
|
+
page = await openPage({ browser, cookies, url, puppeteerAuthenticate })
|
|
15
15
|
|
|
16
|
-
|
|
16
|
+
const aboutSelector = 'a[href$="/about/"]'
|
|
17
17
|
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
} else{
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
18
|
+
company.url = page.url()
|
|
19
|
+
|
|
20
|
+
await page.$eval(aboutSelector, async about => await about.click())
|
|
21
|
+
await page.waitForNavigation()
|
|
22
|
+
} else {
|
|
23
|
+
company.url = url
|
|
24
|
+
url = url + '/about'
|
|
25
|
+
page = await openPage({ browser, cookies, url, puppeteerAuthenticate })
|
|
26
26
|
}
|
|
27
|
-
company.about = (await scrapSection(page, template.about))[0]
|
|
28
|
-
company.profile = (await scrapSection(page, template.profile))[0]
|
|
27
|
+
company.about = (await scrapSection(page, template.about))[0]
|
|
28
|
+
company.profile = (await scrapSection(page, template.profile))[0]
|
|
29
|
+
|
|
30
|
+
await page.close()
|
|
31
|
+
logger.info(`finished scraping url: ${url}`)
|
|
29
32
|
|
|
30
|
-
await page.close();
|
|
31
|
-
logger.info(`finished scraping url: ${url}`);
|
|
32
|
-
|
|
33
33
|
return company
|
|
34
|
-
|
|
35
34
|
}
|
package/src/company/companyScraperTemplate.js
CHANGED
|
@@ -1,30 +1,29 @@
|
|
|
1
1
|
const template = {
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
}
|
|
12
|
-
},
|
|
13
|
-
about: {
|
|
14
|
-
selector: '.org-grid__core-rail--no-margin-left',
|
|
15
|
-
fields: {
|
|
16
|
-
overview: 'p',
|
|
17
|
-
types:{
|
|
18
|
-
selector: 'dl dt',
|
|
19
|
-
isMultipleFields: true
|
|
20
|
-
},
|
|
21
|
-
values:{
|
|
22
|
-
selector: 'dl dd:not(.org-page-details__employees-on-linkedin-count)',
|
|
23
|
-
isMultipleFields: true
|
|
24
|
-
}
|
|
25
|
-
}
|
|
2
|
+
profile: {
|
|
3
|
+
selector: '.org-top-card, .top-card-layout',
|
|
4
|
+
fields: {
|
|
5
|
+
name: 'h1',
|
|
6
|
+
headline: '.org-top-card-summary__tagline, .top-card-layout__headline, p',
|
|
7
|
+
imageurl: {
|
|
8
|
+
selector: 'img.org-top-card-primary-content__logo, img.top-card-layout__entity-image',
|
|
9
|
+
attribute: 'src'
|
|
10
|
+
}
|
|
26
11
|
}
|
|
12
|
+
},
|
|
13
|
+
about: {
|
|
14
|
+
selector: '.org-grid__core-rail--no-margin-left, .org-about-us-organization-description, .core-section-container',
|
|
15
|
+
fields: {
|
|
16
|
+
overview: 'p',
|
|
17
|
+
types: {
|
|
18
|
+
selector: 'dl dt, .org-page-details__definition-term',
|
|
19
|
+
isMultipleFields: true
|
|
20
|
+
},
|
|
21
|
+
values: {
|
|
22
|
+
selector: 'dl dd:not(.org-page-details__employees-on-linkedin-count), .org-page-details__definition-text',
|
|
23
|
+
isMultipleFields: true
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
27
|
}
|
|
28
28
|
|
|
29
|
-
|
|
30
29
|
module.exports = template
|
package/src/login.js
CHANGED
|
@@ -14,34 +14,36 @@ module.exports = async (browser, email, password) => {
|
|
|
14
14
|
await page.$('#password')
|
|
15
15
|
.then((passwordElement) => passwordElement.type(password))
|
|
16
16
|
|
|
17
|
-
await page
|
|
18
|
-
.then((button) => button[0].click())
|
|
17
|
+
await page.locator('button[type="submit"]').click()
|
|
19
18
|
|
|
20
|
-
return page.
|
|
21
|
-
timeout: 15000
|
|
22
|
-
})
|
|
19
|
+
return page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 15000 })
|
|
23
20
|
.then(async () => {
|
|
24
|
-
|
|
25
|
-
|
|
21
|
+
const currentUrl = page.url()
|
|
22
|
+
if (currentUrl.includes('/feed') || currentUrl.includes('/mynetwork') || currentUrl.includes('/in/')) {
|
|
23
|
+
logger.info('logged in, redirected to: ' + currentUrl)
|
|
24
|
+
await page.close()
|
|
25
|
+
return
|
|
26
|
+
}
|
|
27
|
+
throw new Error('unexpected redirect: ' + currentUrl)
|
|
26
28
|
})
|
|
27
29
|
.catch(async () => {
|
|
28
30
|
logger.warn('successful login element was not found')
|
|
29
31
|
const emailError = await page.evaluate(() => {
|
|
30
|
-
const e = document.querySelector('div[error-for=username]')
|
|
32
|
+
const e = document.querySelector('div[error-for=username], #error-for-username')
|
|
31
33
|
if (!e) { return false }
|
|
32
34
|
const style = window.getComputedStyle(e)
|
|
33
35
|
return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
|
|
34
36
|
})
|
|
35
37
|
|
|
36
38
|
const passwordError = await page.evaluate(() => {
|
|
37
|
-
const e = document.querySelector('div[error-for=password]')
|
|
39
|
+
const e = document.querySelector('div[error-for=password], #error-for-password')
|
|
38
40
|
if (!e) { return false }
|
|
39
41
|
const style = window.getComputedStyle(e)
|
|
40
42
|
return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
|
|
41
43
|
})
|
|
42
44
|
|
|
43
45
|
const manualChallengeRequested = await page.evaluate(() => {
|
|
44
|
-
const e = document.querySelector('.flow-challenge-content')
|
|
46
|
+
const e = document.querySelector('.flow-challenge-content, #challenge, [data-test-id="challenge"]')
|
|
45
47
|
if (!e) { return false }
|
|
46
48
|
const style = window.getComputedStyle(e)
|
|
47
49
|
return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
|
|
@@ -57,7 +59,7 @@ module.exports = async (browser, email, password) => {
|
|
|
57
59
|
return Promise.reject(new Error('linkedin: invalid password'))
|
|
58
60
|
}
|
|
59
61
|
|
|
60
|
-
if (
|
|
62
|
+
if (manualChallengeRequested) {
|
|
61
63
|
logger.warn('manual check was required')
|
|
62
64
|
return Promise.reject(new Error(`linkedin: manual check was required, verify if your login is properly working manually or report this issue: ${pkg.name} ${pkg.version} ${pkg.bugs.url}`))
|
|
63
65
|
}
|
package/src/openPage.js
CHANGED
|
@@ -1,23 +1,13 @@
|
|
|
1
|
-
const
|
|
2
|
-
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
|
|
3
|
-
// "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
|
|
4
|
-
// "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
|
|
5
|
-
// "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:34.0) Gecko/20100101 Firefox/34.0",
|
|
6
|
-
// "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
|
|
7
|
-
// "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
|
|
8
|
-
// "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
|
|
9
|
-
// "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
|
|
10
|
-
// "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
|
|
11
|
-
]
|
|
1
|
+
const AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
|
|
12
2
|
|
|
13
3
|
module.exports = async ({ browser, cookies, url, puppeteerAuthenticate }) => {
|
|
14
4
|
const page = await browser.newPage()
|
|
15
|
-
await page.setDefaultNavigationTimeout(
|
|
5
|
+
await page.setDefaultNavigationTimeout(60000)
|
|
16
6
|
|
|
17
7
|
if (cookies) {
|
|
18
8
|
await page.setCookie(...cookies)
|
|
19
9
|
}
|
|
20
|
-
await page.setUserAgent(
|
|
10
|
+
await page.setUserAgent(AGENT)
|
|
21
11
|
await page.setExtraHTTPHeaders({ 'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8' })
|
|
22
12
|
await page.setViewport({
|
|
23
13
|
width: 1920,
|
package/src/profile/cleanProfileData.js
CHANGED
|
@@ -2,7 +2,7 @@ const logger = require('../logger')(__filename)
|
|
|
2
2
|
const pkg = require('../package')
|
|
3
3
|
|
|
4
4
|
module.exports = (profile) => {
|
|
5
|
-
if(!profile?.profile?.name) {
|
|
5
|
+
if (!profile?.profile?.name) {
|
|
6
6
|
const messageError = `LinkedIn website changed and ${pkg.name} ${pkg.version} can't read basic data. Please report this issue at ${pkg.bugs.url}`
|
|
7
7
|
logger.error(messageError, '')
|
|
8
8
|
throw new Error(messageError)
|
|
@@ -11,17 +11,17 @@ module.exports = (profile) => {
|
|
|
11
11
|
profile.profile.summary = profile?.about?.text
|
|
12
12
|
|
|
13
13
|
profile.positions.forEach((position) => {
|
|
14
|
-
if(position.title){
|
|
15
|
-
|
|
14
|
+
if (position.title) {
|
|
15
|
+
position.title = position.title.replace('Company Name\n', '')
|
|
16
16
|
}
|
|
17
|
-
if(position.description) {
|
|
18
|
-
position.description = position.description.replace('See more', '')
|
|
19
|
-
position.description = position.description.replace('see more', '')
|
|
20
|
-
|
|
17
|
+
if (position.description) {
|
|
18
|
+
position.description = position.description.replace('See more', '')
|
|
19
|
+
position.description = position.description.replace('see more', '')
|
|
20
|
+
position.description = position.description.replace('See less', '')
|
|
21
21
|
}
|
|
22
|
-
if(position.roles) {
|
|
22
|
+
if (position.roles) {
|
|
23
23
|
position.roles.forEach((role) => {
|
|
24
|
-
if(role.title) {
|
|
24
|
+
if (role.title) {
|
|
25
25
|
role.title = role.title.replace('Title\n', '')
|
|
26
26
|
}
|
|
27
27
|
if (role.date) {
|
|
@@ -29,7 +29,7 @@ module.exports = (profile) => {
|
|
|
29
29
|
role.date2 = role.date.replace('·', '-').split('-')[1].trim()
|
|
30
30
|
delete role.date
|
|
31
31
|
}
|
|
32
|
-
if(role.description) {
|
|
32
|
+
if (role.description) {
|
|
33
33
|
role.description = role.description.replace('See more', '')
|
|
34
34
|
role.description = role.description.replace('see more', '')
|
|
35
35
|
}
|
|
@@ -37,63 +37,62 @@ module.exports = (profile) => {
|
|
|
37
37
|
}
|
|
38
38
|
})
|
|
39
39
|
|
|
40
|
-
if(profile.recommendations.receivedCount) {
|
|
40
|
+
if (profile.recommendations.receivedCount) {
|
|
41
41
|
profile.recommendations.receivedCount = profile.recommendations.receivedCount.replace(/[^\d]/g, '')
|
|
42
42
|
}
|
|
43
43
|
|
|
44
|
-
if(profile.recommendations.givenCount) {
|
|
44
|
+
if (profile.recommendations.givenCount) {
|
|
45
45
|
profile.recommendations.givenCount = profile.recommendations.givenCount.replace(/[^\d]/g, '')
|
|
46
46
|
}
|
|
47
47
|
|
|
48
|
-
if(profile.recommendations.received) {
|
|
48
|
+
if (profile.recommendations.received) {
|
|
49
49
|
profile.recommendations.received.forEach((recommendation) => {
|
|
50
|
-
if(recommendation.summary){
|
|
50
|
+
if (recommendation.summary) {
|
|
51
51
|
recommendation.summary = recommendation.summary.replace('See more', '')
|
|
52
52
|
recommendation.summary = recommendation.summary.replace('See less', '')
|
|
53
53
|
}
|
|
54
54
|
})
|
|
55
55
|
}
|
|
56
56
|
|
|
57
|
-
if(profile.recommendations.given) {
|
|
57
|
+
if (profile.recommendations.given) {
|
|
58
58
|
profile.recommendations.given.forEach((recommendation) => {
|
|
59
|
-
if(recommendation.summary){
|
|
59
|
+
if (recommendation.summary) {
|
|
60
60
|
recommendation.summary = recommendation.summary.replace('See more', '')
|
|
61
61
|
recommendation.summary = recommendation.summary.replace('See less', '')
|
|
62
62
|
}
|
|
63
63
|
})
|
|
64
64
|
}
|
|
65
65
|
|
|
66
|
-
if(profile.courses){
|
|
66
|
+
if (profile.courses) {
|
|
67
67
|
profile.courses = profile.courses.map(({ name, year }) => {
|
|
68
68
|
const coursesObj = {}
|
|
69
|
-
if(name) {
|
|
69
|
+
if (name) {
|
|
70
70
|
coursesObj.name = name.replace('Course name\n', '')
|
|
71
71
|
}
|
|
72
|
-
if(year) {
|
|
72
|
+
if (year) {
|
|
73
73
|
coursesObj.year = year.replace('Course number\n', '')
|
|
74
74
|
}
|
|
75
75
|
return coursesObj
|
|
76
|
-
}
|
|
77
|
-
);
|
|
76
|
+
})
|
|
78
77
|
}
|
|
79
78
|
|
|
80
|
-
if(profile.languages){
|
|
79
|
+
if (profile.languages) {
|
|
81
80
|
profile.languages = profile.languages.map(({ name, proficiency }) => ({
|
|
82
81
|
name: name ? name.replace('Language name\n', '') : undefined,
|
|
83
|
-
proficiency
|
|
84
|
-
}))
|
|
82
|
+
proficiency
|
|
83
|
+
}))
|
|
85
84
|
}
|
|
86
85
|
|
|
87
|
-
if(profile.projects){
|
|
86
|
+
if (profile.projects) {
|
|
88
87
|
profile.projects = profile.projects.map(
|
|
89
88
|
({ name, date, description, link }) => ({
|
|
90
89
|
name: name ? name.replace('Project name\n', '') : undefined,
|
|
91
90
|
date,
|
|
92
91
|
description: description ? description.replace('Project description\n', '') : undefined,
|
|
93
|
-
link
|
|
94
|
-
})
|
|
95
|
-
)
|
|
92
|
+
link
|
|
93
|
+
})
|
|
94
|
+
)
|
|
96
95
|
}
|
|
97
|
-
|
|
96
|
+
|
|
98
97
|
return profile
|
|
99
98
|
}
|
package/src/profile/contactInfo.js
CHANGED
|
@@ -1,15 +1,15 @@
|
|
|
1
1
|
const logger = require('../logger')(__filename)
|
|
2
2
|
const scrapSection = require('../scrapSection')
|
|
3
3
|
|
|
4
|
-
const SEE_MORE_SELECTOR = '
|
|
5
|
-
const CLOSE_MODAL_SELECTOR = '.artdeco-modal__dismiss'
|
|
4
|
+
const SEE_MORE_SELECTOR = '#top-card-text-details-contact-info'
|
|
5
|
+
const CLOSE_MODAL_SELECTOR = '.artdeco-modal__dismiss'
|
|
6
6
|
|
|
7
7
|
const template = {
|
|
8
|
-
selector: '.pv-contact-info__contact-type',
|
|
8
|
+
selector: '.pv-contact-info__contact-type, .ci-vanity-url, .ci-email, .ci-phone, .ci-websites, .ci-birthday, .ci-ims, .ci-address',
|
|
9
9
|
fields: {
|
|
10
|
-
type: 'header',
|
|
10
|
+
type: 'header, h3',
|
|
11
11
|
values: {
|
|
12
|
-
selector: '.pv-contact-info__ci-container',
|
|
12
|
+
selector: '.pv-contact-info__ci-container, .t-14',
|
|
13
13
|
isMultipleFields: true
|
|
14
14
|
},
|
|
15
15
|
links: {
|
|
@@ -18,31 +18,29 @@ const template = {
|
|
|
18
18
|
isMultipleFields: true
|
|
19
19
|
}
|
|
20
20
|
}
|
|
21
|
-
}
|
|
22
|
-
const getContactInfo = async(page) => {
|
|
23
|
-
await page.
|
|
21
|
+
}
|
|
22
|
+
const getContactInfo = async (page) => {
|
|
23
|
+
await page.waitForSelector(SEE_MORE_SELECTOR, { timeout: 2000 })
|
|
24
24
|
.catch(() => {
|
|
25
|
-
logger.warn('contact-info
|
|
25
|
+
logger.warn('contact-info selector not found')
|
|
26
26
|
return {}
|
|
27
27
|
})
|
|
28
28
|
|
|
29
29
|
const element = await page.$(SEE_MORE_SELECTOR)
|
|
30
|
-
if(element){
|
|
30
|
+
if (element) {
|
|
31
31
|
await element.click()
|
|
32
|
-
const contactInfoIndicatorSelector = '
|
|
33
|
-
await page.
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
32
|
+
const contactInfoIndicatorSelector = '.pv-profile-section__section-info, .artdeco-modal__content'
|
|
33
|
+
await page.waitForSelector(contactInfoIndicatorSelector, { timeout: 5000 })
|
|
34
|
+
.catch(() => {
|
|
35
|
+
logger.warn('contact info was not found')
|
|
36
|
+
})
|
|
37
|
+
|
|
38
38
|
const contactInfo = await scrapSection(page, template)
|
|
39
39
|
const closeButton = await page.$(CLOSE_MODAL_SELECTOR)
|
|
40
|
-
if(closeButton)
|
|
41
|
-
await closeButton.click()
|
|
40
|
+
if (closeButton) { await closeButton.click() }
|
|
42
41
|
|
|
43
42
|
return contactInfo
|
|
44
43
|
}
|
|
45
|
-
|
|
46
44
|
}
|
|
47
45
|
|
|
48
46
|
module.exports = getContactInfo
|
package/src/profile/profile.js
CHANGED
|
@@ -1,81 +1,221 @@
|
|
|
1
1
|
const openPage = require('../openPage')
|
|
2
|
-
const scrapSection = require('../scrapSection')
|
|
3
|
-
const scrapAccomplishmentPanel = require('./scrapAccomplishmentPanel')
|
|
4
2
|
const scrollToPageBottom = require('./scrollToPageBottom')
|
|
5
3
|
const seeMoreButtons = require('./seeMoreButtons')
|
|
6
4
|
const contactInfo = require('./contactInfo')
|
|
7
|
-
const template = require('./profileScraperTemplate')
|
|
8
5
|
const cleanProfileData = require('./cleanProfileData')
|
|
9
6
|
|
|
10
7
|
const logger = require('../logger')(__filename)
|
|
11
8
|
|
|
9
|
+
const extractProfileData = async (page) => {
|
|
10
|
+
return page.evaluate(() => {
|
|
11
|
+
const txt = (el) => el ? (el.textContent || '').trim() : ''
|
|
12
|
+
|
|
13
|
+
const byViewName = (name) => document.querySelector(`[data-view-name="${name}"]`)
|
|
14
|
+
|
|
15
|
+
const getSection = (viewName) => {
|
|
16
|
+
const el = byViewName(viewName)
|
|
17
|
+
if (!el) return null
|
|
18
|
+
return el.querySelector('section') || el.closest('section')
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
// Get items from a section. For sections without ul/li, items are divs
|
|
22
|
+
// found after the h2 heading: h2.parent.nextSibling > div > div > div
|
|
23
|
+
const getSectionItems = (viewName) => {
|
|
24
|
+
const section = getSection(viewName)
|
|
25
|
+
if (!section) return []
|
|
26
|
+
const ul = section.querySelector('ul')
|
|
27
|
+
if (ul) return [...ul.querySelectorAll(':scope > li')]
|
|
28
|
+
const h2 = section.querySelector('h2')
|
|
29
|
+
if (!h2) return []
|
|
30
|
+
const afterH2 = h2.parentElement.nextElementSibling
|
|
31
|
+
if (!afterH2) return []
|
|
32
|
+
const itemDivs = afterH2.querySelectorAll(':scope > div > div > div')
|
|
33
|
+
return [...itemDivs].filter(d => d.querySelectorAll('p').length > 0)
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
// Profile top card: name is in h2, headline/location in p tags
|
|
37
|
+
const titleName = document.title.replace(' | LinkedIn', '').trim()
|
|
38
|
+
const mainLevel = byViewName('profile-main-level')
|
|
39
|
+
const topSection = mainLevel ? (mainLevel.querySelector('section') || mainLevel.closest('section')) : null
|
|
40
|
+
const nameH2 = topSection ? topSection.querySelector('h2') : null
|
|
41
|
+
const photoImg = document.querySelector('img[src*="profile-displayphoto"]')
|
|
42
|
+
|
|
43
|
+
const profileName = nameH2 ? txt(nameH2) : titleName
|
|
44
|
+
let headline = ''
|
|
45
|
+
let location = ''
|
|
46
|
+
|
|
47
|
+
if (topSection) {
|
|
48
|
+
const allP = [...topSection.querySelectorAll('p')]
|
|
49
|
+
headline = txt(allP[0])
|
|
50
|
+
for (const p of allP) {
|
|
51
|
+
const t = txt(p)
|
|
52
|
+
if (t.includes(',') && !t.includes('at ') && !t.includes('\xB7')) {
|
|
53
|
+
location = t
|
|
54
|
+
break
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
const profile = {
|
|
60
|
+
name: profileName,
|
|
61
|
+
headline,
|
|
62
|
+
location,
|
|
63
|
+
connections: '',
|
|
64
|
+
imageurl: photoImg ? photoImg.getAttribute('src') || '' : ''
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
// Experience: uses LazyColumn with company groups containing ul > li
|
|
68
|
+
const positions = []
|
|
69
|
+
const expSection = getSection('profile-card-experience')
|
|
70
|
+
if (expSection) {
|
|
71
|
+
const lazyCol = expSection.querySelector('[data-component-type]') || expSection
|
|
72
|
+
const groups = [...lazyCol.children]
|
|
73
|
+
|
|
74
|
+
groups.forEach(group => {
|
|
75
|
+
const ul = group.querySelector('ul')
|
|
76
|
+
if (!ul) return
|
|
77
|
+
|
|
78
|
+
// Company info is in p tags before the ul
|
|
79
|
+
const allPs = group.querySelectorAll('p')
|
|
80
|
+
const beforeUlPs = []
|
|
81
|
+
for (const p of allPs) {
|
|
82
|
+
if (ul.contains(p)) break
|
|
83
|
+
beforeUlPs.push(p)
|
|
84
|
+
}
|
|
85
|
+
const companyName = txt(beforeUlPs[0])
|
|
86
|
+
const companyLink = group.querySelector('a[href*="/company/"]')
|
|
87
|
+
const companyUrl = companyLink ? companyLink.getAttribute('href') : ''
|
|
88
|
+
const companyLocation = txt(beforeUlPs[2])
|
|
89
|
+
|
|
90
|
+
const lis = [...ul.querySelectorAll(':scope > li')]
|
|
91
|
+
lis.forEach(li => {
|
|
92
|
+
const ps = [...li.querySelectorAll('div[role="button"] p')]
|
|
93
|
+
const descEl = li.querySelector('[data-testid="expandable-text-box"]')
|
|
94
|
+
const dateStr = txt(ps[2])
|
|
95
|
+
const dateRange = dateStr.split('\xB7')[0].trim()
|
|
96
|
+
const dateParts = dateRange.split(' - ')
|
|
97
|
+
positions.push({
|
|
98
|
+
title: txt(ps[0]),
|
|
99
|
+
companyName,
|
|
100
|
+
link: companyUrl,
|
|
101
|
+
url: companyUrl,
|
|
102
|
+
location: companyLocation,
|
|
103
|
+
description: descEl ? txt(descEl) : '',
|
|
104
|
+
date: dateRange,
|
|
105
|
+
date1: dateParts[0] ? dateParts[0].trim() : '',
|
|
106
|
+
date2: dateParts[1] ? dateParts[1].trim() : ''
|
|
107
|
+
})
|
|
108
|
+
})
|
|
109
|
+
})
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
// Education
|
|
113
|
+
const eduItems = getSectionItems('profile-card-education')
|
|
114
|
+
const educations = eduItems.map(item => {
|
|
115
|
+
const ps = [...item.querySelectorAll('p')]
|
|
116
|
+
const link = item.querySelector('a[href*="/school/"]') ||
|
|
117
|
+
item.closest('div')?.parentElement?.querySelector('a[href*="/school/"]')
|
|
118
|
+
return {
|
|
119
|
+
title: txt(ps[0]),
|
|
120
|
+
degree: txt(ps[1]),
|
|
121
|
+
fieldOfStudy: txt(ps[1]),
|
|
122
|
+
url: link ? link.getAttribute('href') : '',
|
|
123
|
+
date1: ps[2] ? txt(ps[2]).split('\u2013')[0].trim() : '',
|
|
124
|
+
date2: ps[2] ? (txt(ps[2]).split('\u2013')[1] || '').trim() : '',
|
|
125
|
+
description: ''
|
|
126
|
+
}
|
|
127
|
+
})
|
|
128
|
+
|
|
129
|
+
// Skills
|
|
130
|
+
const skillItems = getSectionItems('profile-card-skills')
|
|
131
|
+
const skills = skillItems.map(item => {
|
|
132
|
+
const ps = [...item.querySelectorAll('p')]
|
|
133
|
+
return { title: txt(ps[0]), count: '' }
|
|
134
|
+
})
|
|
135
|
+
|
|
136
|
+
// Languages
|
|
137
|
+
const langItems = getSectionItems('profile-card-languages')
|
|
138
|
+
const languages = langItems.map(item => {
|
|
139
|
+
const ps = [...item.querySelectorAll('p')]
|
|
140
|
+
return { name: txt(ps[0]), proficiency: txt(ps[1]) }
|
|
141
|
+
})
|
|
142
|
+
|
|
143
|
+
// Projects
|
|
144
|
+
const projItems = getSectionItems('profile-card-projects')
|
|
145
|
+
const projects = projItems.map(item => {
|
|
146
|
+
const ps = [...item.querySelectorAll('p')]
|
|
147
|
+
const descEl = item.querySelector('[data-testid="expandable-text-box"]')
|
|
148
|
+
const link = item.querySelector('a[href*="http"]')
|
|
149
|
+
return {
|
|
150
|
+
name: txt(ps[0]),
|
|
151
|
+
date: txt(ps[1]),
|
|
152
|
+
description: descEl ? txt(descEl) : '',
|
|
153
|
+
link: link ? link.getAttribute('href') : ''
|
|
154
|
+
}
|
|
155
|
+
})
|
|
156
|
+
|
|
157
|
+
// Certifications
|
|
158
|
+
const certItems = getSectionItems('profile-card-licenses-and-certifications')
|
|
159
|
+
const accomplishments = certItems.map(item => {
|
|
160
|
+
const ps = [...item.querySelectorAll('p')]
|
|
161
|
+
return { title: txt(ps[0]), count: '', items: [] }
|
|
162
|
+
})
|
|
163
|
+
|
|
164
|
+
return {
|
|
165
|
+
profile,
|
|
166
|
+
about: { text: '' },
|
|
167
|
+
positions,
|
|
168
|
+
educations,
|
|
169
|
+
skills,
|
|
170
|
+
recommendations: { givenCount: '0', receivedCount: '0', given: [], received: [] },
|
|
171
|
+
accomplishments,
|
|
172
|
+
courses: [],
|
|
173
|
+
languages,
|
|
174
|
+
projects,
|
|
175
|
+
peopleAlsoViewed: [],
|
|
176
|
+
volunteerExperience: [],
|
|
177
|
+
contact: []
|
|
178
|
+
}
|
|
179
|
+
})
|
|
180
|
+
}
|
|
181
|
+
|
|
12
182
|
module.exports = async (browser, cookies, url, waitTimeToScrapMs = 500, hasToGetContactInfo = false, puppeteerAuthenticate = undefined) => {
|
|
13
183
|
logger.info(`starting scraping url: ${url}`)
|
|
14
184
|
|
|
15
185
|
const page = await openPage({ browser, cookies, url, puppeteerAuthenticate })
|
|
16
|
-
|
|
17
|
-
|
|
186
|
+
|
|
187
|
+
// Wait for the SDUI profile to fully hydrate
|
|
188
|
+
await page.waitForFunction(() => {
|
|
189
|
+
return document.querySelector('[data-view-name="profile-card-experience"]')
|
|
190
|
+
}, { timeout: 30000 })
|
|
18
191
|
.catch(() => {
|
|
19
|
-
|
|
20
|
-
//because it can be just a false negative meaning LinkedIn only changed that selector but everything else is fine :)
|
|
21
|
-
logger.warn('profile selector was not found')
|
|
192
|
+
logger.warn('profile content did not fully render in time')
|
|
22
193
|
})
|
|
23
194
|
|
|
24
195
|
logger.info('scrolling page to the bottom')
|
|
25
196
|
await scrollToPageBottom(page)
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
197
|
+
|
|
198
|
+
// Wait for lazy-loaded sections to render after scrolling
|
|
199
|
+
await new Promise((resolve) => { setTimeout(resolve, 2000) })
|
|
200
|
+
|
|
201
|
+
// Scroll again in case new content was loaded
|
|
202
|
+
await scrollToPageBottom(page)
|
|
203
|
+
await new Promise((resolve) => { setTimeout(resolve, 1000) })
|
|
31
204
|
|
|
32
205
|
await seeMoreButtons.clickAll(page)
|
|
33
206
|
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
await new Promise((resolve) => { setTimeout(() => { resolve() }, waitTimeToScrapMs / 2)})
|
|
37
|
-
}
|
|
207
|
+
// Final wait for content to settle
|
|
208
|
+
await new Promise((resolve) => { setTimeout(resolve, 1000) })
|
|
38
209
|
|
|
39
|
-
const
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
const recommendationsReceived = await scrapSection(page, template.recommendationsReceived)
|
|
45
|
-
const recommendationsGiven = await scrapSection(page, template.recommendationsGiven)
|
|
46
|
-
const skills = await scrapSection(page, template.skills)
|
|
47
|
-
const accomplishments = await scrapSection(page, template.accomplishments)
|
|
48
|
-
const courses = await scrapAccomplishmentPanel(page, 'courses')
|
|
49
|
-
const languages = await scrapAccomplishmentPanel(page, 'languages')
|
|
50
|
-
const projects = await scrapAccomplishmentPanel(page, 'projects')
|
|
51
|
-
const volunteerExperience = await scrapSection(page, template.volunteerExperience)
|
|
52
|
-
const peopleAlsoViewed = await scrapSection(page, template.peopleAlsoViewed)
|
|
53
|
-
const contact = hasToGetContactInfo ? await contactInfo(page) : []
|
|
210
|
+
const rawProfile = await extractProfileData(page)
|
|
211
|
+
|
|
212
|
+
if (hasToGetContactInfo) {
|
|
213
|
+
rawProfile.contact = await contactInfo(page) || []
|
|
214
|
+
}
|
|
54
215
|
|
|
55
216
|
await page.close()
|
|
56
217
|
logger.info(`finished scraping url: ${url}`)
|
|
57
218
|
|
|
58
|
-
const rawProfile = {
|
|
59
|
-
profile,
|
|
60
|
-
about,
|
|
61
|
-
positions,
|
|
62
|
-
educations,
|
|
63
|
-
skills,
|
|
64
|
-
recommendations: {
|
|
65
|
-
givenCount: recommendationsCount ? recommendationsCount.given : "0",
|
|
66
|
-
receivedCount: recommendationsCount ? recommendationsCount.received : "0",
|
|
67
|
-
given: recommendationsReceived,
|
|
68
|
-
received: recommendationsGiven
|
|
69
|
-
},
|
|
70
|
-
accomplishments,
|
|
71
|
-
courses,
|
|
72
|
-
languages,
|
|
73
|
-
projects,
|
|
74
|
-
peopleAlsoViewed,
|
|
75
|
-
volunteerExperience,
|
|
76
|
-
contact
|
|
77
|
-
}
|
|
78
|
-
|
|
79
219
|
const cleanedProfile = cleanProfileData(rawProfile)
|
|
80
220
|
return cleanedProfile
|
|
81
221
|
}
|
package/src/profile/profileScraperTemplate.js
CHANGED
|
@@ -1,189 +1,186 @@
|
|
|
1
|
-
const profileSelector = '.core-rail > *:first-child section >'
|
|
2
|
-
|
|
3
1
|
const template = {
|
|
4
2
|
profile: {
|
|
5
|
-
selector: '.
|
|
3
|
+
selector: '.scaffold-layout__main',
|
|
6
4
|
fields: {
|
|
7
|
-
|
|
8
|
-
headline:
|
|
9
|
-
location:
|
|
10
|
-
connections:
|
|
5
|
+
name: '.text-heading-xlarge',
|
|
6
|
+
headline: '.text-body-medium',
|
|
7
|
+
location: '.text-body-small.inline.t-black--light.break-words',
|
|
8
|
+
connections: '.t-bold',
|
|
11
9
|
imageurl: {
|
|
12
|
-
|
|
10
|
+
selector: 'img.pv-top-card-profile-picture__image',
|
|
13
11
|
attribute: 'src'
|
|
14
12
|
}
|
|
15
13
|
}
|
|
16
14
|
},
|
|
17
15
|
about: {
|
|
18
|
-
selector: '.
|
|
16
|
+
selector: '#about ~ .display-flex .inline-show-more-text',
|
|
19
17
|
fields: {
|
|
20
|
-
text: '
|
|
18
|
+
text: 'span[aria-hidden="true"]'
|
|
21
19
|
}
|
|
22
20
|
},
|
|
23
21
|
positions: {
|
|
24
|
-
selector: '
|
|
22
|
+
selector: '#experience ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
25
23
|
fields: {
|
|
26
|
-
title: '
|
|
24
|
+
title: '.mr1.hoverable-link-text.t-bold > span',
|
|
27
25
|
link: {
|
|
28
|
-
selector: 'a',
|
|
29
|
-
attribute: 'href'
|
|
26
|
+
selector: 'a.optional-action-target-wrapper',
|
|
27
|
+
attribute: 'href'
|
|
30
28
|
},
|
|
31
29
|
url: {
|
|
32
|
-
selector: 'a',
|
|
30
|
+
selector: 'a.optional-action-target-wrapper',
|
|
33
31
|
attribute: 'href'
|
|
34
32
|
},
|
|
35
|
-
companyName: '
|
|
36
|
-
location: '.
|
|
37
|
-
description: '.
|
|
38
|
-
date1: '.
|
|
39
|
-
date2: '.
|
|
33
|
+
companyName: '.t-14.t-normal > span',
|
|
34
|
+
location: '.t-14.t-normal.t-black--light > span',
|
|
35
|
+
description: '.pvs-list__outer-container .inline-show-more-text span[aria-hidden="true"]',
|
|
36
|
+
date1: '.pvs-entity__caption-wrapper',
|
|
37
|
+
date2: '.pvs-entity__caption-wrapper',
|
|
40
38
|
roles: {
|
|
41
|
-
selector: '
|
|
39
|
+
selector: '.pvs-entity__sub-components li.pvs-list__paged-list-item',
|
|
42
40
|
hasChildrenFields: true,
|
|
43
41
|
fields: {
|
|
44
|
-
title: '
|
|
45
|
-
description: '
|
|
42
|
+
title: '.mr1.hoverable-link-text.t-bold > span',
|
|
43
|
+
description: '.inline-show-more-text span[aria-hidden="true"]',
|
|
46
44
|
date: '.pvs-entity__caption-wrapper',
|
|
47
|
-
location: '.
|
|
45
|
+
location: '.t-14.t-normal.t-black--light > span'
|
|
48
46
|
}
|
|
49
47
|
}
|
|
50
48
|
}
|
|
51
49
|
},
|
|
52
50
|
educations: {
|
|
53
|
-
selector: '#education-
|
|
51
|
+
selector: '#education ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
54
52
|
fields: {
|
|
55
|
-
title: '
|
|
56
|
-
degree: '
|
|
53
|
+
title: '.hoverable-link-text.t-bold > span',
|
|
54
|
+
degree: '.t-14.t-normal > span',
|
|
57
55
|
url: {
|
|
58
56
|
selector: 'a',
|
|
59
57
|
attribute: 'href'
|
|
60
58
|
},
|
|
61
|
-
|
|
62
|
-
date1: '.
|
|
63
|
-
date2: '.
|
|
64
|
-
description: '.
|
|
59
|
+
fieldOfStudy: '.t-14.t-normal > span',
|
|
60
|
+
date1: '.pvs-entity__caption-wrapper',
|
|
61
|
+
date2: '.pvs-entity__caption-wrapper',
|
|
62
|
+
description: '.inline-show-more-text span[aria-hidden="true"]'
|
|
65
63
|
}
|
|
66
64
|
},
|
|
67
65
|
skills: {
|
|
68
|
-
selector: '.
|
|
66
|
+
selector: '#skills ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
69
67
|
fields: {
|
|
70
|
-
title: '.
|
|
71
|
-
count: '.
|
|
68
|
+
title: '.mr1.hoverable-link-text.t-bold > span',
|
|
69
|
+
count: '.t-14.t-normal.t-black--light > span'
|
|
72
70
|
}
|
|
73
71
|
},
|
|
74
72
|
recommendationsCount: {
|
|
75
|
-
selector: '.
|
|
73
|
+
selector: '#recommendations ~ .pvs-list__outer-container',
|
|
76
74
|
fields: {
|
|
77
75
|
received: '.artdeco-tab:nth-child(1)',
|
|
78
76
|
given: '.artdeco-tab:nth-child(2)'
|
|
79
77
|
}
|
|
80
78
|
},
|
|
81
79
|
recommendationsReceived: {
|
|
82
|
-
selector: '.
|
|
80
|
+
selector: '#recommendations ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
83
81
|
fields: {
|
|
84
82
|
user: {
|
|
85
|
-
selector: '
|
|
83
|
+
selector: 'a',
|
|
86
84
|
attribute: 'href'
|
|
87
85
|
},
|
|
88
|
-
text: '
|
|
86
|
+
text: '.inline-show-more-text span[aria-hidden="true"]',
|
|
89
87
|
profileImage: {
|
|
90
|
-
selector: '
|
|
88
|
+
selector: 'img',
|
|
91
89
|
attribute: 'src'
|
|
92
90
|
},
|
|
93
91
|
name: {
|
|
94
|
-
selector: '
|
|
92
|
+
selector: '.t-bold > span'
|
|
95
93
|
},
|
|
96
94
|
userDescription: {
|
|
97
|
-
selector: '.
|
|
95
|
+
selector: '.t-14.t-normal > span'
|
|
98
96
|
}
|
|
99
97
|
}
|
|
100
98
|
},
|
|
101
99
|
recommendationsGiven: {
|
|
102
|
-
selector: '.
|
|
100
|
+
selector: '#recommendations ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
103
101
|
fields: {
|
|
104
102
|
user: {
|
|
105
|
-
selector: '
|
|
103
|
+
selector: 'a',
|
|
106
104
|
attribute: 'href'
|
|
107
105
|
},
|
|
108
|
-
text: '
|
|
106
|
+
text: '.inline-show-more-text span[aria-hidden="true"]',
|
|
109
107
|
profileImage: {
|
|
110
|
-
selector: '
|
|
108
|
+
selector: 'img',
|
|
111
109
|
attribute: 'src'
|
|
112
110
|
},
|
|
113
111
|
name: {
|
|
114
|
-
selector: '
|
|
112
|
+
selector: '.t-bold > span'
|
|
115
113
|
},
|
|
116
114
|
userDescription: {
|
|
117
|
-
selector: '.
|
|
115
|
+
selector: '.t-14.t-normal > span'
|
|
118
116
|
}
|
|
119
117
|
}
|
|
120
118
|
},
|
|
121
119
|
accomplishments: {
|
|
122
|
-
selector: '.
|
|
120
|
+
selector: '#honors_and_awards ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
123
121
|
fields: {
|
|
124
|
-
count: '
|
|
125
|
-
title: '.
|
|
122
|
+
count: '.t-14.t-normal.t-black--light > span',
|
|
123
|
+
title: '.mr1.hoverable-link-text.t-bold > span',
|
|
126
124
|
items: {
|
|
127
|
-
selector: 'li',
|
|
125
|
+
selector: '.pvs-list__outer-container li',
|
|
128
126
|
isMultipleFields: true
|
|
129
127
|
}
|
|
130
128
|
}
|
|
131
129
|
},
|
|
132
130
|
peopleAlsoViewed: {
|
|
133
|
-
selector: '
|
|
131
|
+
selector: '.pv-browsemap-section li',
|
|
134
132
|
fields: {
|
|
135
133
|
user: {
|
|
136
134
|
selector: 'a',
|
|
137
135
|
attribute: 'href'
|
|
138
136
|
},
|
|
139
|
-
text: '
|
|
137
|
+
text: '.t-14.t-normal',
|
|
140
138
|
profileImage: {
|
|
141
|
-
selector: '
|
|
139
|
+
selector: 'img',
|
|
142
140
|
attribute: 'src'
|
|
143
141
|
},
|
|
144
142
|
name: {
|
|
145
|
-
selector: '.
|
|
143
|
+
selector: '.t-bold'
|
|
146
144
|
}
|
|
147
145
|
}
|
|
148
146
|
},
|
|
149
147
|
volunteerExperience: {
|
|
150
|
-
selector: '
|
|
148
|
+
selector: '#volunteering_experience ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
151
149
|
fields: {
|
|
152
|
-
title: '
|
|
153
|
-
experience: '
|
|
154
|
-
location: '.
|
|
155
|
-
description: '.
|
|
156
|
-
date1: '.
|
|
157
|
-
date2: '.
|
|
150
|
+
title: '.mr1.hoverable-link-text.t-bold > span',
|
|
151
|
+
experience: '.t-14.t-normal > span',
|
|
152
|
+
location: '.t-14.t-normal.t-black--light > span',
|
|
153
|
+
description: '.inline-show-more-text span[aria-hidden="true"]',
|
|
154
|
+
date1: '.pvs-entity__caption-wrapper',
|
|
155
|
+
date2: '.pvs-entity__caption-wrapper'
|
|
158
156
|
}
|
|
159
157
|
},
|
|
160
158
|
courses: {
|
|
161
|
-
selector: '.
|
|
159
|
+
selector: '#courses ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
162
160
|
fields: {
|
|
163
|
-
name: '.
|
|
164
|
-
year: '.
|
|
161
|
+
name: '.mr1.hoverable-link-text.t-bold > span',
|
|
162
|
+
year: '.t-14.t-normal > span'
|
|
165
163
|
}
|
|
166
164
|
},
|
|
167
165
|
languages: {
|
|
168
|
-
selector: '.
|
|
166
|
+
selector: '#languages ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
169
167
|
fields: {
|
|
170
|
-
name: '.
|
|
171
|
-
proficiency: '.
|
|
168
|
+
name: '.mr1.t-bold > span',
|
|
169
|
+
proficiency: '.t-14.t-normal.t-black--light > span'
|
|
172
170
|
}
|
|
173
171
|
},
|
|
174
172
|
projects: {
|
|
175
|
-
selector: '.
|
|
173
|
+
selector: '#projects ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
|
|
176
174
|
fields: {
|
|
177
|
-
name: '.
|
|
178
|
-
date: '.
|
|
179
|
-
description: '.
|
|
175
|
+
name: '.mr1.hoverable-link-text.t-bold > span',
|
|
176
|
+
date: '.pvs-entity__caption-wrapper',
|
|
177
|
+
description: '.inline-show-more-text span[aria-hidden="true"]',
|
|
180
178
|
link: {
|
|
181
|
-
selector: '
|
|
179
|
+
selector: 'a',
|
|
182
180
|
attribute: 'href'
|
|
183
181
|
}
|
|
184
182
|
}
|
|
185
183
|
}
|
|
186
184
|
}
|
|
187
185
|
|
|
188
|
-
|
|
189
186
|
module.exports = template
|
|
@@ -1,18 +1,11 @@
|
|
|
1
|
-
const scrapSection = require('../scrapSection')
|
|
2
|
-
const template = require('./profileScraperTemplate')
|
|
1
|
+
const scrapSection = require('../scrapSection')
|
|
2
|
+
const template = require('./profileScraperTemplate')
|
|
3
3
|
|
|
4
4
|
const scrapAccomplishmentPanel = async (page, section) => {
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
const openingButton = await page.$(queryString);
|
|
8
|
-
|
|
9
|
-
if (openingButton) {
|
|
10
|
-
await page.evaluate((q) => {
|
|
11
|
-
document.querySelector(q).click();
|
|
12
|
-
}, queryString);
|
|
13
|
-
|
|
14
|
-
return scrapSection(page, template[section]);
|
|
5
|
+
if (!template[section]) {
|
|
6
|
+
return []
|
|
15
7
|
}
|
|
16
|
-
|
|
8
|
+
return scrapSection(page, template[section])
|
|
9
|
+
}
|
|
17
10
|
|
|
18
|
-
module.exports = scrapAccomplishmentPanel
|
|
11
|
+
module.exports = scrapAccomplishmentPanel
|
|
@@ -3,21 +3,20 @@ const logger = require('../logger')(__filename)
|
|
|
3
3
|
module.exports = async (page) => {
|
|
4
4
|
const MAX_TIMES_TO_SCROLL = 25
|
|
5
5
|
const TIMEOUT_BETWEEN_SCROLLS = 500
|
|
6
|
-
const PAGE_BOTTOM_SELECTOR_STRING = '#expanded-footer'
|
|
7
6
|
|
|
8
7
|
for (let i = 0; i < MAX_TIMES_TO_SCROLL; i++) {
|
|
9
8
|
await page.evaluate(() => window.scrollBy(0, window.innerHeight))
|
|
10
9
|
|
|
11
|
-
const hasReachedEnd = await page.
|
|
12
|
-
|
|
13
|
-
timeout: TIMEOUT_BETWEEN_SCROLLS
|
|
14
|
-
}).catch(() => {
|
|
15
|
-
logger.info(`scrolling to page bottom (${i + 1})`)
|
|
10
|
+
const hasReachedEnd = await page.evaluate(() => {
|
|
11
|
+
return (window.innerHeight + window.scrollY) >= (document.body.scrollHeight - 200)
|
|
16
12
|
})
|
|
17
13
|
|
|
18
14
|
if (hasReachedEnd) {
|
|
19
15
|
return
|
|
20
16
|
}
|
|
17
|
+
|
|
18
|
+
await new Promise(resolve => setTimeout(resolve, TIMEOUT_BETWEEN_SCROLLS))
|
|
19
|
+
logger.info(`scrolling to page bottom (${i + 1})`)
|
|
21
20
|
}
|
|
22
21
|
|
|
23
22
|
logger.warn('page bottom not found')
|
|
@@ -1,42 +1,18 @@
|
|
|
1
1
|
const logger = require('../logger')(__filename)
|
|
2
|
-
const seeMoreButtons = [
|
|
3
|
-
{
|
|
4
|
-
id: 'SHOW_MORE_ABOUT',
|
|
5
|
-
selector: '#line-clamp-show-more-button'
|
|
6
|
-
},{
|
|
7
|
-
id: 'SHOW_MORE_EXPERIENCES',
|
|
8
|
-
selector: '#experience-section .pv-profile-section__see-more-inline'
|
|
9
|
-
},{
|
|
10
|
-
id: 'SEE_MORE_EXPERIENCES',
|
|
11
|
-
selector: '#experience-section .inline-show-more-text__button'
|
|
12
|
-
},{
|
|
13
|
-
id: 'SHOW_MORE_CERTIFICATIONS',
|
|
14
|
-
selector: '#certifications-section .pv-profile-section__see-more-inline'
|
|
15
|
-
},{
|
|
16
|
-
id: 'SHOW_MORE_SKILLS',
|
|
17
|
-
selector: '.pv-skills-section__additional-skills'
|
|
18
|
-
},{
|
|
19
|
-
id: 'SEE_MORE_RECOMMENDATIONS',
|
|
20
|
-
selector: '.recommendations-inlining #line-clamp-show-more-button'
|
|
21
|
-
}
|
|
22
|
-
]
|
|
23
|
-
|
|
24
2
|
|
|
25
|
-
const clickAll = async(page) => {
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
3
|
+
const clickAll = async (page) => {
|
|
4
|
+
const clicked = await page.evaluate(() => {
|
|
5
|
+
let count = 0
|
|
6
|
+
// Only click expandable text buttons (inline expand, not navigation)
|
|
7
|
+
const expandButtons = document.querySelectorAll('[data-testid="expandable-text-button"]')
|
|
8
|
+
expandButtons.forEach(btn => { btn.click(); count++ })
|
|
9
|
+
return count
|
|
10
|
+
})
|
|
29
11
|
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
await elem.click()
|
|
34
|
-
.catch((e) => logger.warn(`couldn't click on ${button.selector}, it's probably invisible`))
|
|
35
|
-
}
|
|
36
|
-
}
|
|
12
|
+
if (clicked > 0) {
|
|
13
|
+
logger.info(`clicked ${clicked} show-more buttons`)
|
|
14
|
+
await new Promise(resolve => setTimeout(resolve, 500))
|
|
37
15
|
}
|
|
38
|
-
|
|
39
|
-
return
|
|
40
16
|
}
|
|
41
17
|
|
|
42
18
|
module.exports = { clickAll }
|