@mvegter/scrapedin 1.0.33 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@mvegter/scrapedin",
3
- "version": "1.0.33",
4
- "description": "linkedin scraper for 2020 website",
3
+ "version": "1.1.0",
4
+ "description": "linkedin scraper updated for 2025+ website",
5
5
  "keywords": [
6
6
  "linkedin",
7
7
  "scraper",
@@ -21,14 +21,14 @@
21
21
  "author": "Wagner Leonardi <leonardiwagner@gmail.com>",
22
22
  "license": "Apache-2.0",
23
23
  "dependencies": {
24
- "puppeteer": "14.1.1",
24
+ "puppeteer": "24.36.0",
25
25
  "winston": "3.7.2"
26
26
  },
27
27
  "devDependencies": {
28
28
  "standard": "17.0.0"
29
29
  },
30
30
  "engines": {
31
- "node": ">= 7.6.0"
31
+ "node": ">= 16.0.0"
32
32
  },
33
33
  "homepage": "https://github.com/linkedtales/scrapedin#readme"
34
34
  }
@@ -5,31 +5,30 @@ const template = require('./companyScraperTemplate')
5
5
  const logger = require('../logger')(__filename)
6
6
 
7
7
  module.exports = async (browser, cookies, url, waitTimeToScrapMs = 500, puppeteerAuthenticate = undefined) => {
8
- logger.info(`starting scraping url: ${url}`);
8
+ logger.info(`starting scraping url: ${url}`)
9
9
 
10
- let company = {};
10
+ const company = {}
11
11
 
12
- let page;
13
- if(url.includes('legacySchoolId=')){
14
- page = await openPage({ browser, cookies, url, puppeteerAuthenticate });
12
+ let page
13
+ if (url.includes('legacySchoolId=')) {
14
+ page = await openPage({ browser, cookies, url, puppeteerAuthenticate })
15
15
 
16
- const aboutSelector = 'a[href$="/about/"]';
16
+ const aboutSelector = 'a[href$="/about/"]'
17
17
 
18
- company.url = page.url();
19
-
20
- await page.$eval(aboutSelector, async about => await about.click());
21
- await page.waitForNavigation();
22
- } else{
23
- company.url = url;
24
- url = url + '/about';
25
- page = await openPage({ browser, cookies, url, puppeteerAuthenticate });
18
+ company.url = page.url()
19
+
20
+ await page.$eval(aboutSelector, async about => await about.click())
21
+ await page.waitForNavigation()
22
+ } else {
23
+ company.url = url
24
+ url = url + '/about'
25
+ page = await openPage({ browser, cookies, url, puppeteerAuthenticate })
26
26
  }
27
- company.about = (await scrapSection(page, template.about))[0];
28
- company.profile = (await scrapSection(page, template.profile))[0];
27
+ company.about = (await scrapSection(page, template.about))[0]
28
+ company.profile = (await scrapSection(page, template.profile))[0]
29
+
30
+ await page.close()
31
+ logger.info(`finished scraping url: ${url}`)
29
32
 
30
- await page.close();
31
- logger.info(`finished scraping url: ${url}`);
32
-
33
33
  return company
34
-
35
34
  }
@@ -1,30 +1,29 @@
1
1
  const template = {
2
- profile: {
3
- selector: '.org-top-card',
4
- fields: {
5
- name: `h1`,
6
- headline: `p`,
7
- imageurl: {
8
- selector: `img.org-top-card-primary-content__logo`,
9
- attribute: 'src'
10
- }
11
- }
12
- },
13
- about: {
14
- selector: '.org-grid__core-rail--no-margin-left',
15
- fields: {
16
- overview: 'p',
17
- types:{
18
- selector: 'dl dt',
19
- isMultipleFields: true
20
- },
21
- values:{
22
- selector: 'dl dd:not(.org-page-details__employees-on-linkedin-count)',
23
- isMultipleFields: true
24
- }
25
- }
2
+ profile: {
3
+ selector: '.org-top-card, .top-card-layout',
4
+ fields: {
5
+ name: 'h1',
6
+ headline: '.org-top-card-summary__tagline, .top-card-layout__headline, p',
7
+ imageurl: {
8
+ selector: 'img.org-top-card-primary-content__logo, img.top-card-layout__entity-image',
9
+ attribute: 'src'
10
+ }
26
11
  }
12
+ },
13
+ about: {
14
+ selector: '.org-grid__core-rail--no-margin-left, .org-about-us-organization-description, .core-section-container',
15
+ fields: {
16
+ overview: 'p',
17
+ types: {
18
+ selector: 'dl dt, .org-page-details__definition-term',
19
+ isMultipleFields: true
20
+ },
21
+ values: {
22
+ selector: 'dl dd:not(.org-page-details__employees-on-linkedin-count), .org-page-details__definition-text',
23
+ isMultipleFields: true
24
+ }
25
+ }
26
+ }
27
27
  }
28
28
 
29
-
30
29
  module.exports = template
package/src/login.js CHANGED
@@ -14,34 +14,36 @@ module.exports = async (browser, email, password) => {
14
14
  await page.$('#password')
15
15
  .then((passwordElement) => passwordElement.type(password))
16
16
 
17
- await page.$x("//button[contains(text(), 'Sign in')]")
18
- .then((button) => button[0].click())
17
+ await page.locator('button[type="submit"]').click()
19
18
 
20
- return page.waitForSelector('input[role=combobox]', {
21
- timeout: 15000
22
- })
19
+ return page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 15000 })
23
20
  .then(async () => {
24
- logger.info('logged feed page selector found')
25
- await page.close()
21
+ const currentUrl = page.url()
22
+ if (currentUrl.includes('/feed') || currentUrl.includes('/mynetwork') || currentUrl.includes('/in/')) {
23
+ logger.info('logged in, redirected to: ' + currentUrl)
24
+ await page.close()
25
+ return
26
+ }
27
+ throw new Error('unexpected redirect: ' + currentUrl)
26
28
  })
27
29
  .catch(async () => {
28
30
  logger.warn('successful login element was not found')
29
31
  const emailError = await page.evaluate(() => {
30
- const e = document.querySelector('div[error-for=username]')
32
+ const e = document.querySelector('div[error-for=username], #error-for-username')
31
33
  if (!e) { return false }
32
34
  const style = window.getComputedStyle(e)
33
35
  return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
34
36
  })
35
37
 
36
38
  const passwordError = await page.evaluate(() => {
37
- const e = document.querySelector('div[error-for=password]')
39
+ const e = document.querySelector('div[error-for=password], #error-for-password')
38
40
  if (!e) { return false }
39
41
  const style = window.getComputedStyle(e)
40
42
  return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
41
43
  })
42
44
 
43
45
  const manualChallengeRequested = await page.evaluate(() => {
44
- const e = document.querySelector('.flow-challenge-content')
46
+ const e = document.querySelector('.flow-challenge-content, #challenge, [data-test-id="challenge"]')
45
47
  if (!e) { return false }
46
48
  const style = window.getComputedStyle(e)
47
49
  return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
@@ -57,7 +59,7 @@ module.exports = async (browser, email, password) => {
57
59
  return Promise.reject(new Error('linkedin: invalid password'))
58
60
  }
59
61
 
60
- if (page.$(manualChallengeRequested)) {
62
+ if (manualChallengeRequested) {
61
63
  logger.warn('manual check was required')
62
64
  return Promise.reject(new Error(`linkedin: manual check was required, verify if your login is properly working manually or report this issue: ${pkg.name} ${pkg.version} ${pkg.bugs.url}`))
63
65
  }
package/src/openPage.js CHANGED
@@ -1,23 +1,13 @@
1
- const agents = [
2
- 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
3
- // "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
4
- // "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
5
- // "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:34.0) Gecko/20100101 Firefox/34.0",
6
- // "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
7
- // "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
8
- // "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
9
- // "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
10
- // "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
11
- ]
1
+ const AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
12
2
 
13
3
  module.exports = async ({ browser, cookies, url, puppeteerAuthenticate }) => {
14
4
  const page = await browser.newPage()
15
- await page.setDefaultNavigationTimeout(0)
5
+ await page.setDefaultNavigationTimeout(60000)
16
6
 
17
7
  if (cookies) {
18
8
  await page.setCookie(...cookies)
19
9
  }
20
- await page.setUserAgent(agents[Math.floor(Math.random() * agents.length)])
10
+ await page.setUserAgent(AGENT)
21
11
  await page.setExtraHTTPHeaders({ 'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8' })
22
12
  await page.setViewport({
23
13
  width: 1920,
@@ -2,7 +2,7 @@ const logger = require('../logger')(__filename)
2
2
  const pkg = require('../package')
3
3
 
4
4
  module.exports = (profile) => {
5
- if(!profile?.profile?.name) {
5
+ if (!profile?.profile?.name) {
6
6
  const messageError = `LinkedIn website changed and ${pkg.name} ${pkg.version} can't read basic data. Please report this issue at ${pkg.bugs.url}`
7
7
  logger.error(messageError, '')
8
8
  throw new Error(messageError)
@@ -11,17 +11,17 @@ module.exports = (profile) => {
11
11
  profile.profile.summary = profile?.about?.text
12
12
 
13
13
  profile.positions.forEach((position) => {
14
- if(position.title){
15
- position.title = position.title.replace('Company Name\n', '')
14
+ if (position.title) {
15
+ position.title = position.title.replace('Company Name\n', '')
16
16
  }
17
- if(position.description) {
18
- position.description = position.description.replace('See more', '');
19
- position.description = position.description.replace('see more', '');
20
- position.description = position.description.replace('See less', '');
17
+ if (position.description) {
18
+ position.description = position.description.replace('See more', '')
19
+ position.description = position.description.replace('see more', '')
20
+ position.description = position.description.replace('See less', '')
21
21
  }
22
- if(position.roles) {
22
+ if (position.roles) {
23
23
  position.roles.forEach((role) => {
24
- if(role.title) {
24
+ if (role.title) {
25
25
  role.title = role.title.replace('Title\n', '')
26
26
  }
27
27
  if (role.date) {
@@ -29,7 +29,7 @@ module.exports = (profile) => {
29
29
  role.date2 = role.date.replace('·', '-').split('-')[1].trim()
30
30
  delete role.date
31
31
  }
32
- if(role.description) {
32
+ if (role.description) {
33
33
  role.description = role.description.replace('See more', '')
34
34
  role.description = role.description.replace('see more', '')
35
35
  }
@@ -37,63 +37,62 @@ module.exports = (profile) => {
37
37
  }
38
38
  })
39
39
 
40
- if(profile.recommendations.receivedCount) {
40
+ if (profile.recommendations.receivedCount) {
41
41
  profile.recommendations.receivedCount = profile.recommendations.receivedCount.replace(/[^\d]/g, '')
42
42
  }
43
43
 
44
- if(profile.recommendations.givenCount) {
44
+ if (profile.recommendations.givenCount) {
45
45
  profile.recommendations.givenCount = profile.recommendations.givenCount.replace(/[^\d]/g, '')
46
46
  }
47
47
 
48
- if(profile.recommendations.received) {
48
+ if (profile.recommendations.received) {
49
49
  profile.recommendations.received.forEach((recommendation) => {
50
- if(recommendation.summary){
50
+ if (recommendation.summary) {
51
51
  recommendation.summary = recommendation.summary.replace('See more', '')
52
52
  recommendation.summary = recommendation.summary.replace('See less', '')
53
53
  }
54
54
  })
55
55
  }
56
56
 
57
- if(profile.recommendations.given) {
57
+ if (profile.recommendations.given) {
58
58
  profile.recommendations.given.forEach((recommendation) => {
59
- if(recommendation.summary){
59
+ if (recommendation.summary) {
60
60
  recommendation.summary = recommendation.summary.replace('See more', '')
61
61
  recommendation.summary = recommendation.summary.replace('See less', '')
62
62
  }
63
63
  })
64
64
  }
65
65
 
66
- if(profile.courses){
66
+ if (profile.courses) {
67
67
  profile.courses = profile.courses.map(({ name, year }) => {
68
68
  const coursesObj = {}
69
- if(name) {
69
+ if (name) {
70
70
  coursesObj.name = name.replace('Course name\n', '')
71
71
  }
72
- if(year) {
72
+ if (year) {
73
73
  coursesObj.year = year.replace('Course number\n', '')
74
74
  }
75
75
  return coursesObj
76
- }
77
- );
76
+ })
78
77
  }
79
78
 
80
- if(profile.languages){
79
+ if (profile.languages) {
81
80
  profile.languages = profile.languages.map(({ name, proficiency }) => ({
82
81
  name: name ? name.replace('Language name\n', '') : undefined,
83
- proficiency,
84
- }));
82
+ proficiency
83
+ }))
85
84
  }
86
85
 
87
- if(profile.projects){
86
+ if (profile.projects) {
88
87
  profile.projects = profile.projects.map(
89
88
  ({ name, date, description, link }) => ({
90
89
  name: name ? name.replace('Project name\n', '') : undefined,
91
90
  date,
92
91
  description: description ? description.replace('Project description\n', '') : undefined,
93
- link,
94
- }),
95
- );
92
+ link
93
+ })
94
+ )
96
95
  }
97
-
96
+
98
97
  return profile
99
98
  }
@@ -1,15 +1,15 @@
1
1
  const logger = require('../logger')(__filename)
2
2
  const scrapSection = require('../scrapSection')
3
3
 
4
- const SEE_MORE_SELECTOR = 'a[data-control-name=contact_see_more]'
5
- const CLOSE_MODAL_SELECTOR = '.artdeco-modal__dismiss';
4
+ const SEE_MORE_SELECTOR = '#top-card-text-details-contact-info'
5
+ const CLOSE_MODAL_SELECTOR = '.artdeco-modal__dismiss'
6
6
 
7
7
  const template = {
8
- selector: '.pv-contact-info__contact-type',
8
+ selector: '.pv-contact-info__contact-type, .ci-vanity-url, .ci-email, .ci-phone, .ci-websites, .ci-birthday, .ci-ims, .ci-address',
9
9
  fields: {
10
- type: 'header',
10
+ type: 'header, h3',
11
11
  values: {
12
- selector: '.pv-contact-info__ci-container',
12
+ selector: '.pv-contact-info__ci-container, .t-14',
13
13
  isMultipleFields: true
14
14
  },
15
15
  links: {
@@ -18,31 +18,29 @@ const template = {
18
18
  isMultipleFields: true
19
19
  }
20
20
  }
21
- }
22
- const getContactInfo = async(page) => {
23
- await page.waitFor(SEE_MORE_SELECTOR, { timeout: 2000 })
21
+ }
22
+ const getContactInfo = async (page) => {
23
+ await page.waitForSelector(SEE_MORE_SELECTOR, { timeout: 2000 })
24
24
  .catch(() => {
25
- logger.warn('contact-info', 'selector not found')
25
+ logger.warn('contact-info selector not found')
26
26
  return {}
27
27
  })
28
28
 
29
29
  const element = await page.$(SEE_MORE_SELECTOR)
30
- if(element){
30
+ if (element) {
31
31
  await element.click()
32
- const contactInfoIndicatorSelector = '#pv-contact-info'
33
- await page.waitFor(contactInfoIndicatorSelector, { timeout: 5000 })
34
- .catch(() => {
35
- logger.warn('contact info was not found')
36
- })
37
-
32
+ const contactInfoIndicatorSelector = '.pv-profile-section__section-info, .artdeco-modal__content'
33
+ await page.waitForSelector(contactInfoIndicatorSelector, { timeout: 5000 })
34
+ .catch(() => {
35
+ logger.warn('contact info was not found')
36
+ })
37
+
38
38
  const contactInfo = await scrapSection(page, template)
39
39
  const closeButton = await page.$(CLOSE_MODAL_SELECTOR)
40
- if(closeButton)
41
- await closeButton.click()
40
+ if (closeButton) { await closeButton.click() }
42
41
 
43
42
  return contactInfo
44
43
  }
45
-
46
44
  }
47
45
 
48
46
  module.exports = getContactInfo
@@ -1,81 +1,221 @@
1
1
  const openPage = require('../openPage')
2
- const scrapSection = require('../scrapSection')
3
- const scrapAccomplishmentPanel = require('./scrapAccomplishmentPanel')
4
2
  const scrollToPageBottom = require('./scrollToPageBottom')
5
3
  const seeMoreButtons = require('./seeMoreButtons')
6
4
  const contactInfo = require('./contactInfo')
7
- const template = require('./profileScraperTemplate')
8
5
  const cleanProfileData = require('./cleanProfileData')
9
6
 
10
7
  const logger = require('../logger')(__filename)
11
8
 
9
+ const extractProfileData = async (page) => {
10
+ return page.evaluate(() => {
11
+ const txt = (el) => el ? (el.textContent || '').trim() : ''
12
+
13
+ const byViewName = (name) => document.querySelector(`[data-view-name="${name}"]`)
14
+
15
+ const getSection = (viewName) => {
16
+ const el = byViewName(viewName)
17
+ if (!el) return null
18
+ return el.querySelector('section') || el.closest('section')
19
+ }
20
+
21
+ // Get items from a section. For sections without ul/li, items are divs
22
+ // found after the h2 heading: h2.parent.nextSibling > div > div > div
23
+ const getSectionItems = (viewName) => {
24
+ const section = getSection(viewName)
25
+ if (!section) return []
26
+ const ul = section.querySelector('ul')
27
+ if (ul) return [...ul.querySelectorAll(':scope > li')]
28
+ const h2 = section.querySelector('h2')
29
+ if (!h2) return []
30
+ const afterH2 = h2.parentElement.nextElementSibling
31
+ if (!afterH2) return []
32
+ const itemDivs = afterH2.querySelectorAll(':scope > div > div > div')
33
+ return [...itemDivs].filter(d => d.querySelectorAll('p').length > 0)
34
+ }
35
+
36
+ // Profile top card: name is in h2, headline/location in p tags
37
+ const titleName = document.title.replace(' | LinkedIn', '').trim()
38
+ const mainLevel = byViewName('profile-main-level')
39
+ const topSection = mainLevel ? (mainLevel.querySelector('section') || mainLevel.closest('section')) : null
40
+ const nameH2 = topSection ? topSection.querySelector('h2') : null
41
+ const photoImg = document.querySelector('img[src*="profile-displayphoto"]')
42
+
43
+ const profileName = nameH2 ? txt(nameH2) : titleName
44
+ let headline = ''
45
+ let location = ''
46
+
47
+ if (topSection) {
48
+ const allP = [...topSection.querySelectorAll('p')]
49
+ headline = txt(allP[0])
50
+ for (const p of allP) {
51
+ const t = txt(p)
52
+ if (t.includes(',') && !t.includes('at ') && !t.includes('\xB7')) {
53
+ location = t
54
+ break
55
+ }
56
+ }
57
+ }
58
+
59
+ const profile = {
60
+ name: profileName,
61
+ headline,
62
+ location,
63
+ connections: '',
64
+ imageurl: photoImg ? photoImg.getAttribute('src') || '' : ''
65
+ }
66
+
67
+ // Experience: uses LazyColumn with company groups containing ul > li
68
+ const positions = []
69
+ const expSection = getSection('profile-card-experience')
70
+ if (expSection) {
71
+ const lazyCol = expSection.querySelector('[data-component-type]') || expSection
72
+ const groups = [...lazyCol.children]
73
+
74
+ groups.forEach(group => {
75
+ const ul = group.querySelector('ul')
76
+ if (!ul) return
77
+
78
+ // Company info is in p tags before the ul
79
+ const allPs = group.querySelectorAll('p')
80
+ const beforeUlPs = []
81
+ for (const p of allPs) {
82
+ if (ul.contains(p)) break
83
+ beforeUlPs.push(p)
84
+ }
85
+ const companyName = txt(beforeUlPs[0])
86
+ const companyLink = group.querySelector('a[href*="/company/"]')
87
+ const companyUrl = companyLink ? companyLink.getAttribute('href') : ''
88
+ const companyLocation = txt(beforeUlPs[2])
89
+
90
+ const lis = [...ul.querySelectorAll(':scope > li')]
91
+ lis.forEach(li => {
92
+ const ps = [...li.querySelectorAll('div[role="button"] p')]
93
+ const descEl = li.querySelector('[data-testid="expandable-text-box"]')
94
+ const dateStr = txt(ps[2])
95
+ const dateRange = dateStr.split('\xB7')[0].trim()
96
+ const dateParts = dateRange.split(' - ')
97
+ positions.push({
98
+ title: txt(ps[0]),
99
+ companyName,
100
+ link: companyUrl,
101
+ url: companyUrl,
102
+ location: companyLocation,
103
+ description: descEl ? txt(descEl) : '',
104
+ date: dateRange,
105
+ date1: dateParts[0] ? dateParts[0].trim() : '',
106
+ date2: dateParts[1] ? dateParts[1].trim() : ''
107
+ })
108
+ })
109
+ })
110
+ }
111
+
112
+ // Education
113
+ const eduItems = getSectionItems('profile-card-education')
114
+ const educations = eduItems.map(item => {
115
+ const ps = [...item.querySelectorAll('p')]
116
+ const link = item.querySelector('a[href*="/school/"]') ||
117
+ item.closest('div')?.parentElement?.querySelector('a[href*="/school/"]')
118
+ return {
119
+ title: txt(ps[0]),
120
+ degree: txt(ps[1]),
121
+ fieldOfStudy: txt(ps[1]),
122
+ url: link ? link.getAttribute('href') : '',
123
+ date1: ps[2] ? txt(ps[2]).split('\u2013')[0].trim() : '',
124
+ date2: ps[2] ? (txt(ps[2]).split('\u2013')[1] || '').trim() : '',
125
+ description: ''
126
+ }
127
+ })
128
+
129
+ // Skills
130
+ const skillItems = getSectionItems('profile-card-skills')
131
+ const skills = skillItems.map(item => {
132
+ const ps = [...item.querySelectorAll('p')]
133
+ return { title: txt(ps[0]), count: '' }
134
+ })
135
+
136
+ // Languages
137
+ const langItems = getSectionItems('profile-card-languages')
138
+ const languages = langItems.map(item => {
139
+ const ps = [...item.querySelectorAll('p')]
140
+ return { name: txt(ps[0]), proficiency: txt(ps[1]) }
141
+ })
142
+
143
+ // Projects
144
+ const projItems = getSectionItems('profile-card-projects')
145
+ const projects = projItems.map(item => {
146
+ const ps = [...item.querySelectorAll('p')]
147
+ const descEl = item.querySelector('[data-testid="expandable-text-box"]')
148
+ const link = item.querySelector('a[href*="http"]')
149
+ return {
150
+ name: txt(ps[0]),
151
+ date: txt(ps[1]),
152
+ description: descEl ? txt(descEl) : '',
153
+ link: link ? link.getAttribute('href') : ''
154
+ }
155
+ })
156
+
157
+ // Certifications
158
+ const certItems = getSectionItems('profile-card-licenses-and-certifications')
159
+ const accomplishments = certItems.map(item => {
160
+ const ps = [...item.querySelectorAll('p')]
161
+ return { title: txt(ps[0]), count: '', items: [] }
162
+ })
163
+
164
+ return {
165
+ profile,
166
+ about: { text: '' },
167
+ positions,
168
+ educations,
169
+ skills,
170
+ recommendations: { givenCount: '0', receivedCount: '0', given: [], received: [] },
171
+ accomplishments,
172
+ courses: [],
173
+ languages,
174
+ projects,
175
+ peopleAlsoViewed: [],
176
+ volunteerExperience: [],
177
+ contact: []
178
+ }
179
+ })
180
+ }
181
+
12
182
  module.exports = async (browser, cookies, url, waitTimeToScrapMs = 500, hasToGetContactInfo = false, puppeteerAuthenticate = undefined) => {
13
183
  logger.info(`starting scraping url: ${url}`)
14
184
 
15
185
  const page = await openPage({ browser, cookies, url, puppeteerAuthenticate })
16
- const profilePageIndicatorSelector = '.pv-profile-section'
17
- await page.waitForSelector(profilePageIndicatorSelector, { timeout: 5000 })
186
+
187
+ // Wait for the SDUI profile to fully hydrate
188
+ await page.waitForFunction(() => {
189
+ return document.querySelector('[data-view-name="profile-card-experience"]')
190
+ }, { timeout: 30000 })
18
191
  .catch(() => {
19
- //why doesn't throw error instead of continuing scraping?
20
- //because it can be just a false negative meaning LinkedIn only changed that selector but everything else is fine :)
21
- logger.warn('profile selector was not found')
192
+ logger.warn('profile content did not fully render in time')
22
193
  })
23
194
 
24
195
  logger.info('scrolling page to the bottom')
25
196
  await scrollToPageBottom(page)
26
-
27
- if(waitTimeToScrapMs) {
28
- logger.info(`applying 1st delay`)
29
- await new Promise((resolve) => { setTimeout(() => { resolve() }, waitTimeToScrapMs / 2)})
30
- }
197
+
198
+ // Wait for lazy-loaded sections to render after scrolling
199
+ await new Promise((resolve) => { setTimeout(resolve, 2000) })
200
+
201
+ // Scroll again in case new content was loaded
202
+ await scrollToPageBottom(page)
203
+ await new Promise((resolve) => { setTimeout(resolve, 1000) })
31
204
 
32
205
  await seeMoreButtons.clickAll(page)
33
206
 
34
- if(waitTimeToScrapMs) {
35
- logger.info(`applying 2nd (and last) delay`)
36
- await new Promise((resolve) => { setTimeout(() => { resolve() }, waitTimeToScrapMs / 2)})
37
- }
207
+ // Final wait for content to settle
208
+ await new Promise((resolve) => { setTimeout(resolve, 1000) })
38
209
 
39
- const [profile] = await scrapSection(page, template.profile)
40
- const [about] = await scrapSection(page, template.about)
41
- const positions = await scrapSection(page, template.positions)
42
- const educations = await scrapSection(page, template.educations)
43
- const [recommendationsCount] = await scrapSection(page, template.recommendationsCount)
44
- const recommendationsReceived = await scrapSection(page, template.recommendationsReceived)
45
- const recommendationsGiven = await scrapSection(page, template.recommendationsGiven)
46
- const skills = await scrapSection(page, template.skills)
47
- const accomplishments = await scrapSection(page, template.accomplishments)
48
- const courses = await scrapAccomplishmentPanel(page, 'courses')
49
- const languages = await scrapAccomplishmentPanel(page, 'languages')
50
- const projects = await scrapAccomplishmentPanel(page, 'projects')
51
- const volunteerExperience = await scrapSection(page, template.volunteerExperience)
52
- const peopleAlsoViewed = await scrapSection(page, template.peopleAlsoViewed)
53
- const contact = hasToGetContactInfo ? await contactInfo(page) : []
210
+ const rawProfile = await extractProfileData(page)
211
+
212
+ if (hasToGetContactInfo) {
213
+ rawProfile.contact = await contactInfo(page) || []
214
+ }
54
215
 
55
216
  await page.close()
56
217
  logger.info(`finished scraping url: ${url}`)
57
218
 
58
- const rawProfile = {
59
- profile,
60
- about,
61
- positions,
62
- educations,
63
- skills,
64
- recommendations: {
65
- givenCount: recommendationsCount ? recommendationsCount.given : "0",
66
- receivedCount: recommendationsCount ? recommendationsCount.received : "0",
67
- given: recommendationsReceived,
68
- received: recommendationsGiven
69
- },
70
- accomplishments,
71
- courses,
72
- languages,
73
- projects,
74
- peopleAlsoViewed,
75
- volunteerExperience,
76
- contact
77
- }
78
-
79
219
  const cleanedProfile = cleanProfileData(rawProfile)
80
220
  return cleanedProfile
81
221
  }
@@ -1,189 +1,186 @@
1
- const profileSelector = '.core-rail > *:first-child section >'
2
-
3
1
  const template = {
4
2
  profile: {
5
- selector: '.pb5',
3
+ selector: '.scaffold-layout__main',
6
4
  fields: {
7
- name: `.text-heading-xlarge`,
8
- headline: `.text-body-medium`,
9
- location: `.pb2 .text-body-small`,
10
- connections: `li.text-body-small`,
5
+ name: '.text-heading-xlarge',
6
+ headline: '.text-body-medium',
7
+ location: '.text-body-small.inline.t-black--light.break-words',
8
+ connections: '.t-bold',
11
9
  imageurl: {
12
- selector: `img.pv-top-card__photo`,
10
+ selector: 'img.pv-top-card-profile-picture__image',
13
11
  attribute: 'src'
14
12
  }
15
13
  }
16
14
  },
17
15
  about: {
18
- selector: '.pv-about-section',
16
+ selector: '#about ~ .display-flex .inline-show-more-text',
19
17
  fields: {
20
- text: 'div'
18
+ text: 'span[aria-hidden="true"]'
21
19
  }
22
20
  },
23
21
  positions: {
24
- selector: 'div[id="experience"] + div + div li.artdeco-list__item',
22
+ selector: '#experience ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
25
23
  fields: {
26
- title: 'h3',
24
+ title: '.mr1.hoverable-link-text.t-bold > span',
27
25
  link: {
28
- selector: 'a',
29
- attribute: 'href',
26
+ selector: 'a.optional-action-target-wrapper',
27
+ attribute: 'href'
30
28
  },
31
29
  url: {
32
- selector: 'a',
30
+ selector: 'a.optional-action-target-wrapper',
33
31
  attribute: 'href'
34
32
  },
35
- companyName: 'div.t-bold span:first-child',
36
- location: '.pv-entity__location span:last-child',
37
- description: '.pv-entity__description',
38
- date1: '.pv-entity__date-range span:last-child',
39
- date2: '.pv-entity__bullet-item-v2',
33
+ companyName: '.t-14.t-normal > span',
34
+ location: '.t-14.t-normal.t-black--light > span',
35
+ description: '.pvs-list__outer-container .inline-show-more-text span[aria-hidden="true"]',
36
+ date1: '.pvs-entity__caption-wrapper',
37
+ date2: '.pvs-entity__caption-wrapper',
40
38
  roles: {
41
- selector: 'div.full-width > div > ul div[data-view-name]',
39
+ selector: '.pvs-entity__sub-components li.pvs-list__paged-list-item',
42
40
  hasChildrenFields: true,
43
41
  fields: {
44
- title: 'div.t-bold span:first-child',
45
- description: 'div.display-flex div.full-width > span[aria-hidden]',
42
+ title: '.mr1.hoverable-link-text.t-bold > span',
43
+ description: '.inline-show-more-text span[aria-hidden="true"]',
46
44
  date: '.pvs-entity__caption-wrapper',
47
- location: '.pv-entity__location span:last-child'
45
+ location: '.t-14.t-normal.t-black--light > span'
48
46
  }
49
47
  }
50
48
  }
51
49
  },
52
50
  educations: {
53
- selector: '#education-section li',
51
+ selector: '#education ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
54
52
  fields: {
55
- title: 'h3',
56
- degree: 'span[class=pv-entity__comma-item]',
53
+ title: '.hoverable-link-text.t-bold > span',
54
+ degree: '.t-14.t-normal > span',
57
55
  url: {
58
56
  selector: 'a',
59
57
  attribute: 'href'
60
58
  },
61
- fieldOfStudy: 'p.pv-entity__fos span:nth-child(2)',
62
- date1: '.pv-entity__dates time:nth-child(1)',
63
- date2: '.pv-entity__dates time:nth-child(2)',
64
- description: '.pv-entity__description'
59
+ fieldOfStudy: '.t-14.t-normal > span',
60
+ date1: '.pvs-entity__caption-wrapper',
61
+ date2: '.pvs-entity__caption-wrapper',
62
+ description: '.inline-show-more-text span[aria-hidden="true"]'
65
63
  }
66
64
  },
67
65
  skills: {
68
- selector: '.pv-skill-category-entity__skill-wrapper',
66
+ selector: '#skills ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
69
67
  fields: {
70
- title: '.pv-skill-category-entity__name-text',
71
- count: '.pv-skill-category-entity__endorsement-count'
68
+ title: '.mr1.hoverable-link-text.t-bold > span',
69
+ count: '.t-14.t-normal.t-black--light > span'
72
70
  }
73
71
  },
74
72
  recommendationsCount: {
75
- selector: '.recommendations-inlining',
73
+ selector: '#recommendations ~ .pvs-list__outer-container',
76
74
  fields: {
77
75
  received: '.artdeco-tab:nth-child(1)',
78
76
  given: '.artdeco-tab:nth-child(2)'
79
77
  }
80
78
  },
81
79
  recommendationsReceived: {
82
- selector: '.recommendations-inlining',
80
+ selector: '#recommendations ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
83
81
  fields: {
84
82
  user: {
85
- selector: '.pv-recommendation-entity__member',
83
+ selector: 'a',
86
84
  attribute: 'href'
87
85
  },
88
- text: 'blockquote.pv-recommendation-entity__text',
86
+ text: '.inline-show-more-text span[aria-hidden="true"]',
89
87
  profileImage: {
90
- selector: 'a img',
88
+ selector: 'img',
91
89
  attribute: 'src'
92
90
  },
93
91
  name: {
94
- selector: 'a h3'
92
+ selector: '.t-bold > span'
95
93
  },
96
94
  userDescription: {
97
- selector: '.pv-recommendation-entity__headline'
95
+ selector: '.t-14.t-normal > span'
98
96
  }
99
97
  }
100
98
  },
101
99
  recommendationsGiven: {
102
- selector: '.artdeco-tabpanel li.pv-recommendation-entity',
100
+ selector: '#recommendations ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
103
101
  fields: {
104
102
  user: {
105
- selector: '.pv-recommendation-entity__member',
103
+ selector: 'a',
106
104
  attribute: 'href'
107
105
  },
108
- text: 'blockquote.pv-recommendation-entity__text',
106
+ text: '.inline-show-more-text span[aria-hidden="true"]',
109
107
  profileImage: {
110
- selector: 'a img',
108
+ selector: 'img',
111
109
  attribute: 'src'
112
110
  },
113
111
  name: {
114
- selector: 'a h3'
112
+ selector: '.t-bold > span'
115
113
  },
116
114
  userDescription: {
117
- selector: '.pv-recommendation-entity__headline'
115
+ selector: '.t-14.t-normal > span'
118
116
  }
119
117
  }
120
118
  },
121
119
  accomplishments: {
122
- selector: '.pv-accomplishments-section > div',
120
+ selector: '#honors_and_awards ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
123
121
  fields: {
124
- count: 'h3 span:last-child',
125
- title: '.pv-accomplishments-block__title',
122
+ count: '.t-14.t-normal.t-black--light > span',
123
+ title: '.mr1.hoverable-link-text.t-bold > span',
126
124
  items: {
127
- selector: 'li',
125
+ selector: '.pvs-list__outer-container li',
128
126
  isMultipleFields: true
129
127
  }
130
128
  }
131
129
  },
132
130
  peopleAlsoViewed: {
133
- selector: 'li.pv-browsemap-section__member-container',
131
+ selector: '.pv-browsemap-section li',
134
132
  fields: {
135
133
  user: {
136
134
  selector: 'a',
137
135
  attribute: 'href'
138
136
  },
139
- text: 'p',
137
+ text: '.t-14.t-normal',
140
138
  profileImage: {
141
- selector: 'a img',
139
+ selector: 'img',
142
140
  attribute: 'src'
143
141
  },
144
142
  name: {
145
- selector: '.name'
143
+ selector: '.t-bold'
146
144
  }
147
145
  }
148
146
  },
149
147
  volunteerExperience: {
150
- selector: 'section.volunteering-section li',
148
+ selector: '#volunteering_experience ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
151
149
  fields: {
152
- title: 'h3',
153
- experience: 'span[class=pv-entity__secondary-title]',
154
- location: '.pv-entity__location span:nth-child(2)',
155
- description: '.pv-volunteer-causes',
156
- date1: '.pv-entity__date-range span:nth-child(2)',
157
- date2: '.pv-entity__bullet-item'
150
+ title: '.mr1.hoverable-link-text.t-bold > span',
151
+ experience: '.t-14.t-normal > span',
152
+ location: '.t-14.t-normal.t-black--light > span',
153
+ description: '.inline-show-more-text span[aria-hidden="true"]',
154
+ date1: '.pvs-entity__caption-wrapper',
155
+ date2: '.pvs-entity__caption-wrapper'
158
156
  }
159
157
  },
160
158
  courses: {
161
- selector: '.pv-accomplishments-section',
159
+ selector: '#courses ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
162
160
  fields: {
163
- name: '.pv-accomplishment-entity__title',
164
- year: '.pv-accomplishment-entity__course-number'
161
+ name: '.mr1.hoverable-link-text.t-bold > span',
162
+ year: '.t-14.t-normal > span'
165
163
  }
166
164
  },
167
165
  languages: {
168
- selector: '.pv-accomplishments-block.languages li',
166
+ selector: '#languages ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
169
167
  fields: {
170
- name: '.pv-accomplishment-entity__title',
171
- proficiency: '.pv-accomplishment-entity__proficiency',
168
+ name: '.mr1.t-bold > span',
169
+ proficiency: '.t-14.t-normal.t-black--light > span'
172
170
  }
173
171
  },
174
172
  projects: {
175
- selector: '.pv-accomplishments-block.projects li',
173
+ selector: '#projects ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
176
174
  fields: {
177
- name: '.pv-accomplishment-entity__title',
178
- date: '.pv-accomplishment-entity__date',
179
- description: '.pv-accomplishment-entity__description',
175
+ name: '.mr1.hoverable-link-text.t-bold > span',
176
+ date: '.pvs-entity__caption-wrapper',
177
+ description: '.inline-show-more-text span[aria-hidden="true"]',
180
178
  link: {
181
- selector: '.mt4',
179
+ selector: 'a',
182
180
  attribute: 'href'
183
181
  }
184
182
  }
185
183
  }
186
184
  }
187
185
 
188
-
189
186
  module.exports = template
@@ -1,18 +1,11 @@
1
- const scrapSection = require('../scrapSection');
2
- const template = require('./profileScraperTemplate');
1
+ const scrapSection = require('../scrapSection')
2
+ const template = require('./profileScraperTemplate')
3
3
 
4
4
  const scrapAccomplishmentPanel = async (page, section) => {
5
- const queryString = `.pv-accomplishments-block.${section} button`
6
-
7
- const openingButton = await page.$(queryString);
8
-
9
- if (openingButton) {
10
- await page.evaluate((q) => {
11
- document.querySelector(q).click();
12
- }, queryString);
13
-
14
- return scrapSection(page, template[section]);
5
+ if (!template[section]) {
6
+ return []
15
7
  }
16
- };
8
+ return scrapSection(page, template[section])
9
+ }
17
10
 
18
- module.exports = scrapAccomplishmentPanel;
11
+ module.exports = scrapAccomplishmentPanel
@@ -3,21 +3,20 @@ const logger = require('../logger')(__filename)
3
3
  module.exports = async (page) => {
4
4
  const MAX_TIMES_TO_SCROLL = 25
5
5
  const TIMEOUT_BETWEEN_SCROLLS = 500
6
- const PAGE_BOTTOM_SELECTOR_STRING = '#expanded-footer'
7
6
 
8
7
  for (let i = 0; i < MAX_TIMES_TO_SCROLL; i++) {
9
8
  await page.evaluate(() => window.scrollBy(0, window.innerHeight))
10
9
 
11
- const hasReachedEnd = await page.waitForSelector(PAGE_BOTTOM_SELECTOR_STRING, {
12
- visible: true,
13
- timeout: TIMEOUT_BETWEEN_SCROLLS
14
- }).catch(() => {
15
- logger.info(`scrolling to page bottom (${i + 1})`)
10
+ const hasReachedEnd = await page.evaluate(() => {
11
+ return (window.innerHeight + window.scrollY) >= (document.body.scrollHeight - 200)
16
12
  })
17
13
 
18
14
  if (hasReachedEnd) {
19
15
  return
20
16
  }
17
+
18
+ await new Promise(resolve => setTimeout(resolve, TIMEOUT_BETWEEN_SCROLLS))
19
+ logger.info(`scrolling to page bottom (${i + 1})`)
21
20
  }
22
21
 
23
22
  logger.warn('page bottom not found')
@@ -1,42 +1,18 @@
1
1
  const logger = require('../logger')(__filename)
2
- const seeMoreButtons = [
3
- {
4
- id: 'SHOW_MORE_ABOUT',
5
- selector: '#line-clamp-show-more-button'
6
- },{
7
- id: 'SHOW_MORE_EXPERIENCES',
8
- selector: '#experience-section .pv-profile-section__see-more-inline'
9
- },{
10
- id: 'SEE_MORE_EXPERIENCES',
11
- selector: '#experience-section .inline-show-more-text__button'
12
- },{
13
- id: 'SHOW_MORE_CERTIFICATIONS',
14
- selector: '#certifications-section .pv-profile-section__see-more-inline'
15
- },{
16
- id: 'SHOW_MORE_SKILLS',
17
- selector: '.pv-skills-section__additional-skills'
18
- },{
19
- id: 'SEE_MORE_RECOMMENDATIONS',
20
- selector: '.recommendations-inlining #line-clamp-show-more-button'
21
- }
22
- ]
23
-
24
2
 
25
- const clickAll = async(page) => {
26
- for(let i = 0; i < seeMoreButtons.length; i++){
27
- const button = seeMoreButtons[i]
28
- const elems = await page.$$(button.selector)
3
+ const clickAll = async (page) => {
4
+ const clicked = await page.evaluate(() => {
5
+ let count = 0
6
+ // Only click expandable text buttons (inline expand, not navigation)
7
+ const expandButtons = document.querySelectorAll('[data-testid="expandable-text-button"]')
8
+ expandButtons.forEach(btn => { btn.click(); count++ })
9
+ return count
10
+ })
29
11
 
30
- for(let j = 0; j < elems.length; j++){
31
- const elem = elems[j]
32
- if (elem) {
33
- await elem.click()
34
- .catch((e) => logger.warn(`couldn't click on ${button.selector}, it's probably invisible`))
35
- }
36
- }
12
+ if (clicked > 0) {
13
+ logger.info(`clicked ${clicked} show-more buttons`)
14
+ await new Promise(resolve => setTimeout(resolve, 500))
37
15
  }
38
-
39
- return
40
16
  }
41
17
 
42
18
  module.exports = { clickAll }