@mvegter/scrapedin 1.0.33 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@mvegter/scrapedin",
3
- "version": "1.0.33",
4
- "description": "linkedin scraper for 2020 website",
3
+ "version": "1.2.0",
4
+ "description": "linkedin scraper updated for 2025+ website",
5
5
  "keywords": [
6
6
  "linkedin",
7
7
  "scraper",
@@ -21,14 +21,14 @@
21
21
  "author": "Wagner Leonardi <leonardiwagner@gmail.com>",
22
22
  "license": "Apache-2.0",
23
23
  "dependencies": {
24
- "puppeteer": "14.1.1",
24
+ "puppeteer": "24.36.0",
25
25
  "winston": "3.7.2"
26
26
  },
27
27
  "devDependencies": {
28
28
  "standard": "17.0.0"
29
29
  },
30
30
  "engines": {
31
- "node": ">= 7.6.0"
31
+ "node": ">= 16.0.0"
32
32
  },
33
33
  "homepage": "https://github.com/linkedtales/scrapedin#readme"
34
34
  }
@@ -5,31 +5,30 @@ const template = require('./companyScraperTemplate')
5
5
  const logger = require('../logger')(__filename)
6
6
 
7
7
  module.exports = async (browser, cookies, url, waitTimeToScrapMs = 500, puppeteerAuthenticate = undefined) => {
8
- logger.info(`starting scraping url: ${url}`);
8
+ logger.info(`starting scraping url: ${url}`)
9
9
 
10
- let company = {};
10
+ const company = {}
11
11
 
12
- let page;
13
- if(url.includes('legacySchoolId=')){
14
- page = await openPage({ browser, cookies, url, puppeteerAuthenticate });
12
+ let page
13
+ if (url.includes('legacySchoolId=')) {
14
+ page = await openPage({ browser, cookies, url, puppeteerAuthenticate })
15
15
 
16
- const aboutSelector = 'a[href$="/about/"]';
16
+ const aboutSelector = 'a[href$="/about/"]'
17
17
 
18
- company.url = page.url();
19
-
20
- await page.$eval(aboutSelector, async about => await about.click());
21
- await page.waitForNavigation();
22
- } else{
23
- company.url = url;
24
- url = url + '/about';
25
- page = await openPage({ browser, cookies, url, puppeteerAuthenticate });
18
+ company.url = page.url()
19
+
20
+ await page.$eval(aboutSelector, async about => await about.click())
21
+ await page.waitForNavigation()
22
+ } else {
23
+ company.url = url
24
+ url = url + '/about'
25
+ page = await openPage({ browser, cookies, url, puppeteerAuthenticate })
26
26
  }
27
- company.about = (await scrapSection(page, template.about))[0];
28
- company.profile = (await scrapSection(page, template.profile))[0];
27
+ company.about = (await scrapSection(page, template.about))[0]
28
+ company.profile = (await scrapSection(page, template.profile))[0]
29
+
30
+ await page.close()
31
+ logger.info(`finished scraping url: ${url}`)
29
32
 
30
- await page.close();
31
- logger.info(`finished scraping url: ${url}`);
32
-
33
33
  return company
34
-
35
34
  }
@@ -1,30 +1,29 @@
1
1
  const template = {
2
- profile: {
3
- selector: '.org-top-card',
4
- fields: {
5
- name: `h1`,
6
- headline: `p`,
7
- imageurl: {
8
- selector: `img.org-top-card-primary-content__logo`,
9
- attribute: 'src'
10
- }
11
- }
12
- },
13
- about: {
14
- selector: '.org-grid__core-rail--no-margin-left',
15
- fields: {
16
- overview: 'p',
17
- types:{
18
- selector: 'dl dt',
19
- isMultipleFields: true
20
- },
21
- values:{
22
- selector: 'dl dd:not(.org-page-details__employees-on-linkedin-count)',
23
- isMultipleFields: true
24
- }
25
- }
2
+ profile: {
3
+ selector: '.org-top-card, .top-card-layout',
4
+ fields: {
5
+ name: 'h1',
6
+ headline: '.org-top-card-summary__tagline, .top-card-layout__headline, p',
7
+ imageurl: {
8
+ selector: 'img.org-top-card-primary-content__logo, img.top-card-layout__entity-image',
9
+ attribute: 'src'
10
+ }
26
11
  }
12
+ },
13
+ about: {
14
+ selector: '.org-grid__core-rail--no-margin-left, .org-about-us-organization-description, .core-section-container',
15
+ fields: {
16
+ overview: 'p',
17
+ types: {
18
+ selector: 'dl dt, .org-page-details__definition-term',
19
+ isMultipleFields: true
20
+ },
21
+ values: {
22
+ selector: 'dl dd:not(.org-page-details__employees-on-linkedin-count), .org-page-details__definition-text',
23
+ isMultipleFields: true
24
+ }
25
+ }
26
+ }
27
27
  }
28
28
 
29
-
30
29
  module.exports = template
package/src/login.js CHANGED
@@ -2,67 +2,173 @@ const openPage = require('./openPage')
2
2
  const logger = require('./logger')(__filename)
3
3
  const pkg = require('./package')
4
4
 
5
+ const ACCEPT_COOKIES_SELECTORS = [
6
+ 'button:has-text("Accept")',
7
+ 'button:has-text("Alle accepteren")',
8
+ 'button[action-type="ACCEPT"]',
9
+ '.cookie-consent-v2__button--accept',
10
+ '#artdeco-global-alert-container button:has-text("Accept")'
11
+ ]
12
+
13
+ const acceptCookies = async (page) => {
14
+ for (const selector of ACCEPT_COOKIES_SELECTORS) {
15
+ try {
16
+ const btn = await page.$(selector)
17
+ if (btn) {
18
+ await btn.click()
19
+ await new Promise((r) => setTimeout(r, 1000))
20
+ return
21
+ }
22
+ } catch {
23
+ // selector might not exist, try next
24
+ }
25
+ }
26
+ // Fallback: find any button with Accept text
27
+ try {
28
+ await page.evaluate(() => {
29
+ const btn = Array.from(document.querySelectorAll('button')).find(
30
+ (b) => b.textContent.trim().toLowerCase() === 'accept'
31
+ )
32
+ if (btn) btn.click()
33
+ })
34
+ await new Promise((r) => setTimeout(r, 1000))
35
+ } catch {
36
+ // ignore
37
+ }
38
+ }
39
+
40
+ const fillField = async (page, fieldValue) => {
41
+ // LinkedIn renders two sets of inputs: hidden (CSS-only, not visible) and visible.
42
+ // We find all <input> elements matching the autocomplete attribute, then pick
43
+ // the first one that is actually visible (has non-zero dimensions).
44
+ const autocomplete = fieldValue === 'username webauthn' ? 'username' : 'current-password'
45
+
46
+ const visibleInput = await page.evaluate((auto, val) => {
47
+ const inputs = Array.from(document.querySelectorAll(`input[autocomplete="${auto}"]`))
48
+ for (const input of inputs) {
49
+ const rect = input.getBoundingClientRect()
50
+ if (rect.width > 0 && rect.height > 0) {
51
+ const style = window.getComputedStyle(input)
52
+ if (style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0') {
53
+ input.focus()
54
+ input.value = ''
55
+ return true
56
+ }
57
+ }
58
+ }
59
+ return false
60
+ }, autocomplete, fieldValue)
61
+
62
+ if (visibleInput) {
63
+ await page.keyboard.type(fieldValue, { delay: 50 })
64
+ }
65
+ }
66
+
67
+ const clickSignIn = async (page) => {
68
+ // Find the "Sign in" button, excluding "Sign in with Apple" etc.
69
+ await page.evaluate(() => {
70
+ const buttons = Array.from(document.querySelectorAll('button'))
71
+ const signInBtn = buttons.find(
72
+ (b) => {
73
+ const text = b.textContent.trim().toLowerCase()
74
+ return (text === 'sign in' || text === 'inloggen' || text === 'aanmelden') &&
75
+ !text.includes('apple')
76
+ }
77
+ )
78
+ if (signInBtn) {
79
+ signInBtn.click()
80
+ return true
81
+ }
82
+ return false
83
+ })
84
+ }
85
+
86
+ const LOGGED_IN_PATHS = ['/feed', '/mynetwork', '/in/']
87
+
5
88
  module.exports = async (browser, email, password) => {
6
89
  const url = 'https://www.linkedin.com/login'
7
90
  const page = await openPage({ browser, url })
8
91
  logger.info(`logging at: ${url}`)
9
92
 
10
- await page.waitForSelector('#username')
93
+ // Accept cookie consent if present
94
+ await acceptCookies(page)
95
+ await new Promise((r) => setTimeout(r, 1000))
11
96
 
12
- await page.$('#username')
13
- .then((emailElement) => emailElement.type(email))
14
- await page.$('#password')
15
- .then((passwordElement) => passwordElement.type(password))
97
+ // Fill in email field
98
+ await fillField(page, 'username webauthn')
99
+ await new Promise((r) => setTimeout(r, 500))
16
100
 
17
- await page.$x("//button[contains(text(), 'Sign in')]")
18
- .then((button) => button[0].click())
101
+ // Fill in password field
102
+ await fillField(page, 'current-password')
103
+ await new Promise((r) => setTimeout(r, 500))
19
104
 
20
- return page.waitForSelector('input[role=combobox]', {
21
- timeout: 15000
22
- })
23
- .then(async () => {
24
- logger.info('logged feed page selector found')
25
- await page.close()
26
- })
27
- .catch(async () => {
28
- logger.warn('successful login element was not found')
29
- const emailError = await page.evaluate(() => {
30
- const e = document.querySelector('div[error-for=username]')
31
- if (!e) { return false }
32
- const style = window.getComputedStyle(e)
33
- return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
34
- })
35
-
36
- const passwordError = await page.evaluate(() => {
37
- const e = document.querySelector('div[error-for=password]')
38
- if (!e) { return false }
39
- const style = window.getComputedStyle(e)
40
- return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
41
- })
42
-
43
- const manualChallengeRequested = await page.evaluate(() => {
44
- const e = document.querySelector('.flow-challenge-content')
45
- if (!e) { return false }
46
- const style = window.getComputedStyle(e)
47
- return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
48
- })
49
-
50
- if (emailError) {
51
- logger.info('wrong username element found')
52
- return Promise.reject(new Error(`linkedin: invalid username: ${email}`))
53
- }
105
+ await clickSignIn(page)
54
106
 
55
- if (passwordError) {
56
- logger.info('wrong password element found')
57
- return Promise.reject(new Error('linkedin: invalid password'))
58
- }
107
+ let hadChallenge = false
108
+
109
+ try {
110
+ await page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 15000 })
111
+ } catch {
112
+ // Navigation timeout is expected — the page may not navigate if already on login
113
+ }
59
114
 
60
- if (page.$(manualChallengeRequested)) {
61
- logger.warn('manual check was required')
62
- return Promise.reject(new Error(`linkedin: manual check was required, verify if your login is properly working manually or report this issue: ${pkg.name} ${pkg.version} ${pkg.bugs.url}`))
115
+ // Wait for either the feed/mynetwork or challenge page
116
+ const maxWaitMs = 120000
117
+ const start = Date.now()
118
+ let resolved = false
119
+ while (Date.now() - start < maxWaitMs && !resolved) {
120
+ const currentUrl = page.url()
121
+ if (LOGGED_IN_PATHS.some((p) => currentUrl.includes(p))) {
122
+ logger.info('logged in, redirected to: ' + currentUrl)
123
+ resolved = true
124
+ break
125
+ }
126
+ if (currentUrl.includes('/checkpoint')) {
127
+ if (!hadChallenge) {
128
+ logger.warn('2FA challenge detected, please complete the verification in the browser window (waiting up to 2 minutes)...')
129
+ hadChallenge = true
63
130
  }
131
+ await new Promise((r) => setTimeout(r, 2000))
132
+ continue
133
+ }
134
+ await new Promise((r) => setTimeout(r, 500))
135
+ }
64
136
 
65
- logger.error('could not find any element to retrieve a proper error')
66
- return Promise.reject(new Error(`${pkg.name} ${pkg.version} login is not working, please report: ${pkg.bugs.url}`))
137
+ if (!resolved) {
138
+ const finalUrl = page.url()
139
+ logger.warn('successful login element was not found, url: ' + finalUrl)
140
+
141
+ const emailError = await page.evaluate(() => {
142
+ const e = document.querySelector('div[error-for=username], #error-for-username')
143
+ if (!e) { return false }
144
+ const style = window.getComputedStyle(e)
145
+ return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
67
146
  })
147
+
148
+ const passwordError = await page.evaluate(() => {
149
+ const e = document.querySelector('div[error-for=password], #error-for-password')
150
+ if (!e) { return false }
151
+ const style = window.getComputedStyle(e)
152
+ return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
153
+ })
154
+
155
+ if (emailError) {
156
+ logger.info('wrong username element found')
157
+ await page.close()
158
+ return Promise.reject(new Error(`linkedin: invalid username: ${email}`))
159
+ }
160
+
161
+ if (passwordError) {
162
+ logger.info('wrong password element found')
163
+ await page.close()
164
+ return Promise.reject(new Error('linkedin: invalid password'))
165
+ }
166
+
167
+ logger.error('could not find any element to retrieve a proper error')
168
+ await page.close()
169
+ return Promise.reject(new Error(`${pkg.name} ${pkg.version} login is not working, please report: ${pkg.bugs.url}`))
170
+ }
171
+
172
+ await page.close()
173
+ return { hadChallenge }
68
174
  }
package/src/openPage.js CHANGED
@@ -1,23 +1,13 @@
1
- const agents = [
2
- 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36'
3
- // "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
4
- // "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
5
- // "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:34.0) Gecko/20100101 Firefox/34.0",
6
- // "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
7
- // "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
8
- // "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
9
- // "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
10
- // "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
11
- ]
1
+ const AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'
12
2
 
13
3
  module.exports = async ({ browser, cookies, url, puppeteerAuthenticate }) => {
14
4
  const page = await browser.newPage()
15
- await page.setDefaultNavigationTimeout(0)
5
+ await page.setDefaultNavigationTimeout(60000)
16
6
 
17
7
  if (cookies) {
18
8
  await page.setCookie(...cookies)
19
9
  }
20
- await page.setUserAgent(agents[Math.floor(Math.random() * agents.length)])
10
+ await page.setUserAgent(AGENT)
21
11
  await page.setExtraHTTPHeaders({ 'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8' })
22
12
  await page.setViewport({
23
13
  width: 1920,
@@ -28,6 +18,6 @@ module.exports = async ({ browser, cookies, url, puppeteerAuthenticate }) => {
28
18
  await page.authenticate(puppeteerAuthenticate)
29
19
  }
30
20
 
31
- await page.goto(url, { waitUntil: 'load' })
21
+ await page.goto(url, { waitUntil: 'networkidle2' })
32
22
  return page
33
23
  }
@@ -2,7 +2,7 @@ const logger = require('../logger')(__filename)
2
2
  const pkg = require('../package')
3
3
 
4
4
  module.exports = (profile) => {
5
- if(!profile?.profile?.name) {
5
+ if (!profile?.profile?.name) {
6
6
  const messageError = `LinkedIn website changed and ${pkg.name} ${pkg.version} can't read basic data. Please report this issue at ${pkg.bugs.url}`
7
7
  logger.error(messageError, '')
8
8
  throw new Error(messageError)
@@ -11,17 +11,17 @@ module.exports = (profile) => {
11
11
  profile.profile.summary = profile?.about?.text
12
12
 
13
13
  profile.positions.forEach((position) => {
14
- if(position.title){
15
- position.title = position.title.replace('Company Name\n', '')
14
+ if (position.title) {
15
+ position.title = position.title.replace('Company Name\n', '')
16
16
  }
17
- if(position.description) {
18
- position.description = position.description.replace('See more', '');
19
- position.description = position.description.replace('see more', '');
20
- position.description = position.description.replace('See less', '');
17
+ if (position.description) {
18
+ position.description = position.description.replace('See more', '')
19
+ position.description = position.description.replace('see more', '')
20
+ position.description = position.description.replace('See less', '')
21
21
  }
22
- if(position.roles) {
22
+ if (position.roles) {
23
23
  position.roles.forEach((role) => {
24
- if(role.title) {
24
+ if (role.title) {
25
25
  role.title = role.title.replace('Title\n', '')
26
26
  }
27
27
  if (role.date) {
@@ -29,7 +29,7 @@ module.exports = (profile) => {
29
29
  role.date2 = role.date.replace('·', '-').split('-')[1].trim()
30
30
  delete role.date
31
31
  }
32
- if(role.description) {
32
+ if (role.description) {
33
33
  role.description = role.description.replace('See more', '')
34
34
  role.description = role.description.replace('see more', '')
35
35
  }
@@ -37,63 +37,62 @@ module.exports = (profile) => {
37
37
  }
38
38
  })
39
39
 
40
- if(profile.recommendations.receivedCount) {
40
+ if (profile.recommendations.receivedCount) {
41
41
  profile.recommendations.receivedCount = profile.recommendations.receivedCount.replace(/[^\d]/g, '')
42
42
  }
43
43
 
44
- if(profile.recommendations.givenCount) {
44
+ if (profile.recommendations.givenCount) {
45
45
  profile.recommendations.givenCount = profile.recommendations.givenCount.replace(/[^\d]/g, '')
46
46
  }
47
47
 
48
- if(profile.recommendations.received) {
48
+ if (profile.recommendations.received) {
49
49
  profile.recommendations.received.forEach((recommendation) => {
50
- if(recommendation.summary){
50
+ if (recommendation.summary) {
51
51
  recommendation.summary = recommendation.summary.replace('See more', '')
52
52
  recommendation.summary = recommendation.summary.replace('See less', '')
53
53
  }
54
54
  })
55
55
  }
56
56
 
57
- if(profile.recommendations.given) {
57
+ if (profile.recommendations.given) {
58
58
  profile.recommendations.given.forEach((recommendation) => {
59
- if(recommendation.summary){
59
+ if (recommendation.summary) {
60
60
  recommendation.summary = recommendation.summary.replace('See more', '')
61
61
  recommendation.summary = recommendation.summary.replace('See less', '')
62
62
  }
63
63
  })
64
64
  }
65
65
 
66
- if(profile.courses){
66
+ if (profile.courses) {
67
67
  profile.courses = profile.courses.map(({ name, year }) => {
68
68
  const coursesObj = {}
69
- if(name) {
69
+ if (name) {
70
70
  coursesObj.name = name.replace('Course name\n', '')
71
71
  }
72
- if(year) {
72
+ if (year) {
73
73
  coursesObj.year = year.replace('Course number\n', '')
74
74
  }
75
75
  return coursesObj
76
- }
77
- );
76
+ })
78
77
  }
79
78
 
80
- if(profile.languages){
79
+ if (profile.languages) {
81
80
  profile.languages = profile.languages.map(({ name, proficiency }) => ({
82
81
  name: name ? name.replace('Language name\n', '') : undefined,
83
- proficiency,
84
- }));
82
+ proficiency
83
+ }))
85
84
  }
86
85
 
87
- if(profile.projects){
86
+ if (profile.projects) {
88
87
  profile.projects = profile.projects.map(
89
88
  ({ name, date, description, link }) => ({
90
89
  name: name ? name.replace('Project name\n', '') : undefined,
91
90
  date,
92
91
  description: description ? description.replace('Project description\n', '') : undefined,
93
- link,
94
- }),
95
- );
92
+ link
93
+ })
94
+ )
96
95
  }
97
-
96
+
98
97
  return profile
99
98
  }
@@ -1,15 +1,15 @@
1
1
  const logger = require('../logger')(__filename)
2
2
  const scrapSection = require('../scrapSection')
3
3
 
4
- const SEE_MORE_SELECTOR = 'a[data-control-name=contact_see_more]'
5
- const CLOSE_MODAL_SELECTOR = '.artdeco-modal__dismiss';
4
+ const SEE_MORE_SELECTOR = '#top-card-text-details-contact-info'
5
+ const CLOSE_MODAL_SELECTOR = '.artdeco-modal__dismiss'
6
6
 
7
7
  const template = {
8
- selector: '.pv-contact-info__contact-type',
8
+ selector: '.pv-contact-info__contact-type, .ci-vanity-url, .ci-email, .ci-phone, .ci-websites, .ci-birthday, .ci-ims, .ci-address',
9
9
  fields: {
10
- type: 'header',
10
+ type: 'header, h3',
11
11
  values: {
12
- selector: '.pv-contact-info__ci-container',
12
+ selector: '.pv-contact-info__ci-container, .t-14',
13
13
  isMultipleFields: true
14
14
  },
15
15
  links: {
@@ -18,31 +18,29 @@ const template = {
18
18
  isMultipleFields: true
19
19
  }
20
20
  }
21
- }
22
- const getContactInfo = async(page) => {
23
- await page.waitFor(SEE_MORE_SELECTOR, { timeout: 2000 })
21
+ }
22
+ const getContactInfo = async (page) => {
23
+ await page.waitForSelector(SEE_MORE_SELECTOR, { timeout: 2000 })
24
24
  .catch(() => {
25
- logger.warn('contact-info', 'selector not found')
25
+ logger.warn('contact-info selector not found')
26
26
  return {}
27
27
  })
28
28
 
29
29
  const element = await page.$(SEE_MORE_SELECTOR)
30
- if(element){
30
+ if (element) {
31
31
  await element.click()
32
- const contactInfoIndicatorSelector = '#pv-contact-info'
33
- await page.waitFor(contactInfoIndicatorSelector, { timeout: 5000 })
34
- .catch(() => {
35
- logger.warn('contact info was not found')
36
- })
37
-
32
+ const contactInfoIndicatorSelector = '.pv-profile-section__section-info, .artdeco-modal__content'
33
+ await page.waitForSelector(contactInfoIndicatorSelector, { timeout: 5000 })
34
+ .catch(() => {
35
+ logger.warn('contact info was not found')
36
+ })
37
+
38
38
  const contactInfo = await scrapSection(page, template)
39
39
  const closeButton = await page.$(CLOSE_MODAL_SELECTOR)
40
- if(closeButton)
41
- await closeButton.click()
40
+ if (closeButton) { await closeButton.click() }
42
41
 
43
42
  return contactInfo
44
43
  }
45
-
46
44
  }
47
45
 
48
46
  module.exports = getContactInfo
@@ -1,81 +1,276 @@
1
1
  const openPage = require('../openPage')
2
- const scrapSection = require('../scrapSection')
3
- const scrapAccomplishmentPanel = require('./scrapAccomplishmentPanel')
4
2
  const scrollToPageBottom = require('./scrollToPageBottom')
5
3
  const seeMoreButtons = require('./seeMoreButtons')
6
4
  const contactInfo = require('./contactInfo')
7
- const template = require('./profileScraperTemplate')
8
5
  const cleanProfileData = require('./cleanProfileData')
9
6
 
10
7
  const logger = require('../logger')(__filename)
11
8
 
9
+ const extractProfileData = async (page) => {
10
+ return page.evaluate(() => {
11
+ const txt = (el) => el ? (el.textContent || '').trim() : ''
12
+
13
+ const findSection = (headingText) => {
14
+ const sections = document.querySelectorAll('section')
15
+ for (const s of sections) {
16
+ const h2 = s.querySelector('h2')
17
+ if (h2 && h2.textContent.trim().startsWith(headingText)) return s
18
+ }
19
+ return null
20
+ }
21
+
22
+ const getContentDivs = (section) => {
23
+ if (!section) return []
24
+ const h2 = section.querySelector('h2')
25
+ if (!h2) return []
26
+ const content = h2.parentElement?.nextElementSibling
27
+ if (!content) return []
28
+ return [...content.querySelectorAll(':scope > div')].filter(d => d.querySelector('p'))
29
+ }
30
+
31
+ // Profile top card
32
+ const sections = document.querySelectorAll('section')
33
+ let topSection = null
34
+ for (const s of sections) {
35
+ const h2 = s.querySelector('h2')
36
+ if (h2 && ['0 notifications', 'Suggested for you', 'Analytics', 'Activity'].includes(h2.textContent.trim())) continue
37
+ if (s.textContent.trim().length > 200) { topSection = s; break }
38
+ }
39
+
40
+ const nameH1 = document.querySelector('h1')
41
+ const profileName = nameH1 ? txt(nameH1) : document.title.replace(' | LinkedIn', '').trim()
42
+ let headline = ''
43
+ let location = ''
44
+ if (topSection) {
45
+ const allP = [...topSection.querySelectorAll('p')]
46
+ headline = txt(allP[0])
47
+ for (const p of allP) {
48
+ const t = txt(p)
49
+ if (t.includes(',') && !t.includes('at ') && !t.includes('\u00B7') && !t.includes('follow')) {
50
+ location = t
51
+ break
52
+ }
53
+ }
54
+ }
55
+ const photoImg = document.querySelector('img[src*="profile-displayphoto"]')
56
+ const profile = {
57
+ name: profileName,
58
+ headline,
59
+ location,
60
+ connections: '',
61
+ imageurl: photoImg ? photoImg.getAttribute('src') || '' : ''
62
+ }
63
+
64
+ // Experience
65
+ const positions = []
66
+ const expSection = findSection('Experience')
67
+ if (expSection) {
68
+ const companyGroups = expSection.querySelectorAll('[componentkey^="entity-collection"]')
69
+ companyGroups.forEach(group => {
70
+ const allP = [...group.querySelectorAll('p')].filter(p => txt(p).length > 0)
71
+ if (allP.length === 0) return
72
+ const companyName = txt(allP[0])
73
+ const companyLink = group.querySelector('a[href*="/company/"]')
74
+ const companyUrl = companyLink ? companyLink.getAttribute('href') : ''
75
+
76
+ const positionLis = group.querySelectorAll('ul > li')
77
+ if (positionLis.length > 0) {
78
+ positionLis.forEach(li => {
79
+ const liPs = [...li.querySelectorAll('p')].filter(p => txt(p).length > 0)
80
+ if (liPs.length === 0) return
81
+ const descEl = li.querySelector('[data-testid="expandable-text-box"]')
82
+ let dateStr = ''
83
+ for (let i = 1; i < liPs.length; i++) {
84
+ const t = txt(liPs[i])
85
+ if (t.includes('\u00B7') || /\d{4}/.test(t)) { dateStr = t; break }
86
+ }
87
+ const dateRange = dateStr.split('\u00B7')[0].trim()
88
+ const dateParts = dateRange.split(' - ')
89
+ positions.push({
90
+ title: txt(liPs[0]),
91
+ companyName,
92
+ link: companyUrl,
93
+ url: companyUrl,
94
+ location: '',
95
+ description: descEl ? txt(descEl) : '',
96
+ date: dateRange,
97
+ date1: dateParts[0] ? dateParts[0].trim() : '',
98
+ date2: dateParts[1] ? dateParts[1].trim() : ''
99
+ })
100
+ })
101
+ } else {
102
+ const descEl = group.querySelector('[data-testid="expandable-text-box"]')
103
+ let dateStr = ''
104
+ for (let i = 1; i < allP.length; i++) {
105
+ const t = txt(allP[i])
106
+ if (t.includes('\u00B7') || /\d{4}/.test(t.split(' ')[0])) { dateStr = t; break }
107
+ }
108
+ const dateRange = dateStr.split('\u00B7')[0].trim()
109
+ const dateParts = dateRange.split(' - ')
110
+ positions.push({
111
+ title: txt(allP[3]) || txt(allP[1]),
112
+ companyName,
113
+ link: companyUrl,
114
+ url: companyUrl,
115
+ location: txt(allP[2]) || '',
116
+ description: descEl ? txt(descEl) : '',
117
+ date: dateRange,
118
+ date1: dateParts[0] ? dateParts[0].trim() : '',
119
+ date2: dateParts[1] ? dateParts[1].trim() : ''
120
+ })
121
+ }
122
+ })
123
+ }
124
+
125
+ // Education
126
+ const educations = []
127
+ const eduSection = findSection('Education')
128
+ const eduDivs = getContentDivs(eduSection)
129
+ eduDivs.forEach(div => {
130
+ const ps = [...div.querySelectorAll('p')].filter(p => txt(p).length > 0)
131
+ if (ps.length === 0) return
132
+ const schoolLink = div.querySelector('a[href*="/school/"]')
133
+ const url = schoolLink ? schoolLink.getAttribute('href') : ''
134
+ const dateStr = txt(ps[2]) || ''
135
+ const dateParts = dateStr.split('\u2013')
136
+ educations.push({
137
+ title: txt(ps[0]),
138
+ degree: txt(ps[1]) || '',
139
+ fieldOfStudy: txt(ps[1]) || '',
140
+ url,
141
+ date1: dateParts[0] ? dateParts[0].trim() : '',
142
+ date2: dateParts[1] ? dateParts[1].trim() : '',
143
+ description: ''
144
+ })
145
+ })
146
+
147
+ // Skills
148
+ const skills = []
149
+ const skillsSection = findSection('Skills')
150
+ if (skillsSection) {
151
+ const allSectionP = [...skillsSection.querySelectorAll('p')].filter(p => txt(p).length > 0)
152
+ for (let i = 0; i < allSectionP.length; i += 2) {
153
+ const name = txt(allSectionP[i])
154
+ if (name && !name.includes('Show all') && !name.includes('Private')) {
155
+ skills.push({ title: name, count: '' })
156
+ }
157
+ }
158
+ }
159
+
160
+ // Languages
161
+ const languages = []
162
+ const langSection = findSection('Languages')
163
+ if (langSection) {
164
+ const langDivs = getContentDivs(langSection)
165
+ langDivs.forEach(div => {
166
+ const ps = [...div.querySelectorAll('p')].filter(p => txt(p).length > 0)
167
+ for (let i = 0; i < ps.length; i += 2) {
168
+ const n = txt(ps[i])
169
+ const pr = ps[i + 1] ? txt(ps[i + 1]) : ''
170
+ if (n) languages.push({ name: n, proficiency: pr })
171
+ }
172
+ })
173
+ }
174
+
175
+ // Projects
176
+ const projects = []
177
+ const projSection = findSection('Projects')
178
+ const projDivs = getContentDivs(projSection)
179
+ projDivs.forEach(div => {
180
+ const ps = [...div.querySelectorAll('p')].filter(p => txt(p).length > 0)
181
+ if (ps.length === 0) return
182
+ const descEl = div.querySelector('[data-testid="expandable-text-box"]')
183
+ const link = div.querySelector('a[href*="http"]')
184
+ projects.push({
185
+ name: txt(ps[0]),
186
+ date: txt(ps[1]) || '',
187
+ description: descEl ? txt(descEl) : '',
188
+ link: link ? link.getAttribute('href') : ''
189
+ })
190
+ })
191
+
192
+ // Certifications
193
+ const accomplishments = []
194
+ const certSection = findSection('Licenses')
195
+ const certDivs = getContentDivs(certSection)
196
+ certDivs.forEach(div => {
197
+ const ps = [...div.querySelectorAll('p')].filter(p => txt(p).length > 0)
198
+ if (ps.length > 0) accomplishments.push({ title: txt(ps[0]), count: '', items: [] })
199
+ })
200
+
201
+ return {
202
+ profile,
203
+ about: { text: '' },
204
+ positions,
205
+ educations,
206
+ skills,
207
+ recommendations: { givenCount: '0', receivedCount: '0', given: [], received: [] },
208
+ accomplishments,
209
+ courses: [],
210
+ languages,
211
+ projects,
212
+ peopleAlsoViewed: [],
213
+ volunteerExperience: [],
214
+ contact: []
215
+ }
216
+ })
217
+ }
218
+
12
219
  module.exports = async (browser, cookies, url, waitTimeToScrapMs = 500, hasToGetContactInfo = false, puppeteerAuthenticate = undefined) => {
13
220
  logger.info(`starting scraping url: ${url}`)
14
221
 
15
222
  const page = await openPage({ browser, cookies, url, puppeteerAuthenticate })
16
- const profilePageIndicatorSelector = '.pv-profile-section'
17
- await page.waitForSelector(profilePageIndicatorSelector, { timeout: 5000 })
223
+
224
+ // Check for authwall (expired session)
225
+ const isAuthwall = await page.evaluate(() =>
226
+ window.location.href.includes('/authwall') || document.title.toLowerCase().includes('inschrijven')
227
+ )
228
+ if (isAuthwall) {
229
+ await page.close()
230
+ throw new Error('authwall: LinkedIn session expired, re-authentication required')
231
+ }
232
+
233
+ // Accept cookie consent if present
234
+ await page.evaluate(() => {
235
+ const btn = Array.from(document.querySelectorAll('button')).find(
236
+ (b) => b.textContent.trim().toLowerCase() === 'accept'
237
+ )
238
+ if (btn) btn.click()
239
+ })
240
+ await new Promise((r) => setTimeout(r, 2000))
241
+
242
+ // Wait for sections to appear (profile content is SDUI rendered)
243
+ await page.waitForFunction(() => {
244
+ return document.querySelectorAll('section h2').length > 1
245
+ }, { timeout: 30000 })
18
246
  .catch(() => {
19
- //why doesn't throw error instead of continuing scraping?
20
- //because it can be just a false negative meaning LinkedIn only changed that selector but everything else is fine :)
21
- logger.warn('profile selector was not found')
247
+ logger.warn('profile content did not fully render in time')
22
248
  })
23
249
 
24
250
  logger.info('scrolling page to the bottom')
25
251
  await scrollToPageBottom(page)
26
-
27
- if(waitTimeToScrapMs) {
28
- logger.info(`applying 1st delay`)
29
- await new Promise((resolve) => { setTimeout(() => { resolve() }, waitTimeToScrapMs / 2)})
30
- }
252
+
253
+ // Wait for lazy-loaded sections to render after scrolling
254
+ await new Promise((resolve) => { setTimeout(resolve, 2000) })
255
+
256
+ // Scroll again in case new content was loaded
257
+ await scrollToPageBottom(page)
258
+ await new Promise((resolve) => { setTimeout(resolve, 1000) })
31
259
 
32
260
  await seeMoreButtons.clickAll(page)
33
261
 
34
- if(waitTimeToScrapMs) {
35
- logger.info(`applying 2nd (and last) delay`)
36
- await new Promise((resolve) => { setTimeout(() => { resolve() }, waitTimeToScrapMs / 2)})
37
- }
262
+ // Final wait for content to settle
263
+ await new Promise((resolve) => { setTimeout(resolve, 1000) })
38
264
 
39
- const [profile] = await scrapSection(page, template.profile)
40
- const [about] = await scrapSection(page, template.about)
41
- const positions = await scrapSection(page, template.positions)
42
- const educations = await scrapSection(page, template.educations)
43
- const [recommendationsCount] = await scrapSection(page, template.recommendationsCount)
44
- const recommendationsReceived = await scrapSection(page, template.recommendationsReceived)
45
- const recommendationsGiven = await scrapSection(page, template.recommendationsGiven)
46
- const skills = await scrapSection(page, template.skills)
47
- const accomplishments = await scrapSection(page, template.accomplishments)
48
- const courses = await scrapAccomplishmentPanel(page, 'courses')
49
- const languages = await scrapAccomplishmentPanel(page, 'languages')
50
- const projects = await scrapAccomplishmentPanel(page, 'projects')
51
- const volunteerExperience = await scrapSection(page, template.volunteerExperience)
52
- const peopleAlsoViewed = await scrapSection(page, template.peopleAlsoViewed)
53
- const contact = hasToGetContactInfo ? await contactInfo(page) : []
265
+ const rawProfile = await extractProfileData(page)
266
+
267
+ if (hasToGetContactInfo) {
268
+ rawProfile.contact = await contactInfo(page) || []
269
+ }
54
270
 
55
271
  await page.close()
56
272
  logger.info(`finished scraping url: ${url}`)
57
273
 
58
- const rawProfile = {
59
- profile,
60
- about,
61
- positions,
62
- educations,
63
- skills,
64
- recommendations: {
65
- givenCount: recommendationsCount ? recommendationsCount.given : "0",
66
- receivedCount: recommendationsCount ? recommendationsCount.received : "0",
67
- given: recommendationsReceived,
68
- received: recommendationsGiven
69
- },
70
- accomplishments,
71
- courses,
72
- languages,
73
- projects,
74
- peopleAlsoViewed,
75
- volunteerExperience,
76
- contact
77
- }
78
-
79
274
  const cleanedProfile = cleanProfileData(rawProfile)
80
275
  return cleanedProfile
81
276
  }
@@ -1,189 +1,186 @@
1
- const profileSelector = '.core-rail > *:first-child section >'
2
-
3
1
  const template = {
4
2
  profile: {
5
- selector: '.pb5',
3
+ selector: '.scaffold-layout__main',
6
4
  fields: {
7
- name: `.text-heading-xlarge`,
8
- headline: `.text-body-medium`,
9
- location: `.pb2 .text-body-small`,
10
- connections: `li.text-body-small`,
5
+ name: '.text-heading-xlarge',
6
+ headline: '.text-body-medium',
7
+ location: '.text-body-small.inline.t-black--light.break-words',
8
+ connections: '.t-bold',
11
9
  imageurl: {
12
- selector: `img.pv-top-card__photo`,
10
+ selector: 'img.pv-top-card-profile-picture__image',
13
11
  attribute: 'src'
14
12
  }
15
13
  }
16
14
  },
17
15
  about: {
18
- selector: '.pv-about-section',
16
+ selector: '#about ~ .display-flex .inline-show-more-text',
19
17
  fields: {
20
- text: 'div'
18
+ text: 'span[aria-hidden="true"]'
21
19
  }
22
20
  },
23
21
  positions: {
24
- selector: 'div[id="experience"] + div + div li.artdeco-list__item',
22
+ selector: '#experience ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
25
23
  fields: {
26
- title: 'h3',
24
+ title: '.mr1.hoverable-link-text.t-bold > span',
27
25
  link: {
28
- selector: 'a',
29
- attribute: 'href',
26
+ selector: 'a.optional-action-target-wrapper',
27
+ attribute: 'href'
30
28
  },
31
29
  url: {
32
- selector: 'a',
30
+ selector: 'a.optional-action-target-wrapper',
33
31
  attribute: 'href'
34
32
  },
35
- companyName: 'div.t-bold span:first-child',
36
- location: '.pv-entity__location span:last-child',
37
- description: '.pv-entity__description',
38
- date1: '.pv-entity__date-range span:last-child',
39
- date2: '.pv-entity__bullet-item-v2',
33
+ companyName: '.t-14.t-normal > span',
34
+ location: '.t-14.t-normal.t-black--light > span',
35
+ description: '.pvs-list__outer-container .inline-show-more-text span[aria-hidden="true"]',
36
+ date1: '.pvs-entity__caption-wrapper',
37
+ date2: '.pvs-entity__caption-wrapper',
40
38
  roles: {
41
- selector: 'div.full-width > div > ul div[data-view-name]',
39
+ selector: '.pvs-entity__sub-components li.pvs-list__paged-list-item',
42
40
  hasChildrenFields: true,
43
41
  fields: {
44
- title: 'div.t-bold span:first-child',
45
- description: 'div.display-flex div.full-width > span[aria-hidden]',
42
+ title: '.mr1.hoverable-link-text.t-bold > span',
43
+ description: '.inline-show-more-text span[aria-hidden="true"]',
46
44
  date: '.pvs-entity__caption-wrapper',
47
- location: '.pv-entity__location span:last-child'
45
+ location: '.t-14.t-normal.t-black--light > span'
48
46
  }
49
47
  }
50
48
  }
51
49
  },
52
50
  educations: {
53
- selector: '#education-section li',
51
+ selector: '#education ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
54
52
  fields: {
55
- title: 'h3',
56
- degree: 'span[class=pv-entity__comma-item]',
53
+ title: '.hoverable-link-text.t-bold > span',
54
+ degree: '.t-14.t-normal > span',
57
55
  url: {
58
56
  selector: 'a',
59
57
  attribute: 'href'
60
58
  },
61
- fieldOfStudy: 'p.pv-entity__fos span:nth-child(2)',
62
- date1: '.pv-entity__dates time:nth-child(1)',
63
- date2: '.pv-entity__dates time:nth-child(2)',
64
- description: '.pv-entity__description'
59
+ fieldOfStudy: '.t-14.t-normal > span',
60
+ date1: '.pvs-entity__caption-wrapper',
61
+ date2: '.pvs-entity__caption-wrapper',
62
+ description: '.inline-show-more-text span[aria-hidden="true"]'
65
63
  }
66
64
  },
67
65
  skills: {
68
- selector: '.pv-skill-category-entity__skill-wrapper',
66
+ selector: '#skills ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
69
67
  fields: {
70
- title: '.pv-skill-category-entity__name-text',
71
- count: '.pv-skill-category-entity__endorsement-count'
68
+ title: '.mr1.hoverable-link-text.t-bold > span',
69
+ count: '.t-14.t-normal.t-black--light > span'
72
70
  }
73
71
  },
74
72
  recommendationsCount: {
75
- selector: '.recommendations-inlining',
73
+ selector: '#recommendations ~ .pvs-list__outer-container',
76
74
  fields: {
77
75
  received: '.artdeco-tab:nth-child(1)',
78
76
  given: '.artdeco-tab:nth-child(2)'
79
77
  }
80
78
  },
81
79
  recommendationsReceived: {
82
- selector: '.recommendations-inlining',
80
+ selector: '#recommendations ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
83
81
  fields: {
84
82
  user: {
85
- selector: '.pv-recommendation-entity__member',
83
+ selector: 'a',
86
84
  attribute: 'href'
87
85
  },
88
- text: 'blockquote.pv-recommendation-entity__text',
86
+ text: '.inline-show-more-text span[aria-hidden="true"]',
89
87
  profileImage: {
90
- selector: 'a img',
88
+ selector: 'img',
91
89
  attribute: 'src'
92
90
  },
93
91
  name: {
94
- selector: 'a h3'
92
+ selector: '.t-bold > span'
95
93
  },
96
94
  userDescription: {
97
- selector: '.pv-recommendation-entity__headline'
95
+ selector: '.t-14.t-normal > span'
98
96
  }
99
97
  }
100
98
  },
101
99
  recommendationsGiven: {
102
- selector: '.artdeco-tabpanel li.pv-recommendation-entity',
100
+ selector: '#recommendations ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
103
101
  fields: {
104
102
  user: {
105
- selector: '.pv-recommendation-entity__member',
103
+ selector: 'a',
106
104
  attribute: 'href'
107
105
  },
108
- text: 'blockquote.pv-recommendation-entity__text',
106
+ text: '.inline-show-more-text span[aria-hidden="true"]',
109
107
  profileImage: {
110
- selector: 'a img',
108
+ selector: 'img',
111
109
  attribute: 'src'
112
110
  },
113
111
  name: {
114
- selector: 'a h3'
112
+ selector: '.t-bold > span'
115
113
  },
116
114
  userDescription: {
117
- selector: '.pv-recommendation-entity__headline'
115
+ selector: '.t-14.t-normal > span'
118
116
  }
119
117
  }
120
118
  },
121
119
  accomplishments: {
122
- selector: '.pv-accomplishments-section > div',
120
+ selector: '#honors_and_awards ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
123
121
  fields: {
124
- count: 'h3 span:last-child',
125
- title: '.pv-accomplishments-block__title',
122
+ count: '.t-14.t-normal.t-black--light > span',
123
+ title: '.mr1.hoverable-link-text.t-bold > span',
126
124
  items: {
127
- selector: 'li',
125
+ selector: '.pvs-list__outer-container li',
128
126
  isMultipleFields: true
129
127
  }
130
128
  }
131
129
  },
132
130
  peopleAlsoViewed: {
133
- selector: 'li.pv-browsemap-section__member-container',
131
+ selector: '.pv-browsemap-section li',
134
132
  fields: {
135
133
  user: {
136
134
  selector: 'a',
137
135
  attribute: 'href'
138
136
  },
139
- text: 'p',
137
+ text: '.t-14.t-normal',
140
138
  profileImage: {
141
- selector: 'a img',
139
+ selector: 'img',
142
140
  attribute: 'src'
143
141
  },
144
142
  name: {
145
- selector: '.name'
143
+ selector: '.t-bold'
146
144
  }
147
145
  }
148
146
  },
149
147
  volunteerExperience: {
150
- selector: 'section.volunteering-section li',
148
+ selector: '#volunteering_experience ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
151
149
  fields: {
152
- title: 'h3',
153
- experience: 'span[class=pv-entity__secondary-title]',
154
- location: '.pv-entity__location span:nth-child(2)',
155
- description: '.pv-volunteer-causes',
156
- date1: '.pv-entity__date-range span:nth-child(2)',
157
- date2: '.pv-entity__bullet-item'
150
+ title: '.mr1.hoverable-link-text.t-bold > span',
151
+ experience: '.t-14.t-normal > span',
152
+ location: '.t-14.t-normal.t-black--light > span',
153
+ description: '.inline-show-more-text span[aria-hidden="true"]',
154
+ date1: '.pvs-entity__caption-wrapper',
155
+ date2: '.pvs-entity__caption-wrapper'
158
156
  }
159
157
  },
160
158
  courses: {
161
- selector: '.pv-accomplishments-section',
159
+ selector: '#courses ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
162
160
  fields: {
163
- name: '.pv-accomplishment-entity__title',
164
- year: '.pv-accomplishment-entity__course-number'
161
+ name: '.mr1.hoverable-link-text.t-bold > span',
162
+ year: '.t-14.t-normal > span'
165
163
  }
166
164
  },
167
165
  languages: {
168
- selector: '.pv-accomplishments-block.languages li',
166
+ selector: '#languages ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
169
167
  fields: {
170
- name: '.pv-accomplishment-entity__title',
171
- proficiency: '.pv-accomplishment-entity__proficiency',
168
+ name: '.mr1.t-bold > span',
169
+ proficiency: '.t-14.t-normal.t-black--light > span'
172
170
  }
173
171
  },
174
172
  projects: {
175
- selector: '.pv-accomplishments-block.projects li',
173
+ selector: '#projects ~ .pvs-list__outer-container .pvs-list > li.pvs-list__paged-list-item',
176
174
  fields: {
177
- name: '.pv-accomplishment-entity__title',
178
- date: '.pv-accomplishment-entity__date',
179
- description: '.pv-accomplishment-entity__description',
175
+ name: '.mr1.hoverable-link-text.t-bold > span',
176
+ date: '.pvs-entity__caption-wrapper',
177
+ description: '.inline-show-more-text span[aria-hidden="true"]',
180
178
  link: {
181
- selector: '.mt4',
179
+ selector: 'a',
182
180
  attribute: 'href'
183
181
  }
184
182
  }
185
183
  }
186
184
  }
187
185
 
188
-
189
186
  module.exports = template
@@ -1,18 +1,11 @@
1
- const scrapSection = require('../scrapSection');
2
- const template = require('./profileScraperTemplate');
1
+ const scrapSection = require('../scrapSection')
2
+ const template = require('./profileScraperTemplate')
3
3
 
4
4
  const scrapAccomplishmentPanel = async (page, section) => {
5
- const queryString = `.pv-accomplishments-block.${section} button`
6
-
7
- const openingButton = await page.$(queryString);
8
-
9
- if (openingButton) {
10
- await page.evaluate((q) => {
11
- document.querySelector(q).click();
12
- }, queryString);
13
-
14
- return scrapSection(page, template[section]);
5
+ if (!template[section]) {
6
+ return []
15
7
  }
16
- };
8
+ return scrapSection(page, template[section])
9
+ }
17
10
 
18
- module.exports = scrapAccomplishmentPanel;
11
+ module.exports = scrapAccomplishmentPanel
@@ -3,21 +3,20 @@ const logger = require('../logger')(__filename)
3
3
  module.exports = async (page) => {
4
4
  const MAX_TIMES_TO_SCROLL = 25
5
5
  const TIMEOUT_BETWEEN_SCROLLS = 500
6
- const PAGE_BOTTOM_SELECTOR_STRING = '#expanded-footer'
7
6
 
8
7
  for (let i = 0; i < MAX_TIMES_TO_SCROLL; i++) {
9
8
  await page.evaluate(() => window.scrollBy(0, window.innerHeight))
10
9
 
11
- const hasReachedEnd = await page.waitForSelector(PAGE_BOTTOM_SELECTOR_STRING, {
12
- visible: true,
13
- timeout: TIMEOUT_BETWEEN_SCROLLS
14
- }).catch(() => {
15
- logger.info(`scrolling to page bottom (${i + 1})`)
10
+ const hasReachedEnd = await page.evaluate(() => {
11
+ return (window.innerHeight + window.scrollY) >= (document.body.scrollHeight - 200)
16
12
  })
17
13
 
18
14
  if (hasReachedEnd) {
19
15
  return
20
16
  }
17
+
18
+ await new Promise(resolve => setTimeout(resolve, TIMEOUT_BETWEEN_SCROLLS))
19
+ logger.info(`scrolling to page bottom (${i + 1})`)
21
20
  }
22
21
 
23
22
  logger.warn('page bottom not found')
@@ -1,42 +1,18 @@
1
1
  const logger = require('../logger')(__filename)
2
- const seeMoreButtons = [
3
- {
4
- id: 'SHOW_MORE_ABOUT',
5
- selector: '#line-clamp-show-more-button'
6
- },{
7
- id: 'SHOW_MORE_EXPERIENCES',
8
- selector: '#experience-section .pv-profile-section__see-more-inline'
9
- },{
10
- id: 'SEE_MORE_EXPERIENCES',
11
- selector: '#experience-section .inline-show-more-text__button'
12
- },{
13
- id: 'SHOW_MORE_CERTIFICATIONS',
14
- selector: '#certifications-section .pv-profile-section__see-more-inline'
15
- },{
16
- id: 'SHOW_MORE_SKILLS',
17
- selector: '.pv-skills-section__additional-skills'
18
- },{
19
- id: 'SEE_MORE_RECOMMENDATIONS',
20
- selector: '.recommendations-inlining #line-clamp-show-more-button'
21
- }
22
- ]
23
-
24
2
 
25
- const clickAll = async(page) => {
26
- for(let i = 0; i < seeMoreButtons.length; i++){
27
- const button = seeMoreButtons[i]
28
- const elems = await page.$$(button.selector)
3
+ const clickAll = async (page) => {
4
+ const clicked = await page.evaluate(() => {
5
+ let count = 0
6
+ // Only click expandable text buttons (inline expand, not navigation)
7
+ const expandButtons = document.querySelectorAll('[data-testid="expandable-text-button"]')
8
+ expandButtons.forEach(btn => { btn.click(); count++ })
9
+ return count
10
+ })
29
11
 
30
- for(let j = 0; j < elems.length; j++){
31
- const elem = elems[j]
32
- if (elem) {
33
- await elem.click()
34
- .catch((e) => logger.warn(`couldn't click on ${button.selector}, it's probably invisible`))
35
- }
36
- }
12
+ if (clicked > 0) {
13
+ logger.info(`clicked ${clicked} show-more buttons`)
14
+ await new Promise(resolve => setTimeout(resolve, 500))
37
15
  }
38
-
39
- return
40
16
  }
41
17
 
42
18
  module.exports = { clickAll }
package/src/scrapedin.js CHANGED
@@ -4,7 +4,22 @@ const profile = require('./profile/profile')
4
4
  const company = require('./company/company')
5
5
  const logger = require('./logger')(__filename)
6
6
 
7
- module.exports = async ({ cookies, email, password, isHeadless, hasToLog, hasToGetContactInfo, puppeteerArgs, puppeteerAuthenticate, endpoint } = { isHeadless: true, hasToLog: false }) => {
7
+ const saveBrowserCookies = async (browser, cookiesPath) => {
8
+ if (!cookiesPath) return
9
+ try {
10
+ const pages = await browser.pages()
11
+ if (pages.length > 0) {
12
+ const pageCookies = await pages[0].cookies()
13
+ const fs = require('fs')
14
+ fs.writeFileSync(cookiesPath, JSON.stringify(pageCookies, null, 2))
15
+ logger.info('cookies saved to: ' + cookiesPath)
16
+ }
17
+ } catch (e) {
18
+ logger.warn('failed to save cookies: ' + e.message)
19
+ }
20
+ }
21
+
22
+ module.exports = async ({ cookies, email, password, isHeadless, hasToLog, hasToGetContactInfo, cookiesPath, puppeteerArgs, puppeteerAuthenticate, endpoint } = { isHeadless: true, hasToLog: false }) => {
8
23
  if (!hasToLog) {
9
24
  logger.stopLogging()
10
25
  }
@@ -26,7 +41,11 @@ module.exports = async ({ cookies, email, password, isHeadless, hasToLog, hasToG
26
41
  logger.info('email and password was provided, we\'re going to login...')
27
42
 
28
43
  try {
29
- await login(browser, email, password, logger)
44
+ const loginResult = await login(browser, email, password, logger)
45
+ // Only save cookies if login fully completed (no 2FA challenge)
46
+ if (loginResult && !loginResult.hadChallenge && cookiesPath) {
47
+ await saveBrowserCookies(browser, cookiesPath)
48
+ }
30
49
  } catch (e) {
31
50
  if (!endpoint) {
32
51
  await browser.close()