@mvegter/scrapedin 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/login.js +156 -52
- package/src/openPage.js +1 -1
- package/src/profile/profile.js +145 -90
- package/src/scrapedin.js +21 -2
package/package.json
CHANGED
package/src/login.js
CHANGED
|
@@ -2,69 +2,173 @@ const openPage = require('./openPage')
|
|
|
2
2
|
const logger = require('./logger')(__filename)
|
|
3
3
|
const pkg = require('./package')
|
|
4
4
|
|
|
5
|
+
// CSS selectors for known cookie-consent "accept" buttons.
// Only plain CSS belongs here: page.$ hands the string to the browser's
// selector engine, and Playwright-style pseudo-classes such as :has-text()
// are not valid CSS, so they can never match through this API.
const ACCEPT_COOKIES_SELECTORS = [
  'button[action-type="ACCEPT"]',
  '.cookie-consent-v2__button--accept'
]

// Lower-cased, trimmed button captions that count as "accept" when none of
// the selectors above match (English and Dutch banners).
const ACCEPT_COOKIES_LABELS = ['accept', 'alle accepteren']

/**
 * Dismisses the cookie-consent banner when one is present.
 *
 * Tries each selector in ACCEPT_COOKIES_SELECTORS first, then falls back to
 * looking for a <button> whose caption is one of ACCEPT_COOKIES_LABELS.
 * This is strictly best effort: every failure is swallowed so a missing or
 * unclickable banner never aborts the caller.
 *
 * @param {object} page - Puppeteer-like page exposing `$` and `evaluate`.
 * @returns {Promise<boolean>} true when a button was clicked, false otherwise.
 */
const acceptCookies = async (page) => {
  // Give the banner a moment to disappear after a click.
  const settle = () => new Promise((resolve) => setTimeout(resolve, 1000))

  for (const selector of ACCEPT_COOKIES_SELECTORS) {
    try {
      const btn = await page.$(selector)
      if (btn) {
        await btn.click()
        await settle()
        return true
      }
    } catch {
      // The element may be missing or not clickable; try the next selector.
    }
  }

  // Fallback: match the button caption inside the page.
  try {
    const clicked = await page.evaluate((labels) => {
      const btn = Array.from(document.querySelectorAll('button')).find(
        (b) => labels.includes((b.textContent || '').trim().toLowerCase())
      )
      if (!btn) return false
      btn.click()
      return true
    }, ACCEPT_COOKIES_LABELS)

    if (clicked) {
      await settle()
      return true
    }
  } catch {
    // Best effort only; the caller continues without consent being accepted.
  }

  return false
}
|
|
39
|
+
|
|
40
|
+
const fillField = async (page, fieldValue) => {
|
|
41
|
+
// LinkedIn renders two sets of inputs: hidden (CSS-only, not visible) and visible.
|
|
42
|
+
// We find all <input> elements matching the autocomplete attribute, then pick
|
|
43
|
+
// the first one that is actually visible (has non-zero dimensions).
|
|
44
|
+
const autocomplete = fieldValue === 'username webauthn' ? 'username' : 'current-password'
|
|
45
|
+
|
|
46
|
+
const visibleInput = await page.evaluate((auto, val) => {
|
|
47
|
+
const inputs = Array.from(document.querySelectorAll(`input[autocomplete="${auto}"]`))
|
|
48
|
+
for (const input of inputs) {
|
|
49
|
+
const rect = input.getBoundingClientRect()
|
|
50
|
+
if (rect.width > 0 && rect.height > 0) {
|
|
51
|
+
const style = window.getComputedStyle(input)
|
|
52
|
+
if (style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0') {
|
|
53
|
+
input.focus()
|
|
54
|
+
input.value = ''
|
|
55
|
+
return true
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
return false
|
|
60
|
+
}, autocomplete, fieldValue)
|
|
61
|
+
|
|
62
|
+
if (visibleInput) {
|
|
63
|
+
await page.keyboard.type(fieldValue, { delay: 50 })
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
const clickSignIn = async (page) => {
|
|
68
|
+
// Find the "Sign in" button, excluding "Sign in with Apple" etc.
|
|
69
|
+
await page.evaluate(() => {
|
|
70
|
+
const buttons = Array.from(document.querySelectorAll('button'))
|
|
71
|
+
const signInBtn = buttons.find(
|
|
72
|
+
(b) => {
|
|
73
|
+
const text = b.textContent.trim().toLowerCase()
|
|
74
|
+
return (text === 'sign in' || text === 'inloggen' || text === 'aanmelden') &&
|
|
75
|
+
!text.includes('apple')
|
|
76
|
+
}
|
|
77
|
+
)
|
|
78
|
+
if (signInBtn) {
|
|
79
|
+
signInBtn.click()
|
|
80
|
+
return true
|
|
81
|
+
}
|
|
82
|
+
return false
|
|
83
|
+
})
|
|
84
|
+
}
|
|
85
|
+
|
|
86
|
+
const LOGGED_IN_PATHS = ['/feed', '/mynetwork', '/in/']
|
|
87
|
+
|
|
5
88
|
module.exports = async (browser, email, password) => {
|
|
6
89
|
const url = 'https://www.linkedin.com/login'
|
|
7
90
|
const page = await openPage({ browser, url })
|
|
8
91
|
logger.info(`logging at: ${url}`)
|
|
9
92
|
|
|
10
|
-
|
|
93
|
+
// Accept cookie consent if present
|
|
94
|
+
await acceptCookies(page)
|
|
95
|
+
await new Promise((r) => setTimeout(r, 1000))
|
|
11
96
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
await
|
|
15
|
-
.then((passwordElement) => passwordElement.type(password))
|
|
97
|
+
// Fill in email field
|
|
98
|
+
await fillField(page, 'username webauthn')
|
|
99
|
+
await new Promise((r) => setTimeout(r, 500))
|
|
16
100
|
|
|
17
|
-
|
|
101
|
+
// Fill in password field
|
|
102
|
+
await fillField(page, 'current-password')
|
|
103
|
+
await new Promise((r) => setTimeout(r, 500))
|
|
18
104
|
|
|
19
|
-
|
|
20
|
-
.then(async () => {
|
|
21
|
-
const currentUrl = page.url()
|
|
22
|
-
if (currentUrl.includes('/feed') || currentUrl.includes('/mynetwork') || currentUrl.includes('/in/')) {
|
|
23
|
-
logger.info('logged in, redirected to: ' + currentUrl)
|
|
24
|
-
await page.close()
|
|
25
|
-
return
|
|
26
|
-
}
|
|
27
|
-
throw new Error('unexpected redirect: ' + currentUrl)
|
|
28
|
-
})
|
|
29
|
-
.catch(async () => {
|
|
30
|
-
logger.warn('successful login element was not found')
|
|
31
|
-
const emailError = await page.evaluate(() => {
|
|
32
|
-
const e = document.querySelector('div[error-for=username], #error-for-username')
|
|
33
|
-
if (!e) { return false }
|
|
34
|
-
const style = window.getComputedStyle(e)
|
|
35
|
-
return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
|
|
36
|
-
})
|
|
37
|
-
|
|
38
|
-
const passwordError = await page.evaluate(() => {
|
|
39
|
-
const e = document.querySelector('div[error-for=password], #error-for-password')
|
|
40
|
-
if (!e) { return false }
|
|
41
|
-
const style = window.getComputedStyle(e)
|
|
42
|
-
return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
|
|
43
|
-
})
|
|
44
|
-
|
|
45
|
-
const manualChallengeRequested = await page.evaluate(() => {
|
|
46
|
-
const e = document.querySelector('.flow-challenge-content, #challenge, [data-test-id="challenge"]')
|
|
47
|
-
if (!e) { return false }
|
|
48
|
-
const style = window.getComputedStyle(e)
|
|
49
|
-
return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
|
|
50
|
-
})
|
|
51
|
-
|
|
52
|
-
if (emailError) {
|
|
53
|
-
logger.info('wrong username element found')
|
|
54
|
-
return Promise.reject(new Error(`linkedin: invalid username: ${email}`))
|
|
55
|
-
}
|
|
105
|
+
await clickSignIn(page)
|
|
56
106
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
107
|
+
let hadChallenge = false
|
|
108
|
+
|
|
109
|
+
try {
|
|
110
|
+
await page.waitForNavigation({ waitUntil: 'domcontentloaded', timeout: 15000 })
|
|
111
|
+
} catch {
|
|
112
|
+
// Navigation timeout is expected — the page may not navigate if already on login
|
|
113
|
+
}
|
|
61
114
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
115
|
+
// Wait for either the feed/mynetwork or challenge page
|
|
116
|
+
const maxWaitMs = 120000
|
|
117
|
+
const start = Date.now()
|
|
118
|
+
let resolved = false
|
|
119
|
+
while (Date.now() - start < maxWaitMs && !resolved) {
|
|
120
|
+
const currentUrl = page.url()
|
|
121
|
+
if (LOGGED_IN_PATHS.some((p) => currentUrl.includes(p))) {
|
|
122
|
+
logger.info('logged in, redirected to: ' + currentUrl)
|
|
123
|
+
resolved = true
|
|
124
|
+
break
|
|
125
|
+
}
|
|
126
|
+
if (currentUrl.includes('/checkpoint')) {
|
|
127
|
+
if (!hadChallenge) {
|
|
128
|
+
logger.warn('2FA challenge detected, please complete the verification in the browser window (waiting up to 2 minutes)...')
|
|
129
|
+
hadChallenge = true
|
|
65
130
|
}
|
|
131
|
+
await new Promise((r) => setTimeout(r, 2000))
|
|
132
|
+
continue
|
|
133
|
+
}
|
|
134
|
+
await new Promise((r) => setTimeout(r, 500))
|
|
135
|
+
}
|
|
66
136
|
|
|
67
|
-
|
|
68
|
-
|
|
137
|
+
if (!resolved) {
|
|
138
|
+
const finalUrl = page.url()
|
|
139
|
+
logger.warn('successful login element was not found, url: ' + finalUrl)
|
|
140
|
+
|
|
141
|
+
const emailError = await page.evaluate(() => {
|
|
142
|
+
const e = document.querySelector('div[error-for=username], #error-for-username')
|
|
143
|
+
if (!e) { return false }
|
|
144
|
+
const style = window.getComputedStyle(e)
|
|
145
|
+
return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
|
|
146
|
+
})
|
|
147
|
+
|
|
148
|
+
const passwordError = await page.evaluate(() => {
|
|
149
|
+
const e = document.querySelector('div[error-for=password], #error-for-password')
|
|
150
|
+
if (!e) { return false }
|
|
151
|
+
const style = window.getComputedStyle(e)
|
|
152
|
+
return style && style.display !== 'none' && style.visibility !== 'hidden' && style.opacity !== '0'
|
|
69
153
|
})
|
|
154
|
+
|
|
155
|
+
if (emailError) {
|
|
156
|
+
logger.info('wrong username element found')
|
|
157
|
+
await page.close()
|
|
158
|
+
return Promise.reject(new Error(`linkedin: invalid username: ${email}`))
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
if (passwordError) {
|
|
162
|
+
logger.info('wrong password element found')
|
|
163
|
+
await page.close()
|
|
164
|
+
return Promise.reject(new Error('linkedin: invalid password'))
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
logger.error('could not find any element to retrieve a proper error')
|
|
168
|
+
await page.close()
|
|
169
|
+
return Promise.reject(new Error(`${pkg.name} ${pkg.version} login is not working, please report: ${pkg.bugs.url}`))
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
await page.close()
|
|
173
|
+
return { hadChallenge }
|
|
70
174
|
}
|
package/src/openPage.js
CHANGED
package/src/profile/profile.js
CHANGED
|
@@ -10,52 +10,49 @@ const extractProfileData = async (page) => {
|
|
|
10
10
|
return page.evaluate(() => {
|
|
11
11
|
const txt = (el) => el ? (el.textContent || '').trim() : ''
|
|
12
12
|
|
|
13
|
-
const
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
13
|
+
const findSection = (headingText) => {
|
|
14
|
+
const sections = document.querySelectorAll('section')
|
|
15
|
+
for (const s of sections) {
|
|
16
|
+
const h2 = s.querySelector('h2')
|
|
17
|
+
if (h2 && h2.textContent.trim().startsWith(headingText)) return s
|
|
18
|
+
}
|
|
19
|
+
return null
|
|
19
20
|
}
|
|
20
21
|
|
|
21
|
-
|
|
22
|
-
// found after the h2 heading: h2.parent.nextSibling > div > div > div
|
|
23
|
-
const getSectionItems = (viewName) => {
|
|
24
|
-
const section = getSection(viewName)
|
|
22
|
+
const getContentDivs = (section) => {
|
|
25
23
|
if (!section) return []
|
|
26
|
-
const ul = section.querySelector('ul')
|
|
27
|
-
if (ul) return [...ul.querySelectorAll(':scope > li')]
|
|
28
24
|
const h2 = section.querySelector('h2')
|
|
29
25
|
if (!h2) return []
|
|
30
|
-
const
|
|
31
|
-
if (!
|
|
32
|
-
|
|
33
|
-
return [...itemDivs].filter(d => d.querySelectorAll('p').length > 0)
|
|
26
|
+
const content = h2.parentElement?.nextElementSibling
|
|
27
|
+
if (!content) return []
|
|
28
|
+
return [...content.querySelectorAll(':scope > div')].filter(d => d.querySelector('p'))
|
|
34
29
|
}
|
|
35
30
|
|
|
36
|
-
// Profile top card
|
|
37
|
-
const
|
|
38
|
-
|
|
39
|
-
const
|
|
40
|
-
|
|
41
|
-
|
|
31
|
+
// Profile top card
|
|
32
|
+
const sections = document.querySelectorAll('section')
|
|
33
|
+
let topSection = null
|
|
34
|
+
for (const s of sections) {
|
|
35
|
+
const h2 = s.querySelector('h2')
|
|
36
|
+
if (h2 && ['0 notifications', 'Suggested for you', 'Analytics', 'Activity'].includes(h2.textContent.trim())) continue
|
|
37
|
+
if (s.textContent.trim().length > 200) { topSection = s; break }
|
|
38
|
+
}
|
|
42
39
|
|
|
43
|
-
const
|
|
40
|
+
const nameH1 = document.querySelector('h1')
|
|
41
|
+
const profileName = nameH1 ? txt(nameH1) : document.title.replace(' | LinkedIn', '').trim()
|
|
44
42
|
let headline = ''
|
|
45
43
|
let location = ''
|
|
46
|
-
|
|
47
44
|
if (topSection) {
|
|
48
45
|
const allP = [...topSection.querySelectorAll('p')]
|
|
49
46
|
headline = txt(allP[0])
|
|
50
47
|
for (const p of allP) {
|
|
51
48
|
const t = txt(p)
|
|
52
|
-
if (t.includes(',') && !t.includes('at ') && !t.includes('\
|
|
49
|
+
if (t.includes(',') && !t.includes('at ') && !t.includes('\u00B7') && !t.includes('follow')) {
|
|
53
50
|
location = t
|
|
54
51
|
break
|
|
55
52
|
}
|
|
56
53
|
}
|
|
57
54
|
}
|
|
58
|
-
|
|
55
|
+
const photoImg = document.querySelector('img[src*="profile-displayphoto"]')
|
|
59
56
|
const profile = {
|
|
60
57
|
name: profileName,
|
|
61
58
|
headline,
|
|
@@ -64,101 +61,141 @@ const extractProfileData = async (page) => {
|
|
|
64
61
|
imageurl: photoImg ? photoImg.getAttribute('src') || '' : ''
|
|
65
62
|
}
|
|
66
63
|
|
|
67
|
-
// Experience
|
|
64
|
+
// Experience
|
|
68
65
|
const positions = []
|
|
69
|
-
const expSection =
|
|
66
|
+
const expSection = findSection('Experience')
|
|
70
67
|
if (expSection) {
|
|
71
|
-
const
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
const
|
|
76
|
-
if (!ul) return
|
|
77
|
-
|
|
78
|
-
// Company info is in p tags before the ul
|
|
79
|
-
const allPs = group.querySelectorAll('p')
|
|
80
|
-
const beforeUlPs = []
|
|
81
|
-
for (const p of allPs) {
|
|
82
|
-
if (ul.contains(p)) break
|
|
83
|
-
beforeUlPs.push(p)
|
|
84
|
-
}
|
|
85
|
-
const companyName = txt(beforeUlPs[0])
|
|
68
|
+
const companyGroups = expSection.querySelectorAll('[componentkey^="entity-collection"]')
|
|
69
|
+
companyGroups.forEach(group => {
|
|
70
|
+
const allP = [...group.querySelectorAll('p')].filter(p => txt(p).length > 0)
|
|
71
|
+
if (allP.length === 0) return
|
|
72
|
+
const companyName = txt(allP[0])
|
|
86
73
|
const companyLink = group.querySelector('a[href*="/company/"]')
|
|
87
74
|
const companyUrl = companyLink ? companyLink.getAttribute('href') : ''
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
75
|
+
|
|
76
|
+
const positionLis = group.querySelectorAll('ul > li')
|
|
77
|
+
if (positionLis.length > 0) {
|
|
78
|
+
positionLis.forEach(li => {
|
|
79
|
+
const liPs = [...li.querySelectorAll('p')].filter(p => txt(p).length > 0)
|
|
80
|
+
if (liPs.length === 0) return
|
|
81
|
+
const descEl = li.querySelector('[data-testid="expandable-text-box"]')
|
|
82
|
+
let dateStr = ''
|
|
83
|
+
for (let i = 1; i < liPs.length; i++) {
|
|
84
|
+
const t = txt(liPs[i])
|
|
85
|
+
if (t.includes('\u00B7') || /\d{4}/.test(t)) { dateStr = t; break }
|
|
86
|
+
}
|
|
87
|
+
const dateRange = dateStr.split('\u00B7')[0].trim()
|
|
88
|
+
const dateParts = dateRange.split(' - ')
|
|
89
|
+
positions.push({
|
|
90
|
+
title: txt(liPs[0]),
|
|
91
|
+
companyName,
|
|
92
|
+
link: companyUrl,
|
|
93
|
+
url: companyUrl,
|
|
94
|
+
location: '',
|
|
95
|
+
description: descEl ? txt(descEl) : '',
|
|
96
|
+
date: dateRange,
|
|
97
|
+
date1: dateParts[0] ? dateParts[0].trim() : '',
|
|
98
|
+
date2: dateParts[1] ? dateParts[1].trim() : ''
|
|
99
|
+
})
|
|
100
|
+
})
|
|
101
|
+
} else {
|
|
102
|
+
const descEl = group.querySelector('[data-testid="expandable-text-box"]')
|
|
103
|
+
let dateStr = ''
|
|
104
|
+
for (let i = 1; i < allP.length; i++) {
|
|
105
|
+
const t = txt(allP[i])
|
|
106
|
+
if (t.includes('\u00B7') || /\d{4}/.test(t.split(' ')[0])) { dateStr = t; break }
|
|
107
|
+
}
|
|
108
|
+
const dateRange = dateStr.split('\u00B7')[0].trim()
|
|
96
109
|
const dateParts = dateRange.split(' - ')
|
|
97
110
|
positions.push({
|
|
98
|
-
title: txt(
|
|
111
|
+
title: txt(allP[3]) || txt(allP[1]),
|
|
99
112
|
companyName,
|
|
100
113
|
link: companyUrl,
|
|
101
114
|
url: companyUrl,
|
|
102
|
-
location:
|
|
115
|
+
location: txt(allP[2]) || '',
|
|
103
116
|
description: descEl ? txt(descEl) : '',
|
|
104
117
|
date: dateRange,
|
|
105
118
|
date1: dateParts[0] ? dateParts[0].trim() : '',
|
|
106
119
|
date2: dateParts[1] ? dateParts[1].trim() : ''
|
|
107
120
|
})
|
|
108
|
-
}
|
|
121
|
+
}
|
|
109
122
|
})
|
|
110
123
|
}
|
|
111
124
|
|
|
112
125
|
// Education
|
|
113
|
-
const
|
|
114
|
-
const
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
return
|
|
126
|
+
const educations = []
|
|
127
|
+
const eduSection = findSection('Education')
|
|
128
|
+
const eduDivs = getContentDivs(eduSection)
|
|
129
|
+
eduDivs.forEach(div => {
|
|
130
|
+
const ps = [...div.querySelectorAll('p')].filter(p => txt(p).length > 0)
|
|
131
|
+
if (ps.length === 0) return
|
|
132
|
+
const schoolLink = div.querySelector('a[href*="/school/"]')
|
|
133
|
+
const url = schoolLink ? schoolLink.getAttribute('href') : ''
|
|
134
|
+
const dateStr = txt(ps[2]) || ''
|
|
135
|
+
const dateParts = dateStr.split('\u2013')
|
|
136
|
+
educations.push({
|
|
119
137
|
title: txt(ps[0]),
|
|
120
|
-
degree: txt(ps[1]),
|
|
121
|
-
fieldOfStudy: txt(ps[1]),
|
|
122
|
-
url
|
|
123
|
-
date1:
|
|
124
|
-
date2:
|
|
138
|
+
degree: txt(ps[1]) || '',
|
|
139
|
+
fieldOfStudy: txt(ps[1]) || '',
|
|
140
|
+
url,
|
|
141
|
+
date1: dateParts[0] ? dateParts[0].trim() : '',
|
|
142
|
+
date2: dateParts[1] ? dateParts[1].trim() : '',
|
|
125
143
|
description: ''
|
|
126
|
-
}
|
|
144
|
+
})
|
|
127
145
|
})
|
|
128
146
|
|
|
129
147
|
// Skills
|
|
130
|
-
const
|
|
131
|
-
const
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
148
|
+
const skills = []
|
|
149
|
+
const skillsSection = findSection('Skills')
|
|
150
|
+
if (skillsSection) {
|
|
151
|
+
const allSectionP = [...skillsSection.querySelectorAll('p')].filter(p => txt(p).length > 0)
|
|
152
|
+
for (let i = 0; i < allSectionP.length; i += 2) {
|
|
153
|
+
const name = txt(allSectionP[i])
|
|
154
|
+
if (name && !name.includes('Show all') && !name.includes('Private')) {
|
|
155
|
+
skills.push({ title: name, count: '' })
|
|
156
|
+
}
|
|
157
|
+
}
|
|
158
|
+
}
|
|
135
159
|
|
|
136
160
|
// Languages
|
|
137
|
-
const
|
|
138
|
-
const
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
161
|
+
const languages = []
|
|
162
|
+
const langSection = findSection('Languages')
|
|
163
|
+
if (langSection) {
|
|
164
|
+
const langDivs = getContentDivs(langSection)
|
|
165
|
+
langDivs.forEach(div => {
|
|
166
|
+
const ps = [...div.querySelectorAll('p')].filter(p => txt(p).length > 0)
|
|
167
|
+
for (let i = 0; i < ps.length; i += 2) {
|
|
168
|
+
const n = txt(ps[i])
|
|
169
|
+
const pr = ps[i + 1] ? txt(ps[i + 1]) : ''
|
|
170
|
+
if (n) languages.push({ name: n, proficiency: pr })
|
|
171
|
+
}
|
|
172
|
+
})
|
|
173
|
+
}
|
|
142
174
|
|
|
143
175
|
// Projects
|
|
144
|
-
const
|
|
145
|
-
const
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
const
|
|
149
|
-
return
|
|
176
|
+
const projects = []
|
|
177
|
+
const projSection = findSection('Projects')
|
|
178
|
+
const projDivs = getContentDivs(projSection)
|
|
179
|
+
projDivs.forEach(div => {
|
|
180
|
+
const ps = [...div.querySelectorAll('p')].filter(p => txt(p).length > 0)
|
|
181
|
+
if (ps.length === 0) return
|
|
182
|
+
const descEl = div.querySelector('[data-testid="expandable-text-box"]')
|
|
183
|
+
const link = div.querySelector('a[href*="http"]')
|
|
184
|
+
projects.push({
|
|
150
185
|
name: txt(ps[0]),
|
|
151
|
-
date: txt(ps[1]),
|
|
186
|
+
date: txt(ps[1]) || '',
|
|
152
187
|
description: descEl ? txt(descEl) : '',
|
|
153
188
|
link: link ? link.getAttribute('href') : ''
|
|
154
|
-
}
|
|
189
|
+
})
|
|
155
190
|
})
|
|
156
191
|
|
|
157
192
|
// Certifications
|
|
158
|
-
const
|
|
159
|
-
const
|
|
160
|
-
|
|
161
|
-
|
|
193
|
+
const accomplishments = []
|
|
194
|
+
const certSection = findSection('Licenses')
|
|
195
|
+
const certDivs = getContentDivs(certSection)
|
|
196
|
+
certDivs.forEach(div => {
|
|
197
|
+
const ps = [...div.querySelectorAll('p')].filter(p => txt(p).length > 0)
|
|
198
|
+
if (ps.length > 0) accomplishments.push({ title: txt(ps[0]), count: '', items: [] })
|
|
162
199
|
})
|
|
163
200
|
|
|
164
201
|
return {
|
|
@@ -184,9 +221,27 @@ module.exports = async (browser, cookies, url, waitTimeToScrapMs = 500, hasToGet
|
|
|
184
221
|
|
|
185
222
|
const page = await openPage({ browser, cookies, url, puppeteerAuthenticate })
|
|
186
223
|
|
|
187
|
-
//
|
|
224
|
+
// Check for authwall (expired session)
|
|
225
|
+
const isAuthwall = await page.evaluate(() =>
|
|
226
|
+
window.location.href.includes('/authwall') || document.title.toLowerCase().includes('inschrijven')
|
|
227
|
+
)
|
|
228
|
+
if (isAuthwall) {
|
|
229
|
+
await page.close()
|
|
230
|
+
throw new Error('authwall: LinkedIn session expired, re-authentication required')
|
|
231
|
+
}
|
|
232
|
+
|
|
233
|
+
// Accept cookie consent if present
|
|
234
|
+
await page.evaluate(() => {
|
|
235
|
+
const btn = Array.from(document.querySelectorAll('button')).find(
|
|
236
|
+
(b) => b.textContent.trim().toLowerCase() === 'accept'
|
|
237
|
+
)
|
|
238
|
+
if (btn) btn.click()
|
|
239
|
+
})
|
|
240
|
+
await new Promise((r) => setTimeout(r, 2000))
|
|
241
|
+
|
|
242
|
+
// Wait for sections to appear (profile content is SDUI rendered)
|
|
188
243
|
await page.waitForFunction(() => {
|
|
189
|
-
return document.
|
|
244
|
+
return document.querySelectorAll('section h2').length > 1
|
|
190
245
|
}, { timeout: 30000 })
|
|
191
246
|
.catch(() => {
|
|
192
247
|
logger.warn('profile content did not fully render in time')
|
package/src/scrapedin.js
CHANGED
|
@@ -4,7 +4,22 @@ const profile = require('./profile/profile')
|
|
|
4
4
|
const company = require('./company/company')
|
|
5
5
|
const logger = require('./logger')(__filename)
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
/**
 * Writes the browser's LinkedIn cookies to `cookiesPath` as pretty-printed JSON.
 *
 * Best effort: any failure is logged as a warning and never thrown, and
 * nothing is written when no path is given, the browser has no open page, or
 * no cookies are found (so an existing cookie file is not replaced by `[]`).
 *
 * @param {object} browser - Puppeteer-like browser exposing `pages()`.
 * @param {string} [cookiesPath] - Destination file; falsy disables saving.
 * @returns {Promise<void>}
 */
const saveBrowserCookies = async (browser, cookiesPath) => {
  if (!cookiesPath) return
  try {
    const [page] = await browser.pages()
    if (!page) return

    // Without a URL, cookies() only reports cookies for the tab's current
    // URL, and the first tab is not necessarily on linkedin.com. Ask for the
    // LinkedIn cookies explicitly.
    const pageCookies = await page.cookies('https://www.linkedin.com')
    if (pageCookies.length === 0) {
      logger.warn('failed to save cookies: no cookies found')
      return
    }

    const fs = require('fs')
    // The file holds session credentials: keep it readable by the owner only
    // (the mode applies when the file is created).
    await fs.promises.writeFile(cookiesPath, JSON.stringify(pageCookies, null, 2), { mode: 0o600 })
    logger.info('cookies saved to: ' + cookiesPath)
  } catch (e) {
    logger.warn('failed to save cookies: ' + e.message)
  }
}
|
|
21
|
+
|
|
22
|
+
module.exports = async ({ cookies, email, password, isHeadless, hasToLog, hasToGetContactInfo, cookiesPath, puppeteerArgs, puppeteerAuthenticate, endpoint } = { isHeadless: true, hasToLog: false }) => {
|
|
8
23
|
if (!hasToLog) {
|
|
9
24
|
logger.stopLogging()
|
|
10
25
|
}
|
|
@@ -26,7 +41,11 @@ module.exports = async ({ cookies, email, password, isHeadless, hasToLog, hasToG
|
|
|
26
41
|
logger.info('email and password was provided, we\'re going to login...')
|
|
27
42
|
|
|
28
43
|
try {
|
|
29
|
-
await login(browser, email, password, logger)
|
|
44
|
+
const loginResult = await login(browser, email, password, logger)
|
|
45
|
+
// Only save cookies if login fully completed (no 2FA challenge)
|
|
46
|
+
if (loginResult && !loginResult.hadChallenge && cookiesPath) {
|
|
47
|
+
await saveBrowserCookies(browser, cookiesPath)
|
|
48
|
+
}
|
|
30
49
|
} catch (e) {
|
|
31
50
|
if (!endpoint) {
|
|
32
51
|
await browser.close()
|