@mvegter/scrapedin 1.0.28 → 1.0.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/dependabot.yml +11 -11
- package/LICENSE +201 -201
- package/README.md +36 -36
- package/package.json +34 -34
- package/src/company/company.js +35 -35
- package/src/company/companyScraperTemplate.js +30 -30
- package/src/logger.js +29 -29
- package/src/login.js +68 -68
- package/src/openPage.js +33 -33
- package/src/package.js +11 -11
- package/src/profile/cleanProfileData.js +99 -94
- package/src/profile/contactInfo.js +48 -48
- package/src/profile/profile.js +81 -81
- package/src/profile/profileScraperTemplate.js +189 -190
- package/src/profile/scrapAccomplishmentPanel.js +17 -17
- package/src/profile/scrollToPageBottom.js +24 -24
- package/src/profile/seeMoreButtons.js +42 -42
- package/src/scrapSection.js +49 -49
- package/src/scrapedin.js +41 -41
|
@@ -1,42 +1,42 @@
|
|
|
1
|
-
const logger = require('../logger')(__filename)
|
|
2
|
-
// Expandable controls on a profile page. `clickAll` passes each entry's
// `selector` to `page.$$`; `id` is a human-readable label that the code
// visible in this file does not read.
const seeMoreButtons = [
  {
    id: 'SHOW_MORE_ABOUT',
    selector: '#line-clamp-show-more-button'
  },
  {
    id: 'SHOW_MORE_EXPERIENCES',
    selector: '#experience-section .pv-profile-section__see-more-inline'
  },
  {
    id: 'SEE_MORE_EXPERIENCES',
    selector: '#experience-section .inline-show-more-text__button'
  },
  {
    id: 'SHOW_MORE_CERTIFICATIONS',
    selector: '#certifications-section .pv-profile-section__see-more-inline'
  },
  {
    id: 'SHOW_MORE_SKILLS',
    selector: '.pv-skills-section__additional-skills'
  },
  {
    id: 'SEE_MORE_RECOMMENDATIONS',
    selector: '.recommendations-inlining #line-clamp-show-more-button'
  }
]
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
/**
 * Clicks every element matched by each `seeMoreButtons` selector.
 *
 * Selectors are processed in table order and the matched elements are clicked
 * one after another. A rejected click is logged as a warning and skipped, so
 * one failing element never aborts the rest.
 *
 * @param {object} page - object exposing `$$(selector)` (presumably a Puppeteer Page — confirm against callers)
 * @returns {Promise<undefined>} resolves once every click has been attempted
 */
const clickAll = async (page) => {
  for (const { selector } of seeMoreButtons) {
    const handles = await page.$$(selector)

    for (const handle of handles) {
      if (!handle) {
        continue
      }

      await handle.click().catch(() =>
        logger.warn(`couldn't click on ${selector}, it's probably invisible`)
      )
    }
  }
}

module.exports = { clickAll }
|
|
1
|
+
const logger = require('../logger')(__filename)
|
|
2
|
+
// Expandable controls on a profile page. `clickAll` passes each entry's
// `selector` to `page.$$`; `id` is a human-readable label that the code
// visible in this file does not read.
const seeMoreButtons = [
  {
    id: 'SHOW_MORE_ABOUT',
    selector: '#line-clamp-show-more-button'
  },
  {
    id: 'SHOW_MORE_EXPERIENCES',
    selector: '#experience-section .pv-profile-section__see-more-inline'
  },
  {
    id: 'SEE_MORE_EXPERIENCES',
    selector: '#experience-section .inline-show-more-text__button'
  },
  {
    id: 'SHOW_MORE_CERTIFICATIONS',
    selector: '#certifications-section .pv-profile-section__see-more-inline'
  },
  {
    id: 'SHOW_MORE_SKILLS',
    selector: '.pv-skills-section__additional-skills'
  },
  {
    id: 'SEE_MORE_RECOMMENDATIONS',
    selector: '.recommendations-inlining #line-clamp-show-more-button'
  }
]
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
/**
 * Clicks every element matched by each `seeMoreButtons` selector.
 *
 * Selectors are processed in table order and the matched elements are clicked
 * one after another. A rejected click is logged as a warning and skipped, so
 * one failing element never aborts the rest.
 *
 * @param {object} page - object exposing `$$(selector)` (presumably a Puppeteer Page — confirm against callers)
 * @returns {Promise<undefined>} resolves once every click has been attempted
 */
const clickAll = async (page) => {
  for (const { selector } of seeMoreButtons) {
    const handles = await page.$$(selector)

    for (const handle of handles) {
      if (!handle) {
        continue
      }

      await handle.click().catch(() =>
        logger.warn(`couldn't click on ${selector}, it's probably invisible`)
      )
    }
  }
}

module.exports = { clickAll }
|
package/src/scrapSection.js
CHANGED
|
@@ -1,49 +1,49 @@
|
|
|
1
|
-
/**
 * Creates the reducer that scrapes one field of `section` per call.
 *
 * The returned async function awaits the accumulator, looks up
 * `section.fields[fieldKey]` and, when `selector.$` finds a match for the
 * field's selector, stores the scraped value under `fieldKey`. When nothing
 * matches, the accumulator is returned unchanged.
 *
 * @param {object} selector - handle exposing `$`, `$$`, `$eval` and `$$eval` (presumably a Puppeteer ElementHandle — confirm against callers)
 * @param {object} section - template whose `fields` maps each key to a selector string or a descriptor object
 * @returns {function(Promise<object>, string): Promise<object>} reducer suitable for `Array.prototype.reduce`
 */
const scrapSelectorFields = (selector, section) => async (scrapedObjectPromise, fieldKey) => {
  const collected = await scrapedObjectPromise
  const field = section.fields[fieldKey]

  // A field is either a bare selector string or a descriptor carrying `selector`.
  const ownSelector = await field.selector
  const query = ownSelector ? field.selector : field

  const match = await selector.$(query)
  if (!match) {
    return collected
  }

  // The evaluation callbacks below deliberately reference no outer variables.
  // NOTE(review): presumably they are executed in the page context — confirm.
  let value
  if (field.isMultipleFields) {
    switch (field.attribute) {
      case 'href':
        value = await selector.$$eval(query, (nodes) => nodes.map((node) => (node.href ? node.href.trim() : node.innerHTML.trim())))
        break
      case 'src':
        value = await selector.$$eval(query, (nodes) => nodes.map((node) => (node.src ? node.src.trim() : node.innerHTML.trim())))
        break
      default:
        value = await selector.$$eval(query, (nodes) => nodes.map((node) => node.innerText.trim()))
    }
  } else if (field.hasChildrenFields) {
    // Each matched child is scraped with the field itself acting as the section template.
    const children = await selector.$$(field.selector)
    value = await Promise.all(children.map((child) => scrapSelector(child, field)))
  } else if (field.attribute === 'href') {
    value = await selector.$eval(query, (node) => (node && node.href ? node.href.trim() : ''))
  } else if (field.attribute === 'src') {
    value = await selector.$eval(query, (node) => (node && node.src ? node.src.trim() : ''))
  } else {
    value = await selector.$eval(query, (node) => (node && node.innerText ? node.innerText.trim() : ''))
  }

  collected[fieldKey] = value
  return collected
}
|
|
38
|
-
/**
 * Scrapes every field declared in `section.fields` from `selector`.
 *
 * Fields are folded one after another through the `scrapSelectorFields`
 * reducer, starting from a promise of an empty object.
 *
 * @param {object} selector - handle handed to `scrapSelectorFields`
 * @param {object} section - template providing the `fields` map
 * @returns {Promise<object>} object keyed by the field names that were found
 */
const scrapSelector = (selector, section) => {
  const collectField = scrapSelectorFields(selector, section)
  const fieldKeys = Object.keys(section.fields)

  return fieldKeys.reduce(collectField, Promise.resolve({}))
}
|
|
41
|
-
|
|
42
|
-
module.exports = async (page, section) => {
|
|
43
|
-
const sectionSelectors = await page.$$(section.selector)
|
|
44
|
-
|
|
45
|
-
const scrapedPromises = sectionSelectors
|
|
46
|
-
.map((selector) => scrapSelector(selector, section))
|
|
47
|
-
|
|
48
|
-
return Promise.all(scrapedPromises)
|
|
49
|
-
}
|
|
1
|
+
/**
 * Creates the reducer that scrapes one field of `section` per call.
 *
 * The returned async function awaits the accumulator, looks up
 * `section.fields[fieldKey]` and, when `selector.$` finds a match for the
 * field's selector, stores the scraped value under `fieldKey`. When nothing
 * matches, the accumulator is returned unchanged.
 *
 * @param {object} selector - handle exposing `$`, `$$`, `$eval` and `$$eval` (presumably a Puppeteer ElementHandle — confirm against callers)
 * @param {object} section - template whose `fields` maps each key to a selector string or a descriptor object
 * @returns {function(Promise<object>, string): Promise<object>} reducer suitable for `Array.prototype.reduce`
 */
const scrapSelectorFields = (selector, section) => async (scrapedObjectPromise, fieldKey) => {
  const collected = await scrapedObjectPromise
  const field = section.fields[fieldKey]

  // A field is either a bare selector string or a descriptor carrying `selector`.
  const ownSelector = await field.selector
  const query = ownSelector ? field.selector : field

  const match = await selector.$(query)
  if (!match) {
    return collected
  }

  // The evaluation callbacks below deliberately reference no outer variables.
  // NOTE(review): presumably they are executed in the page context — confirm.
  let value
  if (field.isMultipleFields) {
    switch (field.attribute) {
      case 'href':
        value = await selector.$$eval(query, (nodes) => nodes.map((node) => (node.href ? node.href.trim() : node.innerHTML.trim())))
        break
      case 'src':
        value = await selector.$$eval(query, (nodes) => nodes.map((node) => (node.src ? node.src.trim() : node.innerHTML.trim())))
        break
      default:
        value = await selector.$$eval(query, (nodes) => nodes.map((node) => node.innerText.trim()))
    }
  } else if (field.hasChildrenFields) {
    // Each matched child is scraped with the field itself acting as the section template.
    const children = await selector.$$(field.selector)
    value = await Promise.all(children.map((child) => scrapSelector(child, field)))
  } else if (field.attribute === 'href') {
    value = await selector.$eval(query, (node) => (node && node.href ? node.href.trim() : ''))
  } else if (field.attribute === 'src') {
    value = await selector.$eval(query, (node) => (node && node.src ? node.src.trim() : ''))
  } else {
    value = await selector.$eval(query, (node) => (node && node.innerText ? node.innerText.trim() : ''))
  }

  collected[fieldKey] = value
  return collected
}
|
|
38
|
+
/**
 * Scrapes every field declared in `section.fields` from `selector`.
 *
 * Fields are folded one after another through the `scrapSelectorFields`
 * reducer, starting from a promise of an empty object.
 *
 * @param {object} selector - handle handed to `scrapSelectorFields`
 * @param {object} section - template providing the `fields` map
 * @returns {Promise<object>} object keyed by the field names that were found
 */
const scrapSelector = (selector, section) => {
  const collectField = scrapSelectorFields(selector, section)
  const fieldKeys = Object.keys(section.fields)

  return fieldKeys.reduce(collectField, Promise.resolve({}))
}
|
|
41
|
+
|
|
42
|
+
module.exports = async (page, section) => {
|
|
43
|
+
const sectionSelectors = await page.$$(section.selector)
|
|
44
|
+
|
|
45
|
+
const scrapedPromises = sectionSelectors
|
|
46
|
+
.map((selector) => scrapSelector(selector, section))
|
|
47
|
+
|
|
48
|
+
return Promise.all(scrapedPromises)
|
|
49
|
+
}
|
package/src/scrapedin.js
CHANGED
|
@@ -1,41 +1,41 @@
|
|
|
1
|
-
const puppeteer = require('puppeteer')
|
|
2
|
-
const login = require('./login')
|
|
3
|
-
const profile = require('./profile/profile')
|
|
4
|
-
const company = require('./company/company')
|
|
5
|
-
const logger = require('./logger')(__filename)
|
|
6
|
-
|
|
7
|
-
module.exports = async ({ cookies, email, password, isHeadless, hasToLog, hasToGetContactInfo, puppeteerArgs, puppeteerAuthenticate, endpoint } = { isHeadless: true, hasToLog: false }) => {
|
|
8
|
-
if (!hasToLog) {
|
|
9
|
-
logger.stopLogging()
|
|
10
|
-
}
|
|
11
|
-
logger.info('initializing')
|
|
12
|
-
|
|
13
|
-
let browser
|
|
14
|
-
if (endpoint) {
|
|
15
|
-
browser = await puppeteer.connect({
|
|
16
|
-
browserWSEndpoint: endpoint
|
|
17
|
-
})
|
|
18
|
-
} else {
|
|
19
|
-
const args = Object.assign({ headless: isHeadless, args: ['--no-sandbox'] }, puppeteerArgs)
|
|
20
|
-
browser = await puppeteer.launch(args)
|
|
21
|
-
}
|
|
22
|
-
|
|
23
|
-
if (cookies) {
|
|
24
|
-
logger.info('using cookies, login will be bypassed')
|
|
25
|
-
} else if (email && password) {
|
|
26
|
-
logger.info('email and password was provided, we\'re going to login...')
|
|
27
|
-
|
|
28
|
-
try {
|
|
29
|
-
await login(browser, email, password, logger)
|
|
30
|
-
} catch (e) {
|
|
31
|
-
if (!endpoint) {
|
|
32
|
-
await browser.close()
|
|
33
|
-
}
|
|
34
|
-
throw e
|
|
35
|
-
}
|
|
36
|
-
} else {
|
|
37
|
-
logger.warn('email/password and cookies wasn\'t provided, only public data will be collected')
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
return (url, waitMs) => url.includes('/school/') || url.includes('/company/') ? company(browser, cookies, url, waitMs, hasToGetContactInfo, puppeteerAuthenticate) : profile(browser, cookies, url, waitMs, hasToGetContactInfo, puppeteerAuthenticate)
|
|
41
|
-
}
|
|
1
|
+
const puppeteer = require('puppeteer')
|
|
2
|
+
const login = require('./login')
|
|
3
|
+
const profile = require('./profile/profile')
|
|
4
|
+
const company = require('./company/company')
|
|
5
|
+
const logger = require('./logger')(__filename)
|
|
6
|
+
|
|
7
|
+
module.exports = async ({ cookies, email, password, isHeadless, hasToLog, hasToGetContactInfo, puppeteerArgs, puppeteerAuthenticate, endpoint } = { isHeadless: true, hasToLog: false }) => {
|
|
8
|
+
if (!hasToLog) {
|
|
9
|
+
logger.stopLogging()
|
|
10
|
+
}
|
|
11
|
+
logger.info('initializing')
|
|
12
|
+
|
|
13
|
+
let browser
|
|
14
|
+
if (endpoint) {
|
|
15
|
+
browser = await puppeteer.connect({
|
|
16
|
+
browserWSEndpoint: endpoint
|
|
17
|
+
})
|
|
18
|
+
} else {
|
|
19
|
+
const args = Object.assign({ headless: isHeadless, args: ['--no-sandbox'] }, puppeteerArgs)
|
|
20
|
+
browser = await puppeteer.launch(args)
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
if (cookies) {
|
|
24
|
+
logger.info('using cookies, login will be bypassed')
|
|
25
|
+
} else if (email && password) {
|
|
26
|
+
logger.info('email and password was provided, we\'re going to login...')
|
|
27
|
+
|
|
28
|
+
try {
|
|
29
|
+
await login(browser, email, password, logger)
|
|
30
|
+
} catch (e) {
|
|
31
|
+
if (!endpoint) {
|
|
32
|
+
await browser.close()
|
|
33
|
+
}
|
|
34
|
+
throw e
|
|
35
|
+
}
|
|
36
|
+
} else {
|
|
37
|
+
logger.warn('email/password and cookies wasn\'t provided, only public data will be collected')
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
return (url, waitMs) => url.includes('/school/') || url.includes('/company/') ? company(browser, cookies, url, waitMs, hasToGetContactInfo, puppeteerAuthenticate) : profile(browser, cookies, url, waitMs, hasToGetContactInfo, puppeteerAuthenticate)
|
|
41
|
+
}
|