@mvegter/scrapedin 1.0.32 → 1.0.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,42 +1,42 @@
1
- const logger = require('../logger')(__filename)
2
- const seeMoreButtons = [
3
- {
4
- id: 'SHOW_MORE_ABOUT',
5
- selector: '#line-clamp-show-more-button'
6
- },{
7
- id: 'SHOW_MORE_EXPERIENCES',
8
- selector: '#experience-section .pv-profile-section__see-more-inline'
9
- },{
10
- id: 'SEE_MORE_EXPERIENCES',
11
- selector: '#experience-section .inline-show-more-text__button'
12
- },{
13
- id: 'SHOW_MORE_CERTIFICATIONS',
14
- selector: '#certifications-section .pv-profile-section__see-more-inline'
15
- },{
16
- id: 'SHOW_MORE_SKILLS',
17
- selector: '.pv-skills-section__additional-skills'
18
- },{
19
- id: 'SEE_MORE_RECOMMENDATIONS',
20
- selector: '.recommendations-inlining #line-clamp-show-more-button'
21
- }
22
- ]
23
-
24
-
25
- const clickAll = async(page) => {
26
- for(let i = 0; i < seeMoreButtons.length; i++){
27
- const button = seeMoreButtons[i]
28
- const elems = await page.$$(button.selector)
29
-
30
- for(let j = 0; j < elems.length; j++){
31
- const elem = elems[j]
32
- if (elem) {
33
- await elem.click()
34
- .catch((e) => logger.warn(`couldn't click on ${button.selector}, it's probably invisible`))
35
- }
36
- }
37
- }
38
-
39
- return
40
- }
41
-
42
- module.exports = { clickAll }
1
+ const logger = require('../logger')(__filename)
2
// Table of expandable page controls: each entry pairs a descriptive id with
// the CSS selector used to locate that control.
// Written as [id, selector] pairs and expanded into { id, selector } objects.
const seeMoreButtons = [
  ['SHOW_MORE_ABOUT', '#line-clamp-show-more-button'],
  ['SHOW_MORE_EXPERIENCES', '#experience-section .pv-profile-section__see-more-inline'],
  ['SEE_MORE_EXPERIENCES', '#experience-section .inline-show-more-text__button'],
  ['SHOW_MORE_CERTIFICATIONS', '#certifications-section .pv-profile-section__see-more-inline'],
  ['SHOW_MORE_SKILLS', '.pv-skills-section__additional-skills'],
  ['SEE_MORE_RECOMMENDATIONS', '.recommendations-inlining #line-clamp-show-more-button']
].map(([id, selector]) => ({ id, selector }))
23
+
24
+
25
/**
 * Clicks every element matched by each selector listed in `seeMoreButtons`.
 *
 * Selectors are processed in table order and their matches are clicked one at
 * a time. A rejected click is not fatal: it is reported through `logger.warn`
 * and the loop continues with the next element.
 *
 * @param page object exposing `$$(selector)`, which resolves to a list of
 *   elements that each have an async `click()`
 *   (presumably a Puppeteer Page — confirm against the caller)
 * @returns {Promise<void>}
 */
const clickAll = async (page) => {
  for (const { selector } of seeMoreButtons) {
    const reportUnclickable = () =>
      logger.warn(`couldn't click on ${selector}, it's probably invisible`)

    const matches = await page.$$(selector)

    for (const match of matches) {
      // Skip empty slots in the result list.
      if (!match) { continue }

      await match.click().catch(reportUnclickable)
    }
  }
}
41
+
42
+ module.exports = { clickAll }
@@ -1,49 +1,49 @@
1
- const scrapSelectorFields = (selector, section) => async (scrapedObjectPromise, fieldKey) => {
2
- const scrapedObject = await scrapedObjectPromise
3
- const field = section.fields[fieldKey]
4
-
5
- // currently field can be a selector string, or an object containing a selector field
6
- const fieldSelectorString = await field.selector
7
- ? field.selector
8
- : field
9
-
10
- const isFieldPresent = await selector.$(fieldSelectorString)
11
-
12
- if (!isFieldPresent) { return scrapedObject }
13
-
14
- if (field.isMultipleFields) {
15
- if (field.attribute === 'href') {
16
- scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.href ? elem.href.trim() : elem.innerHTML.trim()))
17
- } else if (field.attribute === 'src') {
18
- scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.src ? elem.src.trim() : elem.innerHTML.trim()))
19
- } else {
20
- scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.innerText.trim()))
21
- }
22
- } else if (field.hasChildrenFields) {
23
- const fieldChildrenSelectors = await selector.$$(field.selector)
24
-
25
- scrapedObject[fieldKey] = await Promise.all(
26
- fieldChildrenSelectors.map((s) => scrapSelector(s, field))
27
- )
28
- } else if (field.attribute && field.attribute === 'href') {
29
- scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem && elem.href ? elem.href.trim() : '')
30
- } else if (field.attribute && field.attribute === 'src') {
31
- scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem && elem.src ? elem.src.trim() : '')
32
- } else {
33
- scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem && elem.innerText ? elem.innerText.trim() : '')
34
- }
35
-
36
- return scrapedObject
37
- }
38
- const scrapSelector = (selector, section) =>
39
- Object.keys(section.fields)
40
- .reduce(scrapSelectorFields(selector, section), Promise.resolve({}))
41
-
42
- module.exports = async (page, section) => {
43
- const sectionSelectors = await page.$$(section.selector)
44
-
45
- const scrapedPromises = sectionSelectors
46
- .map((selector) => scrapSelector(selector, section))
47
-
48
- return Promise.all(scrapedPromises)
49
- }
1
/**
 * Builds the reducer that scrapes a single field of `section` out of `selector`.
 *
 * The returned async function has the `(accumulator, key)` shape expected by
 * `Array.prototype.reduce`: it awaits the accumulated object, stores the value
 * scraped for `fieldKey` on it and resolves to that same object.
 *
 * A field definition is either a bare selector string or an object with a
 * `selector` property plus optional flags read here:
 *   - `isMultipleFields`: collect one value per matching element (array)
 *   - `hasChildrenFields`: scrape each match recursively via `scrapSelector`
 *   - `attribute`: `'href'` or `'src'` to read that property instead of text
 *
 * When nothing matches the field selector the accumulator is returned untouched,
 * so the key is simply absent from the result.
 *
 * @param selector object exposing `$`, `$$`, `$eval` and `$$eval`
 *   (presumably a Puppeteer ElementHandle — confirm against the caller)
 * @param section object whose `fields` map holds the field definitions
 * @returns reducer `(scrapedObjectPromise, fieldKey) => Promise<object>`
 */
const scrapSelectorFields = (selector, section) => async (scrapedObjectPromise, fieldKey) => {
  const scrapedObject = await scrapedObjectPromise
  const field = section.fields[fieldKey]

  // currently field can be a selector string, or an object containing a selector field
  // (plain property read, so there is nothing to await)
  const fieldSelectorString = field.selector ? field.selector : field

  const isFieldPresent = await selector.$(fieldSelectorString)

  if (!isFieldPresent) { return scrapedObject }

  // NOTE: the callbacks handed to $eval / $$eval are kept inline and
  // self-contained: they reference nothing from this module's scope.
  if (field.isMultipleFields) {
    if (field.attribute === 'href') {
      scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.href ? elem.href.trim() : elem.innerHTML.trim()))
    } else if (field.attribute === 'src') {
      scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.src ? elem.src.trim() : elem.innerHTML.trim()))
    } else {
      // Same guard as the single-field text branch below: an element without
      // innerText yields '' instead of making the whole scrape reject.
      scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.innerText ? elem.innerText.trim() : ''))
    }
  } else if (field.hasChildrenFields) {
    const fieldChildrenSelectors = await selector.$$(field.selector)

    // The field definition itself acts as the nested section (it carries its own `fields`).
    scrapedObject[fieldKey] = await Promise.all(
      fieldChildrenSelectors.map((s) => scrapSelector(s, field))
    )
  } else if (field.attribute === 'href') {
    scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem && elem.href ? elem.href.trim() : '')
  } else if (field.attribute === 'src') {
    scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem && elem.src ? elem.src.trim() : '')
  } else {
    scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem && elem.innerText ? elem.innerText.trim() : '')
  }

  return scrapedObject
}
38
// Scrapes every field declared in `section.fields` from `selector`.
// The keys are folded, in order, through the reducer built by
// scrapSelectorFields, starting from a promise of an empty object; the result
// is a promise of the object holding all scraped fields.
const scrapSelector = (selector, section) => {
  const fieldKeys = Object.keys(section.fields)
  const scrapField = scrapSelectorFields(selector, section)
  const emptyResult = Promise.resolve({})

  return fieldKeys.reduce(scrapField, emptyResult)
}
41
+
42
+ module.exports = async (page, section) => {
43
+ const sectionSelectors = await page.$$(section.selector)
44
+
45
+ const scrapedPromises = sectionSelectors
46
+ .map((selector) => scrapSelector(selector, section))
47
+
48
+ return Promise.all(scrapedPromises)
49
+ }
package/src/scrapedin.js CHANGED
@@ -1,41 +1,41 @@
1
- const puppeteer = require('puppeteer')
2
- const login = require('./login')
3
- const profile = require('./profile/profile')
4
- const company = require('./company/company')
5
- const logger = require('./logger')(__filename)
6
-
7
- module.exports = async ({ cookies, email, password, isHeadless, hasToLog, hasToGetContactInfo, puppeteerArgs, puppeteerAuthenticate, endpoint } = { isHeadless: true, hasToLog: false }) => {
8
- if (!hasToLog) {
9
- logger.stopLogging()
10
- }
11
- logger.info('initializing')
12
-
13
- let browser
14
- if (endpoint) {
15
- browser = await puppeteer.connect({
16
- browserWSEndpoint: endpoint
17
- })
18
- } else {
19
- const args = Object.assign({ headless: isHeadless, args: ['--no-sandbox'] }, puppeteerArgs)
20
- browser = await puppeteer.launch(args)
21
- }
22
-
23
- if (cookies) {
24
- logger.info('using cookies, login will be bypassed')
25
- } else if (email && password) {
26
- logger.info('email and password was provided, we\'re going to login...')
27
-
28
- try {
29
- await login(browser, email, password, logger)
30
- } catch (e) {
31
- if (!endpoint) {
32
- await browser.close()
33
- }
34
- throw e
35
- }
36
- } else {
37
- logger.warn('email/password and cookies wasn\'t provided, only public data will be collected')
38
- }
39
-
40
- return (url, waitMs) => url.includes('/school/') || url.includes('/company/') ? company(browser, cookies, url, waitMs, hasToGetContactInfo, puppeteerAuthenticate) : profile(browser, cookies, url, waitMs, hasToGetContactInfo, puppeteerAuthenticate)
41
- }
1
+ const puppeteer = require('puppeteer')
2
+ const login = require('./login')
3
+ const profile = require('./profile/profile')
4
+ const company = require('./company/company')
5
+ const logger = require('./logger')(__filename)
6
+
7
+ module.exports = async ({ cookies, email, password, isHeadless, hasToLog, hasToGetContactInfo, puppeteerArgs, puppeteerAuthenticate, endpoint } = { isHeadless: true, hasToLog: false }) => {
8
+ if (!hasToLog) {
9
+ logger.stopLogging()
10
+ }
11
+ logger.info('initializing')
12
+
13
+ let browser
14
+ if (endpoint) {
15
+ browser = await puppeteer.connect({
16
+ browserWSEndpoint: endpoint
17
+ })
18
+ } else {
19
+ const args = Object.assign({ headless: isHeadless, args: ['--no-sandbox'] }, puppeteerArgs)
20
+ browser = await puppeteer.launch(args)
21
+ }
22
+
23
+ if (cookies) {
24
+ logger.info('using cookies, login will be bypassed')
25
+ } else if (email && password) {
26
+ logger.info('email and password was provided, we\'re going to login...')
27
+
28
+ try {
29
+ await login(browser, email, password, logger)
30
+ } catch (e) {
31
+ if (!endpoint) {
32
+ await browser.close()
33
+ }
34
+ throw e
35
+ }
36
+ } else {
37
+ logger.warn('email/password and cookies wasn\'t provided, only public data will be collected')
38
+ }
39
+
40
+ return (url, waitMs) => url.includes('/school/') || url.includes('/company/') ? company(browser, cookies, url, waitMs, hasToGetContactInfo, puppeteerAuthenticate) : profile(browser, cookies, url, waitMs, hasToGetContactInfo, puppeteerAuthenticate)
41
+ }