@mvegter/scrapedin 1.0.26 → 1.0.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,42 +1,42 @@
1
- const logger = require('../logger')(__filename)
2
- const seeMoreButtons = [
3
- {
4
- id: 'SHOW_MORE_ABOUT',
5
- selector: '#line-clamp-show-more-button'
6
- },{
7
- id: 'SHOW_MORE_EXPERIENCES',
8
- selector: '#experience-section .pv-profile-section__see-more-inline'
9
- },{
10
- id: 'SEE_MORE_EXPERIENCES',
11
- selector: '#experience-section .inline-show-more-text__button'
12
- },{
13
- id: 'SHOW_MORE_CERTIFICATIONS',
14
- selector: '#certifications-section .pv-profile-section__see-more-inline'
15
- },{
16
- id: 'SHOW_MORE_SKILLS',
17
- selector: '.pv-skills-section__additional-skills'
18
- },{
19
- id: 'SEE_MORE_RECOMMENDATIONS',
20
- selector: '.recommendations-inlining #line-clamp-show-more-button'
21
- }
22
- ]
23
-
24
-
25
- const clickAll = async(page) => {
26
- for(let i = 0; i < seeMoreButtons.length; i++){
27
- const button = seeMoreButtons[i]
28
- const elems = await page.$$(button.selector)
29
-
30
- for(let j = 0; j < elems.length; j++){
31
- const elem = elems[j]
32
- if (elem) {
33
- await elem.click()
34
- .catch((e) => logger.warn(`couldn't click on ${button.selector}, it's probably invisible`))
35
- }
36
- }
37
- }
38
-
39
- return
40
- }
41
-
42
- module.exports = { clickAll }
1
+ const logger = require('../logger')(__filename)
2
+ const seeMoreButtons = [
3
+ {
4
+ id: 'SHOW_MORE_ABOUT',
5
+ selector: '#line-clamp-show-more-button'
6
+ },{
7
+ id: 'SHOW_MORE_EXPERIENCES',
8
+ selector: '#experience-section .pv-profile-section__see-more-inline'
9
+ },{
10
+ id: 'SEE_MORE_EXPERIENCES',
11
+ selector: '#experience-section .inline-show-more-text__button'
12
+ },{
13
+ id: 'SHOW_MORE_CERTIFICATIONS',
14
+ selector: '#certifications-section .pv-profile-section__see-more-inline'
15
+ },{
16
+ id: 'SHOW_MORE_SKILLS',
17
+ selector: '.pv-skills-section__additional-skills'
18
+ },{
19
+ id: 'SEE_MORE_RECOMMENDATIONS',
20
+ selector: '.recommendations-inlining #line-clamp-show-more-button'
21
+ }
22
+ ]
23
+
24
+
25
+ const clickAll = async(page) => {
26
+ for(let i = 0; i < seeMoreButtons.length; i++){
27
+ const button = seeMoreButtons[i]
28
+ const elems = await page.$$(button.selector)
29
+
30
+ for(let j = 0; j < elems.length; j++){
31
+ const elem = elems[j]
32
+ if (elem) {
33
+ await elem.click()
34
+ .catch((e) => logger.warn(`couldn't click on ${button.selector}, it's probably invisible`))
35
+ }
36
+ }
37
+ }
38
+
39
+ return
40
+ }
41
+
42
+ module.exports = { clickAll }
@@ -1,49 +1,49 @@
1
- const scrapSelectorFields = (selector, section) => async (scrapedObjectPromise, fieldKey) => {
2
- const scrapedObject = await scrapedObjectPromise
3
- const field = section.fields[fieldKey]
4
-
5
- // currently field can be a selector string, or an object containing a selector field
6
- const fieldSelectorString = await field.selector
7
- ? field.selector
8
- : field
9
-
10
- const isFieldPresent = await selector.$(fieldSelectorString)
11
-
12
- if (!isFieldPresent) { return scrapedObject }
13
-
14
- if (field.isMultipleFields) {
15
- if (field.attribute === 'href') {
16
- scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.href ? elem.href.trim() : elem.innerHTML.trim()))
17
- } else if(field.attribute === 'src'){
18
- scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.src ? elem.src.trim() : elem.innerHTML.trim()))
19
- }else{
20
- scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.innerText.trim()))
21
- }
22
- } else if (field.hasChildrenFields) {
23
- const fieldChildrenSelectors = await selector.$$(field.selector)
24
-
25
- scrapedObject[fieldKey] = await Promise.all(
26
- fieldChildrenSelectors.map((s) => scrapSelector(s, field))
27
- )
28
- } else if (field.attribute && field.attribute === 'href') {
29
- scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem && elem.href ? elem.href.trim() : '')
30
- } else if (field.attribute && field.attribute === 'src') {
31
- scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem && elem.src ? elem.src.trim() : '')
32
- } else {
33
- scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem && elem.innerText ? elem.innerText.trim() : '')
34
- }
35
-
36
- return scrapedObject
37
- }
38
- const scrapSelector = (selector, section) =>
39
- Object.keys(section.fields)
40
- .reduce(scrapSelectorFields(selector, section), Promise.resolve({}))
41
-
42
- module.exports = async (page, section) => {
43
- const sectionSelectors = await page.$$(section.selector)
44
-
45
- const scrapedPromises = sectionSelectors
46
- .map((selector) => scrapSelector(selector, section))
47
-
48
- return Promise.all(scrapedPromises)
49
- }
1
+ const scrapSelectorFields = (selector, section) => async (scrapedObjectPromise, fieldKey) => {
2
+ const scrapedObject = await scrapedObjectPromise
3
+ const field = section.fields[fieldKey]
4
+
5
+ // currently field can be a selector string, or an object containing a selector field
6
+ const fieldSelectorString = await field.selector
7
+ ? field.selector
8
+ : field
9
+
10
+ const isFieldPresent = await selector.$(fieldSelectorString)
11
+
12
+ if (!isFieldPresent) { return scrapedObject }
13
+
14
+ if (field.isMultipleFields) {
15
+ if (field.attribute === 'href') {
16
+ scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.href ? elem.href.trim() : elem.innerHTML.trim()))
17
+ } else if (field.attribute === 'src') {
18
+ scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.src ? elem.src.trim() : elem.innerHTML.trim()))
19
+ } else {
20
+ scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.innerText.trim()))
21
+ }
22
+ } else if (field.hasChildrenFields) {
23
+ const fieldChildrenSelectors = await selector.$$(field.selector)
24
+
25
+ scrapedObject[fieldKey] = await Promise.all(
26
+ fieldChildrenSelectors.map((s) => scrapSelector(s, field))
27
+ )
28
+ } else if (field.attribute && field.attribute === 'href') {
29
+ scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem && elem.href ? elem.href.trim() : '')
30
+ } else if (field.attribute && field.attribute === 'src') {
31
+ scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem && elem.src ? elem.src.trim() : '')
32
+ } else {
33
+ scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem && elem.innerText ? elem.innerText.trim() : '')
34
+ }
35
+
36
+ return scrapedObject
37
+ }
38
+ const scrapSelector = (selector, section) =>
39
+ Object.keys(section.fields)
40
+ .reduce(scrapSelectorFields(selector, section), Promise.resolve({}))
41
+
42
+ module.exports = async (page, section) => {
43
+ const sectionSelectors = await page.$$(section.selector)
44
+
45
+ const scrapedPromises = sectionSelectors
46
+ .map((selector) => scrapSelector(selector, section))
47
+
48
+ return Promise.all(scrapedPromises)
49
+ }
package/src/scrapedin.js CHANGED
@@ -1,41 +1,41 @@
1
- const puppeteer = require('puppeteer')
2
- const login = require('./login')
3
- const profile = require('./profile/profile')
4
- const company = require('./company/company')
5
- const logger = require('./logger')(__filename)
6
-
7
- module.exports = async ({ cookies, email, password, isHeadless, hasToLog, hasToGetContactInfo, puppeteerArgs, puppeteerAuthenticate, endpoint } = { isHeadless: true, hasToLog: false }) => {
8
- if (!hasToLog) {
9
- logger.stopLogging()
10
- }
11
- logger.info('initializing')
12
-
13
- let browser;
14
- if(endpoint){
15
- browser = await puppeteer.connect({
16
- browserWSEndpoint: endpoint,
17
- });
18
- }else{
19
- const args = Object.assign({ headless: isHeadless, args: ['--no-sandbox'] }, puppeteerArgs)
20
- browser = await puppeteer.launch(args)
21
- }
22
-
23
- if (cookies) {
24
- logger.info('using cookies, login will be bypassed')
25
- } else if (email && password) {
26
- logger.info('email and password was provided, we\'re going to login...')
27
-
28
- try {
29
- await login(browser, email, password, logger)
30
- } catch (e) {
31
- if(!endpoint){
32
- await browser.close()
33
- }
34
- throw e
35
- }
36
- } else {
37
- logger.warn('email/password and cookies wasn\'t provided, only public data will be collected')
38
- }
39
-
40
- return (url, waitMs) => url.includes('/school/') || url.includes('/company/') ? company(browser, cookies, url, waitMs, hasToGetContactInfo, puppeteerAuthenticate) :profile(browser, cookies, url, waitMs, hasToGetContactInfo, puppeteerAuthenticate)
41
- }
1
+ const puppeteer = require('puppeteer')
2
+ const login = require('./login')
3
+ const profile = require('./profile/profile')
4
+ const company = require('./company/company')
5
+ const logger = require('./logger')(__filename)
6
+
7
+ module.exports = async ({ cookies, email, password, isHeadless, hasToLog, hasToGetContactInfo, puppeteerArgs, puppeteerAuthenticate, endpoint } = { isHeadless: true, hasToLog: false }) => {
8
+ if (!hasToLog) {
9
+ logger.stopLogging()
10
+ }
11
+ logger.info('initializing')
12
+
13
+ let browser
14
+ if (endpoint) {
15
+ browser = await puppeteer.connect({
16
+ browserWSEndpoint: endpoint
17
+ })
18
+ } else {
19
+ const args = Object.assign({ headless: isHeadless, args: ['--no-sandbox'] }, puppeteerArgs)
20
+ browser = await puppeteer.launch(args)
21
+ }
22
+
23
+ if (cookies) {
24
+ logger.info('using cookies, login will be bypassed')
25
+ } else if (email && password) {
26
+ logger.info('email and password was provided, we\'re going to login...')
27
+
28
+ try {
29
+ await login(browser, email, password, logger)
30
+ } catch (e) {
31
+ if (!endpoint) {
32
+ await browser.close()
33
+ }
34
+ throw e
35
+ }
36
+ } else {
37
+ logger.warn('email/password and cookies wasn\'t provided, only public data will be collected')
38
+ }
39
+
40
+ return (url, waitMs) => url.includes('/school/') || url.includes('/company/') ? company(browser, cookies, url, waitMs, hasToGetContactInfo, puppeteerAuthenticate) : profile(browser, cookies, url, waitMs, hasToGetContactInfo, puppeteerAuthenticate)
41
+ }
package/.travis.yml DELETED
@@ -1,5 +0,0 @@
1
- language: node_js
2
- node_js:
3
- - "7.6"
4
- script:
5
- - npm test
@@ -1,338 +0,0 @@
1
- const faker = require('faker')
2
- const { expect } = require('chai')
3
- const profile = require('./profile/profile')
4
- const logger = require('./logger')(__filename)
5
- const { mock, match } = require('sinon')
6
- const profileScraperTemplate = require('./profile/profileScraperTemplate')
7
- const url = faker.internet.url()
8
- const fakeEvalResult = faker.lorem.words(1)
9
-
10
- // Make the linter happy.
11
- var mocha = require('mocha')
12
- var it = mocha.it
13
-
14
- logger.stopLogging()
15
-
16
- it('should get complete profile', async () => {
17
- const browserMock = prepareBrowserMock()
18
- const result = await profile(browserMock, [], url, 0)
19
- const expectedResult = {
20
- aboutAlternative: {
21
- text: fakeEvalResult
22
- },
23
- aboutLegacy: {
24
- text: fakeEvalResult
25
- },
26
- accomplishments: [
27
- {
28
- count: fakeEvalResult,
29
- items: [fakeEvalResult],
30
- title: fakeEvalResult
31
- }
32
- ],
33
- contact: {},
34
- courses: [
35
- {
36
- name: fakeEvalResult,
37
- year: fakeEvalResult
38
- }
39
- ],
40
- educations: [
41
- {
42
- date1: fakeEvalResult,
43
- date2: fakeEvalResult,
44
- degree: fakeEvalResult,
45
- fieldOfStudy: fakeEvalResult,
46
- url: fakeEvalResult,
47
- title: fakeEvalResult
48
- }
49
- ],
50
- languages: [
51
- {
52
- name: fakeEvalResult,
53
- proficiency: fakeEvalResult
54
- }
55
- ],
56
- peopleAlsoViewed: [
57
- {
58
- text: fakeEvalResult,
59
- user: fakeEvalResult
60
- }
61
- ],
62
- positions: [
63
- {
64
- companyName: fakeEvalResult,
65
- date1: fakeEvalResult,
66
- date2: fakeEvalResult,
67
- description: fakeEvalResult,
68
- link: fakeEvalResult,
69
- location: fakeEvalResult,
70
- roles: [
71
- {
72
- date1: fakeEvalResult,
73
- date2: fakeEvalResult,
74
- description: fakeEvalResult,
75
- location: fakeEvalResult,
76
- title: fakeEvalResult
77
- }
78
- ],
79
- title: fakeEvalResult,
80
- url: fakeEvalResult
81
- }
82
- ],
83
- profile: {
84
- connections: fakeEvalResult,
85
- headline: fakeEvalResult,
86
- location: fakeEvalResult,
87
- name: fakeEvalResult,
88
- summary: fakeEvalResult
89
- },
90
- profileAlternative: {
91
- connections: fakeEvalResult,
92
- headline: fakeEvalResult,
93
- imageurl: fakeEvalResult,
94
- location: fakeEvalResult,
95
- name: fakeEvalResult
96
- },
97
- profileLegacy: {
98
- connections: fakeEvalResult,
99
- headline: fakeEvalResult,
100
- location: fakeEvalResult,
101
- name: fakeEvalResult,
102
- summary: fakeEvalResult
103
- },
104
- projects: [
105
- {
106
- date: fakeEvalResult,
107
- description: fakeEvalResult,
108
- link: fakeEvalResult,
109
- name: fakeEvalResult
110
- }
111
- ],
112
- recommendations: {
113
- given: [
114
- {
115
- text: fakeEvalResult,
116
- user: fakeEvalResult
117
- }
118
- ],
119
- givenCount: '',
120
- received: [
121
- {
122
- text: fakeEvalResult,
123
- user: fakeEvalResult
124
- }
125
- ],
126
- receivedCount: ''
127
- },
128
- skills: [
129
- {
130
- count: fakeEvalResult,
131
- title: fakeEvalResult
132
- }
133
- ],
134
- volunteerExperience: [
135
- {
136
- date1: fakeEvalResult,
137
- date2: fakeEvalResult,
138
- description: fakeEvalResult,
139
- experience: fakeEvalResult,
140
- location: fakeEvalResult,
141
- title: fakeEvalResult
142
- }
143
- ]
144
- }
145
-
146
- expect(result).to.deep.equals(expectedResult)
147
- })
148
-
149
- it('should get an incomplete profile', async () => {
150
- const browser = prepareBrowserMock(true)
151
-
152
- const result = await profile(browser, [], url, 0)
153
- const expectedResult = {
154
- aboutAlternative: {
155
- text: ''
156
- },
157
- aboutLegacy: {
158
- text: ''
159
- },
160
- accomplishments: [
161
- {
162
- count: '',
163
- items: [fakeEvalResult],
164
- title: ''
165
- }
166
- ],
167
- contact: {},
168
- courses: [{}],
169
- educations: [
170
- {
171
- date1: '',
172
- date2: '',
173
- degree: '',
174
- fieldOfStudy: '',
175
- url: ''
176
- }
177
- ],
178
- languages: [
179
- {
180
- name: undefined,
181
- proficiency: ''
182
- }
183
- ],
184
- peopleAlsoViewed: [
185
- {
186
- text: '',
187
- user: ''
188
- }
189
- ],
190
- positions: [
191
- {
192
- companyName: '',
193
- date1: '',
194
- date2: '',
195
- description: '',
196
- link: '',
197
- location: '',
198
- roles: [
199
- {
200
- date1: '',
201
- date2: '',
202
- description: '',
203
- location: '',
204
- title: ''
205
- }
206
- ],
207
- url: ''
208
- }
209
- ],
210
- profile: {
211
- connections: '',
212
- headline: '',
213
- location: '',
214
- name: ''
215
- },
216
- profileAlternative: {
217
- connections: '',
218
- headline: '',
219
- imageurl: '',
220
- location: '',
221
- name: ''
222
- },
223
- profileLegacy: {
224
- connections: '',
225
- headline: '',
226
- location: '',
227
- name: ''
228
- },
229
- projects: [
230
- {
231
- date: '',
232
- description: undefined,
233
- link: '',
234
- name: undefined
235
- }
236
- ],
237
- recommendations: {
238
- given: [
239
- {
240
- text: '',
241
- user: ''
242
- }
243
- ],
244
- givenCount: '',
245
- received: [
246
- {
247
- text: '',
248
- user: ''
249
- }
250
- ],
251
- receivedCount: ''
252
- },
253
- skills: [
254
- {
255
- count: '',
256
- title: ''
257
- }
258
- ],
259
- volunteerExperience: [
260
- {
261
- date1: '',
262
- date2: '',
263
- description: '',
264
- experience: '',
265
- location: ''
266
- }
267
- ]
268
- }
269
-
270
- expect(result).to.deep.equals(expectedResult)
271
- })
272
-
273
- const prepareBrowserMock = (isIncompleteProfile) => {
274
- const Page = function () {
275
- this.goto = mock().once().withExactArgs(url).resolves()
276
- this.setUserAgent = mock().once().resolves()
277
- this.setExtraHTTPHeaders = mock().once().resolves()
278
- this.setViewport = mock().once().resolves()
279
- this.waitFor = mock().once().resolves()
280
-
281
- this.evaluate = mock()
282
- .twice()
283
- .withExactArgs(match.func)
284
- .atLeast(1)
285
- .resolves()
286
- this.waitForSelector = mock()
287
- .withExactArgs(match.string, match.object)
288
- .twice()
289
- .onCall(0)
290
- .rejects()
291
- .onCall(1)
292
- .resolves(true)
293
-
294
- this.setCookie = mock().once().withExactArgs().resolves()
295
-
296
- this.click = mock().atLeast(1).withExactArgs().resolves()
297
- this.$$eval = mock()
298
- .withExactArgs(match.string, match.func)
299
- .atLeast(1)
300
- .callsArgWith(1, [{ innerText: fakeEvalResult }])
301
- .resolves([fakeEvalResult])
302
-
303
- this.$eval = mock()
304
- .withExactArgs(match.string, match.func)
305
- .atLeast(1)
306
- .callsArgWith(
307
- 1,
308
- isIncompleteProfile
309
- ? undefined
310
- : {
311
- innerText: fakeEvalResult,
312
- src: fakeEvalResult,
313
- href: fakeEvalResult
314
- }
315
- )
316
- .resolves(isIncompleteProfile ? '' : fakeEvalResult)
317
-
318
- this.close = mock().once().resolves()
319
- }
320
-
321
- Page.prototype.$ = () => new Page()
322
-
323
- if (isIncompleteProfile) {
324
- // I couldn't do that with sinon :(
325
- Page.prototype.$ = (arg) =>
326
- arg === profileScraperTemplate.positions.fields.title
327
- ? undefined
328
- : Promise.resolve(new Page())
329
- }
330
-
331
- Page.prototype.$$ = () => [new Page()]
332
-
333
- const browser = {
334
- newPage: mock().once().withExactArgs().resolves(new Page())
335
- }
336
-
337
- return browser
338
- }