linsc 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,70 @@
1
+ require_relative 'csv_handlers'
2
+
3
+
4
# Merges the per-recruiter LinkedIn export files ("LIN*.csv") found in a
# directory into one CSV, writing a single row per e-mail address.
#
# When the same e-mail appears in several exports, the row kept is the one from
# the recruiter that appears earliest in data/recruiters.txt.
#
# Relies on the CSVHandlers mixin for get_headers, create_file, create_row and
# append_to_csv (defined outside this class).
class Merger
  include CSVHandlers

  # input_dir   - directory holding the LIN*.csv exports; must respond to
  #               #children (i.e. a Pathname).
  # output_path - path of the merged CSV. An existing file is deleted first.
  # mapping     - optional Hash of input header => output header. When given,
  #               only mapped columns are written, under their new names.
  def initialize(input_dir, output_path, mapping = nil)
    @input_dir, @output_path, @mapping = input_dir, output_path, mapping
    recruiter_file = Pathname.new(File.dirname __dir__).realdirpath + '../data/recruiters.txt'
    @recruiters = recruiter_file.read.split(",").collect { |r| r.strip }
    # Match on the file name only, so a "LIN" somewhere in the directory path
    # cannot pull unrelated CSV files into the merge.
    @lin_files = @input_dir.children.select { |fn| fn.basename.to_s.match(/LIN.+\.csv/) }
    # NOTE(review): with no mapping and no LIN files, get_headers receives nil;
    # how it reacts is defined in CSVHandlers -- confirm.
    @headers = mapping ? mapping.values : get_headers(@lin_files.first)
    File.delete(@output_path) if File.exist?(@output_path)
    create_file(@output_path)
  end

  # Reads every LIN file and groups its rows by lower-cased 'E-mail Address'.
  # Each row gets a "Recruiter" field holding the "LIN..." stem of the file
  # name it came from.
  #
  # Returns a Hash of email => Array of CSV::Row.
  # NOTE(review): rows without an e-mail all share the nil key, so they are
  # collapsed into a single output row by #merge -- confirm this is intended.
  def construct_emails_hash
    emails = {}
    @lin_files.each do |pn|
      lin_file = pn.to_s
      # Derive the name from the basename; a dot or "LIN" in a parent directory
      # would otherwise corrupt it.
      base_name = pn.basename.to_s
      recruiter_name = base_name[/LIN[^.]+/] || base_name
      puts "merging #{recruiter_name}"
      clean_file = File.read(lin_file, encoding: 'windows-1252').strip
      CSV.parse(clean_file, headers: true, encoding: 'windows-1252') do |row|
        row["Recruiter"] = recruiter_name
        email = row['E-mail Address']&.downcase
        (emails[email] ||= []) << row
      end
    end
    emails
  end

  # Writes one output row per distinct e-mail to @output_path and returns
  # @output_path.
  def merge
    emails = construct_emails_hash
    total = emails.length
    emails.each_with_index do |(_email, rows), index|
      puts "merging - row #{index + 1}/#{total}"
      chosen = preferred_row(rows)
      if @mapping
        output_row = CSV::Row.new(@headers, [])
        chosen.each do |key, value|
          target = @mapping[key]
          output_row[target] = value&.encode('utf-8') if target
        end
        output_row['Email'] = output_row['Email']&.downcase
      else
        output_row = create_row(chosen, @headers, 'utf-8')
      end
      append_to_csv(@output_path, output_row)
    end
    @output_path
  end

  private

  # Picks the row belonging to the highest-priority recruiter (earliest entry
  # in @recruiters). Falls back to the first row when none of the source files
  # is listed in recruiters.txt, instead of returning nil and crashing #merge.
  def preferred_row(rows)
    owners = rows.map { |row| row['Recruiter'] }
    top_recruiter = @recruiters.find { |rec| owners.include?(rec) }
    rows.find { |row| row['Recruiter'] == top_recruiter } || rows.first
  end
end
@@ -0,0 +1,320 @@
1
# Scrapers that turn a parsed LinkedIn public-profile page into CSV rows.
#
# `page` arguments are queried with #css / #at_css (Nokogiri-style nodes).
# The including object must provide @headers, @education_headers,
# @employment_headers and a #format_date method; none are defined here.
# NOTE(review): #format_date is called with nil whenever a date is missing, as
# the original code already did for open-ended positions -- confirm it
# tolerates nil.
module Parsers
  # Resume field layouts: [css selector, prefix, suffix, skip_when_blank].
  RESUME_TITLE           = ['.item-title', "\n", "\n"].freeze
  RESUME_SUBTITLE        = ['.item-subtitle', '', "\n"].freeze
  RESUME_DASHED_SUBTITLE = ['.item-subtitle', ' - ', "\n", true].freeze
  RESUME_DATES           = ['.date-range', '', "\n"].freeze
  RESUME_LOCATION        = ['.location', '', "\n"].freeze
  RESUME_DESCRIPTION     = ['.description', '', "\n"].freeze
  RESUME_CONTRIBUTORS    = ['.contributors', '', "\n"].freeze

  # Builds the contact row for one profile.
  #
  # input_row - row of the input CSV (indexed by header name); identifying
  #             columns are copied through unchanged.
  # page      - parsed profile page.
  # mode      - unused here; kept so existing callers keep working.
  #
  # Returns a CSV::Row built on @headers.
  def scrape_contact(input_row, page, mode)
    row = CSV::Row.new(@headers, [])
    # A page without a #name node yields an empty name instead of a crash.
    name = page.at_css("#name")&.text&.split || []
    email = input_row["Email"]
    lin_profile = input_row["Linkedin Profile"]
    title = page.at_css(".headline.title")&.text
    country = page.at_css("#demographics .locality")&.text
    sector = page.at_css("#demographics .descriptor:not(.adr)")&.text

    positions = page.css("#experience .positions .position")
    certs = page.css(".certifications .certification")
    schools = page.css("#education .schools .school")

    # Snapshot the column values BEFORE the resume is rendered: rendering
    # rewrites <br> nodes inside descriptions into newlines, and the original
    # column values were taken from the untouched markup.
    jobs = (0..2).map { |i| parsers_position_fields(positions[i]) }
    studies = (0..1).map { |i| parsers_school_fields(schools[i]) }
    c1_name = parsers_node_text(certs[0], ".item-title")
    c2_name = parsers_node_text(certs[1], ".item-title")
    c_type = parsers_node_text(certs[0], ".item-subtitle")

    text_resume = parsers_text_resume(
      page, name, email, lin_profile, title, country, sector, positions, schools, certs
    )

    row["Contact ID"] = input_row["Contact ID"]
    row["LIN ID"] = input_row["LIN ID"]
    row["CV TR"] = "1"
    row["Account Name"] = input_row["Account Name"]
    row["Linkedin Import Status"] = input_row["Linkedin Import Status"]
    row["First Name"] = name[0]&.slice(0, 39)
    row["Last Name"] = name[1..-1]&.join(" ")&.slice(0, 79)
    row["Email"] = email
    row["Candidate ID"] = input_row["Candidate ID"]
    row["LIN 1st Degree"] = input_row["LIN 1st Degree"]
    row["Title"] = title&.slice(0, 127)
    row["Contact Country"] = country
    row["Contact LIN Sector"] = sector&.slice(0, 99)
    jobs.each_with_index do |job, index|
      n = index + 1
      row["Employer #{n} Title"] = job[:title]&.slice(0, 31999)
      row["Employer Organization Name #{n}"] = job[:org]&.slice(0, 254)
      row["Employer #{n} Start Date"] = format_date(job[:start_date])
      row["Employer #{n} End Date"] = format_date(job[:end_date])
      row["Employer #{n} Location"] = job[:location]&.slice(0, 254)
      row["Employer #{n} Description"] = job[:description]&.slice(0, 31999)
    end
    row["License or Certification Name 1"] = c1_name&.slice(0, 254)
    row["License or Certification Name 2"] = c2_name&.slice(0, 254)
    row["License or Certification Credential Type"] = c_type&.slice(0, 254)
    studies.each_with_index do |study, index|
      n = index + 1
      row["Education School #{n}"] = study[:name]&.slice(0, 124)
      row["Education Degree Name #{n}"] = study[:degree]&.slice(0, 254)
      row["Education Degree Date #{n}"] = format_date(study[:end_date])
    end
    row["Text Resume"] = text_resume&.slice(0, 31999)
    row["LinkedIn Profile"] = lin_profile&.slice(0, 254)
    row["Resume Last Updated"] = Time.now.strftime('%Y-%m-%d %H:%M:%S')
    row["LIN Import Date"] = Time.now.strftime('%Y-%m-%d')
    row["CV Uploaded"] = "1"

    row
  end

  # Builds one row per school listed on the profile.
  # Missing title / degree / dates leave the column nil instead of raising.
  #
  # Returns an Array of CSV::Row built on @education_headers.
  def scrape_education(input_row, page)
    page.css("#education .schools .school").map do |school|
      row = CSV::Row.new(@education_headers, [])
      row["Contact"] = input_row["Contact ID"]
      row["LIN ID"] = input_row["LIN ID"]
      row["School Name"] = school.at_css(".item-title")&.text&.slice(0, 149)
      row["Major"] = school.at_css(".item-subtitle")&.text&.slice(0, 254)
      times = school.css(".date-range time")
      # Prefer the end date; fall back to the start date when only one is shown.
      year_node = times[1] || times[0]
      row["Graduation Year"] = year_node&.text&.gsub(/\D/, '')&.slice(0, 74)
      row
    end
  end

  # Builds one row per position listed on the profile.
  # A position with a single date (no end date) or without a location no
  # longer raises; the missing values are passed on as nil.
  #
  # Returns an Array of CSV::Row built on @employment_headers.
  def scrape_employment(input_row, page)
    page.css("#experience .positions .position").map do |position|
      row = CSV::Row.new(@employment_headers, [])
      row["Contact"] = input_row["Contact ID"]
      row["LIN ID"] = input_row["LIN ID"]
      row["Job Title"] = position.at_css(".item-title")&.text&.slice(0, 74)
      row["Employer Name"] = position.at_css(".item-subtitle")&.text&.slice(0, 149)
      times = position.css(".date-range time")
      row["Start Date"] = format_date(times[0]&.text)
      row["End Date"] = format_date(times[1]&.text)
      row["Location"] = position.at_css(".location")&.text&.slice(0, 254)
      row
    end
  end

  private

  # Text of the first descendant matching selector, or nil when node or the
  # descendant is missing.
  def parsers_node_text(node, selector)
    node&.at_css(selector)&.text
  end

  # Text of the index-th <time> inside the node's date range, or nil.
  # Guards the nil node explicitly: `node&.css(...)[i]` would call [] on nil.
  def parsers_date_text(node, index)
    return nil unless node

    node.css(".date-range time")[index]&.text
  end

  # Column values for one position node (all nil when the node is nil).
  def parsers_position_fields(position)
    {
      title: parsers_node_text(position, ".item-title"),
      org: parsers_node_text(position, ".item-subtitle"),
      start_date: parsers_date_text(position, 0),
      end_date: parsers_date_text(position, 1),
      location: parsers_node_text(position, ".location"),
      description: parsers_node_text(position, ".description")
    }
  end

  # Column values for one school node (all nil when the node is nil).
  def parsers_school_fields(school)
    {
      name: parsers_node_text(school, ".item-title"),
      degree: parsers_node_text(school, ".item-subtitle"),
      end_date: parsers_date_text(school, 1)
    }
  end

  # Replaces every <br> under node with a newline so #text keeps line breaks.
  def parsers_breaks_to_newlines(node)
    node.css('br').each { |br| br.replace "\n" } if node
  end

  # Appends a heading (only when nodes is non-empty) followed by the configured
  # fields of every node. Absent fields are skipped.
  def parsers_append_entries(buffer, nodes, heading, fields)
    buffer << heading if nodes.length > 0
    with_description = fields.any? { |field| field[0] == '.description' }
    nodes.each do |node|
      parsers_breaks_to_newlines(node.at_css('.description')) if with_description
      fields.each do |selector, prefix, suffix, skip_blank|
        element = node.at_css(selector)
        next unless element

        content = element.text
        next if skip_blank && content.empty?

        buffer << prefix << content << suffix
      end
    end
  end

  # Appends a comma-separated "pill" list (interests, skills), dropping the
  # "See less" / "See N+" toggle pills. The trailing blank line is always
  # written, even when the list is empty, matching the established format.
  def parsers_append_pills(buffer, pills, heading)
    buffer << heading if pills.length > 0
    labels = pills.map { |pill| pill.at_css(".wrap")&.text }.compact
    labels = labels.reject { |label| label == "See less" || label.match(/See \d+\+/) }
    buffer << "#{labels.join(", ")}\n\n"
  end

  # Appends the languages list as "Name (Proficiency)". Entries without a name
  # are skipped (they used to raise when a proficiency was present).
  def parsers_append_languages(buffer, languages)
    buffer << "\n\nLANGUAGES\n" if languages.length > 0
    labels = languages.map do |language|
      label = language.at_css(".name")&.text
      next unless label

      level = language.at_css(".proficiency")
      level && level.text.length > 0 ? "#{label} (#{level.text})" : label
    end.compact
    buffer << "#{labels.join(", ")}\n\n"
  end

  # Renders the plain-text resume stored in the "Text Resume" column.
  def parsers_text_resume(page, name, email, lin_profile, title, country, sector,
                          positions, schools, certs)
    summary = page.at_css("#summary .description")
    parsers_breaks_to_newlines(summary)

    resume = "\n\n***IMPORTED FROM LINKEDIN***\n#{lin_profile}\n\n"
    resume << name.join(" ")
    resume << "\n#{email}"
    resume << "\nTitle: #{title}" if title
    resume << "\nLocation: #{country}" if country
    resume << "\nSector: #{sector}" if sector
    resume << "\n\nSUMMARY\n#{summary.text}" if summary

    parsers_append_entries(resume, positions, "\n\nEXPERIENCE\n",
                           [RESUME_TITLE, RESUME_DASHED_SUBTITLE, RESUME_DATES,
                            RESUME_LOCATION, RESUME_DESCRIPTION])
    parsers_append_entries(resume, schools, "\n\nEDUCATION\n",
                           [RESUME_TITLE, RESUME_DASHED_SUBTITLE, RESUME_DATES, RESUME_DESCRIPTION])
    parsers_append_entries(resume, certs, "\n\nCERTIFICATIONS\n",
                           [RESUME_TITLE, RESUME_SUBTITLE, RESUME_DATES])
    # The INTERESTS heading has a single leading newline, unlike the others.
    parsers_append_pills(resume, page.css("#interests .pills .interest"), "\nINTERESTS\n")
    parsers_append_pills(resume, page.css("#skills .pills .skill"), "\n\nSKILLS\n")
    parsers_append_languages(resume, page.css("#languages .language"))
    # Project contributors are followed by "\n " (with the space), as before.
    parsers_append_entries(resume, page.css("#projects .project"), "\n\nPROJECTS\n",
                           [RESUME_TITLE, RESUME_DATES, RESUME_DESCRIPTION,
                            ['.contributors', '', "\n "]])
    parsers_append_entries(resume, page.css("#publications .publication"), "\n\nPUBLICATIONS\n",
                           [RESUME_TITLE, RESUME_SUBTITLE, RESUME_DATES,
                            RESUME_DESCRIPTION, RESUME_CONTRIBUTORS])
    parsers_append_entries(resume, page.css("#volunteering .position"), "\n\nVOLUNTEERING\n",
                           [RESUME_TITLE, RESUME_SUBTITLE, RESUME_DATES,
                            ['.cause', 'Cause: ', "\n"], RESUME_DESCRIPTION])
    parsers_append_entries(resume, page.css("#organizations li"), "\n\nORGANIZATIONS\n",
                           [RESUME_TITLE, RESUME_SUBTITLE, RESUME_DATES, RESUME_DESCRIPTION])
    parsers_append_entries(resume, page.css("#patents .patent"), "\n\nPATENTS\n",
                           [RESUME_TITLE, RESUME_SUBTITLE, RESUME_DATES,
                            RESUME_DESCRIPTION, RESUME_CONTRIBUTORS])
    parsers_append_entries(resume, page.css("#awards .award"), "\n\nAWARDS\n",
                           [RESUME_TITLE, RESUME_SUBTITLE, RESUME_DATES, RESUME_DESCRIPTION])
    parsers_append_entries(resume, page.css("#courses li"), "\n\nCOURSES\n",
                           [RESUME_TITLE, ['.courses-list', '', "\n"]])
    resume
  end
end
@@ -0,0 +1,30 @@
1
# A single proxy endpoint together with its credentials, the user agent to
# present through it, and its health state.
#
# status is a String ('good' or 'dead') or nil; last_used is the Time of the
# most recent status change or use.
class Proxy
  attr_accessor :ip, :port, :username, :password, :status, :last_used, :user_agent

  # ip is required; every other attribute is optional. port defaults to 80.
  def initialize(ip:, port: 80, username: nil, password: nil, status: nil, last_used: nil, user_agent: nil)
    @ip = ip
    @port = port
    @username = username
    @password = password
    @status = status
    @last_used = last_used
    # Previously accepted but never stored, so #user_agent always returned nil.
    @user_agent = user_agent
  end

  # Marks the proxy as unusable and stamps the time.
  def dead
    @status = 'dead'
    @last_used = Time.now
  end

  # Marks the proxy as usable and stamps the time.
  def good
    @status = 'good'
    @last_used = Time.now
  end

  # True only when status is exactly 'good'.
  def good?
    @status == 'good'
  end

  # True only when status is exactly 'dead'.
  def dead?
    @status == 'dead'
  end

  # Records that the proxy has just been used, without changing its status.
  def used
    @last_used = Time.now
  end
end
@@ -0,0 +1,42 @@
1
+ require_relative 'proxy'
2
+
3
# Loads the proxy pool from data files and hands out the least recently used
# healthy proxy, waiting out a per-proxy cooldown when necessary.
#
# proxies.txt holds one "ip:port:username:password" entry per line (trailing
# fields optional); agents.txt holds one user-agent string per line.
# NOTE(review): both paths are resolved against the current working directory,
# not this file's location -- confirm the expected launch directory.
class ProxyHandler
  # cooldown_time - minimum number of seconds between two uses of one proxy.
  #
  # Prints a message and exits the process when no proxy is configured.
  def initialize(cooldown_time = 5)
    @cooldown_time = cooldown_time
    @proxy_list = read_entries('./../data/proxies.txt').collect { |entry| entry.split(':') }
    @ua_list = read_entries('./../data/agents.txt')

    @proxies = @proxy_list.map do |ip, port, username, password|
      # Each proxy takes the next user agent; proxies beyond the end of the
      # agent list get nil. An entry without a port uses Proxy's default of 80.
      Proxy.new(ip: ip, port: port || 80, username: username, password: password,
                status: 'good', last_used: Time.now - @cooldown_time,
                user_agent: @ua_list.shift)
    end

    if @proxies.empty?
      puts "proxies.txt is empty! if you don't want to use any proxies, use the -n flag. see docs for more."
      exit
    end
  end

  # Returns the healthy proxy that has been idle the longest, sleeping first if
  # it was used less than cooldown_time seconds ago. Prints a message and exits
  # the process when every proxy is dead.
  def get_proxy
    @good_proxies = @proxies.select { |proxy| proxy.good? }
    if @good_proxies.empty?
      puts "All proxies are dead. Wait a few hours before resuming."
      exit
    end

    best_proxy = @good_proxies.min_by { |proxy| proxy.last_used }
    idle_for = Time.now - best_proxy.last_used
    sleep(@cooldown_time - idle_for) if idle_for < @cooldown_time
    best_proxy
  end

  # Total number of configured proxies, dead ones included.
  def length
    @proxies.length
  end

  private

  # Non-blank lines of the file at path, stripped of surrounding whitespace.
  # Blank lines used to become proxies with a nil ip, and stray "\r" characters
  # ended up in passwords and user agents.
  def read_entries(path)
    File.read(path).split("\n").map { |line| line.strip }.reject { |line| line.empty? }
  end
end