linsc 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,70 @@
1
+ require_relative 'csv_handlers'
2
+
3
+
4
# Merges the per-recruiter LinkedIn export files (names containing
# "LIN...csv") found in an input directory into one output CSV holding a
# single row per e-mail address.
#
# When the same e-mail address occurs in several exports, the row kept is the
# one belonging to the recruiter listed earliest in data/recruiters.txt.
#
# The helpers get_headers, create_file, create_row and append_to_csv are not
# defined here; they are expected to come from the CSVHandlers mixin.
class Merger
  include CSVHandlers

  # @param input_dir [Pathname] directory holding the LIN*.csv exports
  #   (must respond to #children, so a plain String will not work)
  # @param output_path [String, Pathname] file to (re)create with the merged rows
  # @param mapping [Hash, nil] optional source-header => output-header map;
  #   when given, only mapped columns are written and the output headers are
  #   the map's values
  def initialize(input_dir, output_path, mapping = nil)
    @input_dir, @output_path, @mapping = input_dir, output_path, mapping
    recruiter_file = Pathname.new(File.dirname __dir__).realdirpath + '../data/recruiters.txt'
    @recruiters = recruiter_file.read.split(",").collect { |r| r.strip }
    # Match on the file name only: matching the whole path would also select
    # unrelated .csv files whenever a parent directory contains "LIN".
    @lin_files = @input_dir.children.select { |pn| pn.basename.to_s.match(/LIN.+\.csv/) }
    @headers = mapping ? mapping.values : get_headers(@lin_files.first)
    # Always start from an empty output file.
    File.delete(@output_path) if File.exist?(@output_path)
    create_file(@output_path)
  end

  # Reads every export and groups its rows by lower-cased e-mail address.
  # Each row is tagged with a "Recruiter" field derived from the file name.
  #
  # NOTE(review): rows without an 'E-mail Address' all share the nil key and
  # are therefore treated as duplicates of each other - confirm that is wanted.
  #
  # @return [Hash{String, nil => Array<CSV::Row>}]
  def construct_emails_hash
    emails = {}
    @lin_files.each do |pn|
      # Derive the recruiter from the basename so that "LIN" inside a parent
      # directory name cannot leak into the recruiter name.
      recruiter_name = pn.basename.to_s.match(/LIN[^.]+/)[0]
      puts "merging #{recruiter_name}"
      clean_file = File.read(pn.to_s, encoding: 'windows-1252').strip
      CSV.parse(clean_file, headers: true, encoding: 'windows-1252') do |row|
        row["Recruiter"] = recruiter_name
        email = row['E-mail Address']&.downcase
        (emails[email] ||= []) << row
      end
    end
    emails
  end

  # Writes one output row per distinct e-mail address.
  #
  # @return [String, Pathname] the output path that was written
  def merge
    emails = construct_emails_hash
    total = emails.length
    emails.each_with_index do |(_email, rows), index|
      puts "merging - row #{index + 1}/#{total}"
      correct_row = preferred_row(rows)
      if @mapping
        output_row = CSV::Row.new(@headers, [])
        correct_row.each do |key, value|
          output_row[@mapping[key]] = value&.encode('utf-8') if @mapping[key]
        end
        output_row['Email'] = output_row['Email']&.downcase
      else
        output_row = create_row(correct_row, @headers, 'utf-8')
      end
      append_to_csv(@output_path, output_row)
    end
    @output_path
  end

  private

  # Picks the row owned by the highest-priority recruiter (order of
  # recruiters.txt). Falls back to the first row when none of the owners is
  # listed there, instead of returning nil and crashing the merge.
  def preferred_row(rows)
    owners = rows.collect { |row| row['Recruiter'] }
    winner = @recruiters.find { |recruiter| owners.include?(recruiter) }
    rows.find { |row| row['Recruiter'] == winner } || rows.first
  end
end
@@ -0,0 +1,320 @@
1
+ module Parsers
2
+ def scrape_contact(input_row, page, mode)
3
+ row = CSV::Row.new(@headers, [])
4
+ name = page.at_css("#name")&.text&.split
5
+ contact_id = input_row["Contact ID"]
6
+ lin_id = input_row["LIN ID"]
7
+ cv_tr = input_row["CV TR"]
8
+ acc_name = input_row["Account Name"]
9
+ import_status = input_row["Linkedin Import Status"]
10
+ email = input_row["Email"]
11
+ lin_profile = input_row["Linkedin Profile"]
12
+ cand_id = input_row["Candidate ID"]
13
+ cand_source = input_row["LIN 1st Degree"]
14
+ title = page.at_css(".headline.title")&.text
15
+ country = page.at_css("#demographics .locality")&.text
16
+ sector = page.at_css("#demographics .descriptor:not(.adr)")&.text
17
+
18
+ positions = page.css("#experience .positions .position")
19
+ if positions
20
+ e1_title = positions[0]&.at_css(".item-title")&.text
21
+ e1_org = positions[0]&.at_css(".item-subtitle")&.text
22
+ e1_start = positions[0]&.css(".date-range time")[0]&.text
23
+ e1_end = positions[0]&.css(".date-range time")[1]&.text
24
+ e1_loc = positions[0]&.at_css(".location")&.text
25
+ e1_desc = positions[0]&.at_css(".description")&.text
26
+ e2_title = positions[1]&.at_css(".item-title")&.text
27
+ e2_org = positions[1]&.at_css(".item-subtitle")&.text
28
+ e2_start = positions[1]&.css(".date-range time")[0]&.text
29
+ e2_end = positions[1]&.css(".date-range time")[1]&.text
30
+ e2_loc = positions[1]&.at_css(".location")&.text
31
+ e2_desc = positions[1]&.at_css(".description")&.text
32
+ e3_title = positions[2]&.at_css(".item-title")&.text
33
+ e3_org = positions[2]&.at_css(".item-subtitle")&.text
34
+ e3_start = positions[2]&.css(".date-range time")[0]&.text
35
+ e3_end = positions[2]&.css(".date-range time")[1]&.text
36
+ e3_loc = positions[2]&.at_css(".location")&.text
37
+ e3_desc = positions[2]&.at_css(".description")&.text
38
+ end
39
+
40
+ certs = page.css(".certifications .certification")
41
+ if certs
42
+ c1_name = certs[0]&.at_css(".item-title")&.text
43
+ c2_name = certs[1]&.at_css(".item-title")&.text
44
+ c_type = certs[0]&.at_css(".item-subtitle")&.text
45
+ end
46
+
47
+ schools = page.css("#education .schools .school")
48
+ if schools
49
+ s1_name = schools[0]&.at_css(".item-title")&.text
50
+ s2_name = schools[1]&.at_css(".item-title")&.text
51
+ s1_start = schools[0]&.css(".date-range time")[0]&.text
52
+ s2_start = schools[1]&.css(".date-range time")[0]&.text
53
+ s1_end = schools[0]&.css(".date-range time")[1]&.text
54
+ s2_end = schools[1]&.css(".date-range time")[1]&.text
55
+ s1_degree = schools[0]&.at_css(".item-subtitle")&.text
56
+ s2_degree = schools[1]&.at_css(".item-subtitle")&.text
57
+ end
58
+
59
+ summary = page.at_css("#summary .description")
60
+ summary&.css('br').each{|br| br.replace "\n"} if summary
61
+
62
+ text_resume = "\n\n***IMPORTED FROM LINKEDIN***\n#{lin_profile}\n\n"
63
+ text_resume += name.join(" ")
64
+ text_resume += "\n#{email}"
65
+ text_resume += "\nTitle: #{title}" if title
66
+ text_resume += "\nLocation: #{country}" if country
67
+ text_resume += "\nSector: #{sector}" if sector
68
+ text_resume += "\n\nSUMMARY\n#{summary.text}" if summary
69
+ text_resume += "\n\nEXPERIENCE\n" if positions && positions.length > 0
70
+ positions.each do |position|
71
+ jtitle = position.at_css(".item-title")
72
+ jcompany = position.at_css(".item-subtitle")
73
+ jdates = position.at_css(".date-range")
74
+ jlocation = position.at_css(".location")
75
+ jdesc = position.at_css(".description")
76
+ jdesc.css('br').each{|br| br.replace "\n"} if jdesc
77
+ text_resume += "\n#{jtitle.text}\n" if jtitle
78
+ text_resume += " - #{jcompany.text}\n" if jcompany && jcompany.text.length > 0
79
+ text_resume += "#{jdates.text}\n" if jdates
80
+ text_resume += "#{jlocation.text}\n" if jlocation
81
+ text_resume += "#{jdesc.text}\n" if jdesc
82
+ end
83
+ text_resume += "\n\nEDUCATION\n" if schools && schools.length > 0
84
+ schools.each do |school|
85
+ stitle = school.at_css(".item-title")
86
+ sdegree = school.at_css(".item-subtitle")
87
+ sdates = school.at_css(".date-range")
88
+ sdesc = school.at_css(".description")
89
+ sdesc.css('br').each{|br| br.replace "\n"} if sdesc
90
+ text_resume += "\n#{stitle.text}\n" if stitle
91
+ text_resume += " - #{sdegree.text}\n" if sdegree && sdegree.text.length > 0
92
+ text_resume += "#{sdates.text}\n" if sdates
93
+ text_resume += "#{sdesc.text}\n" if sdesc
94
+ end
95
+ text_resume += "\n\nCERTIFICATIONS\n" if certs && certs.length > 0
96
+ certs.each do |cert|
97
+ ctitle = cert.at_css(".item-title")
98
+ csub = cert.at_css(".item-subtitle")
99
+ cdates = cert.at_css(".date-range")
100
+ text_resume += "\n#{ctitle.text}\n" if ctitle
101
+ text_resume += "#{csub.text}\n" if csub
102
+ text_resume += "#{cdates.text}\n" if cdates
103
+ end
104
+ interests = page.css("#interests .pills .interest")
105
+ text_resume += "\nINTERESTS\n" if interests && interests.length > 0
106
+ ints = []
107
+ interests.each do |interest|
108
+ int = interest.at_css(".wrap")&.text
109
+ if int
110
+ ints << int unless (int == "See less") || (int.match(/See \d+\+/))
111
+ end
112
+ end
113
+ text_resume += "#{ints.join(", ")}\n\n"
114
+ skills = page.css("#skills .pills .skill")
115
+ text_resume += "\n\nSKILLS\n" if skills && skills.length > 0
116
+ sks = []
117
+ skills.each do |skill|
118
+ sk = skill.at_css(".wrap")&.text
119
+ if sk
120
+ sks << sk unless (sk == "See less") || (sk.match(/See \d+\+/))
121
+ end
122
+ end
123
+ text_resume += "#{sks.join(", ")}\n\n"
124
+ languages = page.css("#languages .language")
125
+ text_resume += "\n\nLANGUAGES\n" if languages.length > 0
126
+ langs = []
127
+ languages.each do |language|
128
+ lang = language.at_css(".name")&.text
129
+ prof = language.at_css(".proficiency")
130
+ lang += " (#{prof.text})" if prof && prof.text.length > 0
131
+ langs << lang if lang
132
+ end
133
+ text_resume += "#{langs.join(", ")}\n\n"
134
+ projects = page.css("#projects .project")
135
+ text_resume += "\n\nPROJECTS\n" if projects && projects.length > 0
136
+ projects.each do |project|
137
+ ptitle = project.at_css(".item-title")
138
+ pdates = project.at_css(".date-range")
139
+ pdesc = project.at_css(".description")
140
+ pdesc.css('br').each{|br| br.replace "\n"} if pdesc
141
+ pcont = project.at_css(".contributors")
142
+ text_resume += "\n#{ptitle.text}\n" if ptitle
143
+ text_resume += "#{pdates.text}\n" if pdates
144
+ text_resume += "#{pdesc.text}\n" if pdesc
145
+ text_resume += "#{pcont.text}\n " if pcont
146
+ end
147
+ pubs = page.css("#publications .publication")
148
+ text_resume += "\n\nPUBLICATIONS\n" if pubs && pubs.length > 0
149
+ pubs.each do |pub|
150
+ pubtitle = pub.at_css(".item-title")
151
+ pubsub = pub.at_css(".item-subtitle")
152
+ pubdates = pub.at_css(".date-range")
153
+ pubdesc = pub.at_css(".description")
154
+ pubdesc.css('br').each{|br| br.replace "\n"} if pubdesc
155
+ pubcont = pub.at_css(".contributors")
156
+ text_resume += "\n#{pubtitle.text}\n" if pubtitle
157
+ text_resume += "#{pubsub.text}\n" if pubsub
158
+ text_resume += "#{pubdates.text}\n" if pubdates
159
+ text_resume += "#{pubdesc.text}\n" if pubdesc
160
+ text_resume += "#{pubcont.text}\n" if pubcont
161
+ end
162
+ vols = page.css("#volunteering .position")
163
+ text_resume += "\n\nVOLUNTEERING\n" if vols && vols.length > 0
164
+ vols.each do |vol|
165
+ voltitle = vol.at_css(".item-title")
166
+ volsub = vol.at_css(".item-subtitle")
167
+ voldates = vol.at_css(".date-range")
168
+ voldesc = vol.at_css(".description")
169
+ voldesc.css('br').each{|br| br.replace "\n"} if voldesc
170
+ volcause = vol.at_css(".cause")
171
+ text_resume += "\n#{voltitle.text}\n" if voltitle
172
+ text_resume += "#{volsub.text}\n" if volsub
173
+ text_resume += "#{voldates.text}\n" if voldates
174
+ text_resume += "Cause: #{volcause.text}\n" if volcause
175
+ text_resume += "#{voldesc.text}\n" if voldesc
176
+ end
177
+ orgs = page.css("#organizations li")
178
+ text_resume += "\n\nORGANIZATIONS\n" if orgs && orgs.length > 0
179
+ orgs.each do |org|
180
+ orgtitle = org.at_css(".item-title")
181
+ orgsub = org.at_css(".item-subtitle")
182
+ orgdates = org.at_css(".date-range")
183
+ orgdesc = org.at_css(".description")
184
+ orgdesc.css('br').each{|br| br.replace "\n"} if orgdesc
185
+ text_resume += "\n#{orgtitle.text}\n" if orgtitle
186
+ text_resume += "#{orgsub.text}\n" if orgsub
187
+ text_resume += "#{orgdates.text}\n" if orgdates
188
+ text_resume += "#{orgdesc.text}\n" if orgdesc
189
+ end
190
+ pats = page.css("#patents .patent")
191
+ text_resume += "\n\nPATENTS\n" if pats && pats.length > 0
192
+ pats.each do |pat|
193
+ pattitle = pat.at_css(".item-title")
194
+ patsub = pat.at_css(".item-subtitle")
195
+ patdates = pat.at_css(".date-range")
196
+ patdesc = pat.at_css(".description")
197
+ patdesc.css('br').each{|br| br.replace "\n"} if patdesc
198
+ patcont = pat.at_css(".contributors")
199
+ text_resume += "\n#{pattitle.text}\n" if pattitle
200
+ text_resume += "#{patsub.text}\n" if patsub
201
+ text_resume += "#{patdates.text}\n" if patdates
202
+ text_resume += "#{patdesc.text}\n" if patdesc
203
+ text_resume += "#{patcont.text}\n" if patcont
204
+ end
205
+ awards = page.css("#awards .award")
206
+ text_resume += "\n\nAWARDS\n" if awards && awards.length > 0
207
+ awards.each do |award|
208
+ atitle = award.at_css(".item-title")
209
+ asub = award.at_css(".item-subtitle")
210
+ adates = award.at_css(".date-range")
211
+ adesc = award.at_css(".description")
212
+ adesc.css('br').each{|br| br.replace "\n"} if adesc
213
+ text_resume += "\n#{atitle.text}\n" if atitle
214
+ text_resume += "#{asub.text}\n" if asub
215
+ text_resume += "#{adates.text}\n" if adates
216
+ text_resume += "#{adesc.text}\n" if adesc
217
+ end
218
+ courses = page.css("#courses li")
219
+ text_resume += "\n\nCOURSES\n" if courses && courses.length > 0
220
+ courses.each do |course|
221
+ coutitle = course.at_css(".item-title")
222
+ coulist = course.at_css(".courses-list")
223
+ text_resume += "\n#{coutitle.text}\n" if coutitle
224
+ text_resume += "#{coulist.text}\n" if coulist
225
+ end
226
+
227
+
228
+ row["Contact ID"] = contact_id
229
+ row["LIN ID"] = lin_id
230
+ row["CV TR"] = "1"
231
+ row["Account Name"] = acc_name
232
+ row["Linkedin Import Status"] = import_status
233
+ row["First Name"] = name[0]&.slice(0, 39)
234
+ row["Last Name"] = name[1..-1]&.join(" ")&.slice(0, 79)
235
+ row["Email"] = email
236
+ row["Candidate ID"] = cand_id
237
+ row["LIN 1st Degree"] = cand_source
238
+ row["Title"] = title&.slice(0, 127)
239
+ row["Contact Country"] = country
240
+ row["Contact LIN Sector"] = sector&.slice(0, 99)
241
+ row["Employer 1 Title"] = e1_title&.slice(0, 31999)
242
+ row["Employer Organization Name 1"] = e1_org&.slice(0, 254)
243
+ row["Employer 1 Start Date"] = format_date(e1_start) #format
244
+ row["Employer 1 End Date"] = format_date(e1_end) #format
245
+ row["Employer 1 Location"] = e1_loc&.slice(0, 254)
246
+ row["Employer 1 Description"] = e1_desc&.slice(0, 31999)
247
+ row["Employer 2 Title"] = e2_title&.slice(0, 31999)
248
+ row["Employer Organization Name 2"] = e2_org&.slice(0, 254)
249
+ row["Employer 2 Start Date"] = format_date(e2_start) #format
250
+ row["Employer 2 End Date"] = format_date(e2_end) #format
251
+ row["Employer 2 Location"] = e2_loc&.slice(0, 254)
252
+ row["Employer 2 Description"] = e2_desc&.slice(0, 31999)
253
+ row["Employer 3 Title"] = e3_title&.slice(0, 31999)
254
+ row["Employer Organization Name 3"] = e3_org&.slice(0, 254)
255
+ row["Employer 3 Start Date"] = format_date(e3_start) #format
256
+ row["Employer 3 End Date"] = format_date(e3_end) #format
257
+ row["Employer 3 Location"] = e3_loc&.slice(0, 254)
258
+ row["Employer 3 Description"] = e3_desc&.slice(0, 31999)
259
+ row["License or Certification Name 1"] = c1_name&.slice(0, 254)
260
+ row["License or Certification Name 2"] = c2_name&.slice(0, 254)
261
+ row["License or Certification Credential Type"] = c_type&.slice(0, 254)
262
+ row["Education School 1"] = s1_name&.slice(0, 124)
263
+ row["Education Degree Name 1"] = s1_degree&.slice(0, 254)
264
+ row["Education Degree Date 1"] = format_date(s1_end)
265
+ row["Education School 2"] = s2_name&.slice(0, 124)
266
+ row["Education Degree Name 2"] = s2_degree&.slice(0, 254)
267
+ row["Education Degree Date 2"] = format_date(s2_end)
268
+ row["Text Resume"] = text_resume&.slice(0, 31999)
269
+ row["LinkedIn Profile"] = lin_profile&.slice(0, 254)
270
+ row["Resume Last Updated"] = Time.now.strftime('%Y-%m-%d %H:%M:%S')
271
+ row["LIN Import Date"] = Time.now.strftime('%Y-%m-%d')
272
+ row["CV Uploaded"] = "1"
273
+
274
+ row
275
+
276
+ end
277
+
278
+ def scrape_education(input_row, page)
279
+ rows = []
280
+ schools = page.css("#education .schools .school")
281
+
282
+ schools.each do |school|
283
+ row = CSV::Row.new(@education_headers, [])
284
+ row["Contact"] = input_row["Contact ID"]
285
+ row["LIN ID"] = input_row["LIN ID"]
286
+ row["School Name"] = school.at_css(".item-title").text.slice(0, 149)
287
+ row["Major"] = school.at_css(".item-subtitle").text.slice(0, 254)
288
+ dstart = school.css(".date-range time")[0]
289
+ dend = school.css(".date-range time")[1]
290
+ if dend
291
+ row["Graduation Year"] = dend.text.gsub(/\D/, '').slice(0, 74)
292
+ else
293
+ row["Graduation Year"] = dstart.text.gsub(/\D/, '').slice(0, 74)
294
+ end
295
+ rows << row
296
+ end
297
+ rows
298
+ end
299
+
300
+ def scrape_employment(input_row, page)
301
+ rows = []
302
+ positions = page.css("#experience .positions .position")
303
+
304
+ positions.each do |position|
305
+ row = CSV::Row.new(@employment_headers, [])
306
+ row["Contact"] = input_row["Contact ID"]
307
+ row["LIN ID"] = input_row["LIN ID"]
308
+ row["Job Title"] = position.at_css(".item-title").text.slice(0, 74)
309
+ row["Employer Name"] = position.at_css(".item-subtitle").text.slice(0, 149)
310
+ jstart = position.css(".date-range time")[0]
311
+ jend = position.css(".date-range time")[1]
312
+ row["Start Date"] = format_date(jstart.text)
313
+ row["End Date"] = format_date(jend.text)
314
+ row["Location"] = position.at_css(".location").text.slice(0, 254)
315
+ rows << row
316
+ end
317
+ rows
318
+ end
319
+
320
+ end
@@ -0,0 +1,30 @@
1
# A single proxy endpoint together with its health status, the time it was
# last used and the user agent assigned to it.
class Proxy
  attr_accessor :ip, :port, :username, :password, :status, :last_used, :user_agent

  # @param ip [String] proxy host (required)
  # @param port [Integer, String] proxy port, defaults to 80
  # @param username [String, nil]
  # @param password [String, nil]
  # @param status [String, nil] 'good' or 'dead'
  # @param last_used [Time, nil]
  # @param user_agent [String, nil] user agent string tied to this proxy
  def initialize(ip:, port: 80, username: nil, password: nil, status: nil, last_used: nil, user_agent: nil)
    @ip = ip
    @port = port
    @username = username
    @password = password
    @status = status
    @last_used = last_used
    # Previously accepted but never stored, so #user_agent always returned nil.
    @user_agent = user_agent
  end

  # Marks the proxy as unusable and records the current time.
  def dead
    @status = 'dead'
    @last_used = Time.now
  end

  # Marks the proxy as usable and records the current time.
  def good
    @status = 'good'
    @last_used = Time.now
  end

  # @return [Boolean] true when the status is 'good'
  def good?
    @status == 'good'
  end

  # @return [Boolean] true when the status is 'dead'
  def dead?
    @status == 'dead'
  end

  # Records the current time as the last use without touching the status.
  def used
    @last_used = Time.now
  end
end
@@ -0,0 +1,42 @@
1
+ require_relative 'proxy'
2
+
3
# Loads the proxy list and hands out proxies one at a time, always picking
# the least recently used healthy proxy and enforcing a per-proxy cooldown.
class ProxyHandler

  # Reads "ip:port:username:password" lines from proxies.txt and one user
  # agent per line from agents.txt, pairing them in file order. Blank lines
  # and surrounding whitespace (including "\r" from CRLF files) are ignored.
  # Prints a hint and exits when no proxies are configured.
  #
  # NOTE(review): both files are resolved relative to the current working
  # directory, not to this file - confirm callers always run from a sibling
  # of the data directory.
  #
  # @param cooldown_time [Numeric] minimum seconds between two uses of the
  #   same proxy
  def initialize(cooldown_time = 5)
    @cooldown_time = cooldown_time
    @proxy_list = read_entries('./../data/proxies.txt').collect { |entry| entry.split(':') }
    @proxies = []
    @ua_list = read_entries('./../data/agents.txt')

    @proxy_list.each do |ip, port, username, password|
      attributes = { ip: ip, username: username, password: password, status: 'good',
                     # Backdated so every proxy is immediately out of cooldown.
                     last_used: Time.now - @cooldown_time,
                     # nil once there are more proxies than user agents.
                     user_agent: @ua_list.shift }
      # Only pass a port that is actually present, so that Proxy's own
      # default applies instead of an explicit nil.
      attributes[:port] = port if port
      @proxies << Proxy.new(**attributes)
    end
    if @proxies.empty?
      puts "proxies.txt is empty! if you don't want to use any proxies, use the -n flag. see docs for more."
      exit
    end

  end

  # Returns the healthy proxy that has been idle the longest, sleeping first
  # if it is still inside its cooldown window. Prints a message and exits
  # when every proxy is dead.
  #
  # @return [Proxy]
  def get_proxy
    @good_proxies = @proxies.select { |proxy| proxy.good? }
    best_proxy = @good_proxies.min_by { |proxy| proxy.last_used }
    if best_proxy
      duration = Time.now - best_proxy.last_used
      sleep(@cooldown_time - duration) if duration < @cooldown_time
      best_proxy
    else
      puts "All proxies are dead. Wait a few hours before resuming."
      exit
    end
  end

  # @return [Integer] number of configured proxies, dead ones included
  def length
    @proxies.length
  end

  private

  # Non-blank, whitespace-stripped lines of the file at path.
  def read_entries(path)
    File.read(path).each_line.collect { |line| line.strip }.reject { |line| line.empty? }
  end
end
+ end