linsc 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/linsc/cross_ref.rb +5 -5
- data/lib/linsc/csv_handlers.rb +3 -3
- data/lib/linsc/duck.rb +5 -1
- data/lib/linsc/merger.rb +3 -3
- data/lib/linsc/parsers.rb +64 -64
- data/lib/linsc.rb +1 -1
- data/linsc.gemspec +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2a7c54e13a9f1ade26f5330d410af6e23d6fcb72
|
4
|
+
data.tar.gz: b5f5b89e4169506b415f8a3fceeaff678a0615ca
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e79102d02acfcec2064b49611aa352b193d89bb90b93ff126995ea0e5a5e5584da6d3624208ce7bafcd223f82ea6c5baed1964d2cd1fa204dfdee0e326e19015
|
7
|
+
data.tar.gz: 48b1af2a42a580a5ad495abe53dd6f550057c7280c6942bde0126fe6226d792dbdcba62016054f9bfdbf2639e0bf57086a478f67a04fa2dec2d3a16a5f0aa7a7
|
data/lib/linsc/cross_ref.rb
CHANGED
@@ -42,20 +42,20 @@ class CrossRef
|
|
42
42
|
b = y[@master_lookup_field]
|
43
43
|
a && b ? a <=> b : a ? -1 : 1
|
44
44
|
end
|
45
|
-
master_lookup_values = master_data.collect {|row| row[@master_lookup_field]
|
45
|
+
master_lookup_values = master_data.collect {|row| row[@master_lookup_field] && row[@master_lookup_field].downcase}
|
46
46
|
i = 0
|
47
47
|
CSV.foreach(@child_path, headers: true, encoding: 'utf-8') do |child_row|
|
48
48
|
i += 1
|
49
49
|
puts "email lookup - row: #{i}/#{@child_length}"
|
50
|
-
child_lookup_value = child_row[@child_lookup_field]
|
51
|
-
if child_lookup_value
|
50
|
+
child_lookup_value = child_row[@child_lookup_field].downcase if child_row[@child_lookup_field]
|
51
|
+
if (child_lookup_value && child_lookup_value.include?('@')) || !@email_key ## generalize this
|
52
52
|
match_index = master_lookup_values.bsearch_index do |master_lookup_value|
|
53
53
|
child_lookup_value && master_lookup_value ?
|
54
54
|
child_lookup_value <=> master_lookup_value : child_lookup_value ? -1 : 1
|
55
55
|
end
|
56
56
|
if !match_index
|
57
57
|
match_index = master_data.find_index do |master_row|
|
58
|
-
master_secondary_lookups = @master_secondary_lookups.collect{|x| x
|
58
|
+
master_secondary_lookups = @master_secondary_lookups.collect{|x| x && x.downcase}
|
59
59
|
master_secondary_lookups.include?(child_lookup_value)
|
60
60
|
end
|
61
61
|
end
|
@@ -94,7 +94,7 @@ class CrossRef
|
|
94
94
|
end
|
95
95
|
master_row_new = CSV::Row.new(@headers, [])
|
96
96
|
master_row.each do |key, value|
|
97
|
-
master_row_new[key] = value
|
97
|
+
master_row_new[key] = value.encode('utf-8', invalid: :replace, undef: :replace, replace: '#') if value
|
98
98
|
end
|
99
99
|
master_row_new
|
100
100
|
end
|
data/lib/linsc/csv_handlers.rb
CHANGED
@@ -3,7 +3,7 @@ module CSVHandlers
|
|
3
3
|
values = []
|
4
4
|
headers.each do |header|
|
5
5
|
if encoding
|
6
|
-
values << row[header]
|
6
|
+
values << row[header].encode(encoding) if row[header]
|
7
7
|
else
|
8
8
|
values << row[header]
|
9
9
|
end
|
@@ -33,7 +33,7 @@ module CSVHandlers
|
|
33
33
|
unless File.exist?(f)
|
34
34
|
FileUtils.touch(f)
|
35
35
|
csv = CSV.open(f, "w+")
|
36
|
-
csv << @headers.collect {|x| x
|
36
|
+
csv << @headers.collect {|x| x && x.encode('utf-8')}
|
37
37
|
csv.close
|
38
38
|
end
|
39
39
|
end
|
@@ -42,7 +42,7 @@ module CSVHandlers
|
|
42
42
|
unless File.exist?(f)
|
43
43
|
FileUtils.touch(f)
|
44
44
|
csv = CSV.open(f, "w+")
|
45
|
-
csv << headers.collect {|x| x
|
45
|
+
csv << headers.collect {|x| x && x.encode('utf-8')}
|
46
46
|
csv.close
|
47
47
|
end
|
48
48
|
end
|
data/lib/linsc/duck.rb
CHANGED
@@ -42,7 +42,11 @@ class DuckScraper
|
|
42
42
|
CSV.foreach(@input_file, headers: true) do |input_row|
|
43
43
|
count += 1
|
44
44
|
next if @start && @start >= count
|
45
|
-
|
45
|
+
if @proxies
|
46
|
+
tries = @proxies.length
|
47
|
+
else
|
48
|
+
tries = 3
|
49
|
+
end
|
46
50
|
puts "ddg #{count}/#{@input_length}"
|
47
51
|
begin
|
48
52
|
unless sufficient_data?(input_row)
|
data/lib/linsc/merger.rb
CHANGED
@@ -29,7 +29,7 @@ class Merger
|
|
29
29
|
clean_file = File.read(lin_file, encoding: 'windows-1252').strip
|
30
30
|
CSV.parse(clean_file, headers: true, encoding: 'windows-1252') do |row|
|
31
31
|
row["Recruiter"] = recruiter_name
|
32
|
-
email = row['E-mail Address']
|
32
|
+
email = row['E-mail Address'].downcase if row['E-mail Address']
|
33
33
|
if emails.has_key?(email)
|
34
34
|
emails[email] << row
|
35
35
|
else
|
@@ -56,10 +56,10 @@ class Merger
|
|
56
56
|
output_row = CSV::Row.new(@headers, [])
|
57
57
|
correct_row.each do |key, value|
|
58
58
|
if @mapping[key]
|
59
|
-
output_row[@mapping[key]] = value
|
59
|
+
output_row[@mapping[key]] = value.encode('utf-8') if value
|
60
60
|
end
|
61
61
|
end
|
62
|
-
output_row['Email'] = output_row['Email']
|
62
|
+
output_row['Email'] = output_row['Email'].downcase if output_row['Email']
|
63
63
|
else
|
64
64
|
output_row = create_row(correct_row, @headers, 'utf-8')
|
65
65
|
end
|
data/lib/linsc/parsers.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module Parsers
|
2
2
|
def scrape_contact(input_row, page, mode)
|
3
3
|
row = CSV::Row.new(@headers, [])
|
4
|
-
name = page.at_css("#name")
|
4
|
+
name = page.at_css("#name").text.split
|
5
5
|
contact_id = input_row["Contact ID"]
|
6
6
|
lin_id = input_row["LIN ID"]
|
7
7
|
cv_tr = input_row["CV TR"]
|
@@ -11,53 +11,53 @@ module Parsers
|
|
11
11
|
lin_profile = input_row["Linkedin Profile"]
|
12
12
|
cand_id = input_row["Candidate ID"]
|
13
13
|
cand_source = input_row["LIN 1st Degree"]
|
14
|
-
title = page.at_css(".headline.title")
|
15
|
-
country = page.at_css("#demographics .locality")
|
16
|
-
sector = page.at_css("#demographics .descriptor:not(.adr)")
|
14
|
+
title = page.at_css(".headline.title").text
|
15
|
+
country = page.at_css("#demographics .locality").text
|
16
|
+
sector = page.at_css("#demographics .descriptor:not(.adr)").text
|
17
17
|
|
18
18
|
positions = page.css("#experience .positions .position")
|
19
19
|
if positions
|
20
|
-
e1_title = positions[0]
|
21
|
-
e1_org = positions[0]
|
22
|
-
e1_start = positions[0]
|
23
|
-
e1_end = positions[0]
|
24
|
-
e1_loc = positions[0]
|
25
|
-
e1_desc = positions[0]
|
26
|
-
e2_title = positions[1]
|
27
|
-
e2_org = positions[1]
|
28
|
-
e2_start = positions[1]
|
29
|
-
e2_end = positions[1]
|
30
|
-
e2_loc = positions[1]
|
31
|
-
e2_desc = positions[1]
|
32
|
-
e3_title = positions[2]
|
33
|
-
e3_org = positions[2]
|
34
|
-
e3_start = positions[2]
|
35
|
-
e3_end = positions[2]
|
36
|
-
e3_loc = positions[2]
|
37
|
-
e3_desc = positions[2]
|
20
|
+
e1_title = positions[0].at_css(".item-title").text
|
21
|
+
e1_org = positions[0].at_css(".item-subtitle").text
|
22
|
+
e1_start = positions[0].css(".date-range time")[0].text
|
23
|
+
e1_end = positions[0].css(".date-range time")[1].text
|
24
|
+
e1_loc = positions[0].at_css(".location").text
|
25
|
+
e1_desc = positions[0].at_css(".description").text
|
26
|
+
e2_title = positions[1].at_css(".item-title").text
|
27
|
+
e2_org = positions[1].at_css(".item-subtitle").text
|
28
|
+
e2_start = positions[1].css(".date-range time")[0].text
|
29
|
+
e2_end = positions[1].css(".date-range time")[1].text
|
30
|
+
e2_loc = positions[1].at_css(".location").text
|
31
|
+
e2_desc = positions[1].at_css(".description").text
|
32
|
+
e3_title = positions[2].at_css(".item-title").text
|
33
|
+
e3_org = positions[2].at_css(".item-subtitle").text
|
34
|
+
e3_start = positions[2].css(".date-range time")[0].text
|
35
|
+
e3_end = positions[2].css(".date-range time")[1].text
|
36
|
+
e3_loc = positions[2].at_css(".location").text
|
37
|
+
e3_desc = positions[2].at_css(".description").text
|
38
38
|
end
|
39
39
|
|
40
40
|
certs = page.css(".certifications .certification")
|
41
41
|
if certs
|
42
|
-
c1_name = certs[0]
|
43
|
-
c2_name = certs[1]
|
44
|
-
c_type = certs[0]
|
42
|
+
c1_name = certs[0].at_css(".item-title").text
|
43
|
+
c2_name = certs[1].at_css(".item-title").text
|
44
|
+
c_type = certs[0].at_css(".item-subtitle").text
|
45
45
|
end
|
46
46
|
|
47
47
|
schools = page.css("#education .schools .school")
|
48
48
|
if schools
|
49
|
-
s1_name = schools[0]
|
50
|
-
s2_name = schools[1]
|
51
|
-
s1_start = schools[0]
|
52
|
-
s2_start = schools[1]
|
53
|
-
s1_end = schools[0]
|
54
|
-
s2_end = schools[1]
|
55
|
-
s1_degree = schools[0]
|
56
|
-
s2_degree = schools[1]
|
49
|
+
s1_name = schools[0].at_css(".item-title").text
|
50
|
+
s2_name = schools[1].at_css(".item-title").text
|
51
|
+
s1_start = schools[0].css(".date-range time")[0].text
|
52
|
+
s2_start = schools[1].css(".date-range time")[0].text
|
53
|
+
s1_end = schools[0].css(".date-range time")[1].text
|
54
|
+
s2_end = schools[1].css(".date-range time")[1].text
|
55
|
+
s1_degree = schools[0].at_css(".item-subtitle").text
|
56
|
+
s2_degree = schools[1].at_css(".item-subtitle").text
|
57
57
|
end
|
58
58
|
|
59
59
|
summary = page.at_css("#summary .description")
|
60
|
-
summary
|
60
|
+
summary.css('br').each{|br| br.replace "\n"} if summary
|
61
61
|
|
62
62
|
text_resume = "\n\n***IMPORTED FROM LINKEDIN***\n#{lin_profile}\n\n"
|
63
63
|
text_resume += name.join(" ")
|
@@ -105,7 +105,7 @@ module Parsers
|
|
105
105
|
text_resume += "\nINTERESTS\n" if interests && interests.length > 0
|
106
106
|
ints = []
|
107
107
|
interests.each do |interest|
|
108
|
-
int = interest.at_css(".wrap")
|
108
|
+
int = interest.at_css(".wrap").text
|
109
109
|
if int
|
110
110
|
ints << int unless (int == "See less") || (int.match(/See \d+\+/))
|
111
111
|
end
|
@@ -115,7 +115,7 @@ module Parsers
|
|
115
115
|
text_resume += "\n\nSKILLS\n" if skills && skills.length > 0
|
116
116
|
sks = []
|
117
117
|
skills.each do |skill|
|
118
|
-
sk = skill.at_css(".wrap")
|
118
|
+
sk = skill.at_css(".wrap").text
|
119
119
|
if sk
|
120
120
|
sks << sk unless (sk == "See less") || (sk.match(/See \d+\+/))
|
121
121
|
end
|
@@ -125,7 +125,7 @@ module Parsers
|
|
125
125
|
text_resume += "\n\nLANGUAGES\n" if languages.length > 0
|
126
126
|
langs = []
|
127
127
|
languages.each do |language|
|
128
|
-
lang = language.at_css(".name")
|
128
|
+
lang = language.at_css(".name").text
|
129
129
|
prof = language.at_css(".proficiency")
|
130
130
|
lang += " (#{prof.text})" if prof && prof.text.length > 0
|
131
131
|
langs << lang if lang
|
@@ -230,43 +230,43 @@ module Parsers
|
|
230
230
|
row["CV TR"] = "1"
|
231
231
|
row["Account Name"] = acc_name
|
232
232
|
row["Linkedin Import Status"] = import_status
|
233
|
-
row["First Name"] = name[0]
|
234
|
-
row["Last Name"] = name[1..-1]
|
233
|
+
row["First Name"] = name[0].slice(0, 39)
|
234
|
+
row["Last Name"] = name[1..-1].join(" ").slice(0, 79)
|
235
235
|
row["Email"] = email
|
236
236
|
row["Candidate ID"] = cand_id
|
237
237
|
row["LIN 1st Degree"] = cand_source
|
238
|
-
row["Title"] = title
|
238
|
+
row["Title"] = title.slice(0, 127)
|
239
239
|
row["Contact Country"] = country
|
240
|
-
row["Contact LIN Sector"] = sector
|
241
|
-
row["Employer 1 Title"] = e1_title
|
242
|
-
row["Employer Organization Name 1"] = e1_org
|
240
|
+
row["Contact LIN Sector"] = sector.slice(0, 99)
|
241
|
+
row["Employer 1 Title"] = e1_title.slice(0, 31999)
|
242
|
+
row["Employer Organization Name 1"] = e1_org.slice(0, 254)
|
243
243
|
row["Employer 1 Start Date"] = format_date(e1_start) #format
|
244
244
|
row["Employer 1 End Date"] = format_date(e1_end) #format
|
245
|
-
row["Employer 1 Location"] = e1_loc
|
246
|
-
row["Employer 1 Description"] = e1_desc
|
247
|
-
row["Employer 2 Title"] = e2_title
|
248
|
-
row["Employer Organization Name 2"] = e2_org
|
245
|
+
row["Employer 1 Location"] = e1_loc.slice(0, 254)
|
246
|
+
row["Employer 1 Description"] = e1_desc.slice(0, 31999)
|
247
|
+
row["Employer 2 Title"] = e2_title.slice(0, 31999)
|
248
|
+
row["Employer Organization Name 2"] = e2_org.slice(0, 254)
|
249
249
|
row["Employer 2 Start Date"] = format_date(e2_start) #format
|
250
250
|
row["Employer 2 End Date"] = format_date(e2_end) #format
|
251
|
-
row["Employer 2 Location"] = e2_loc
|
252
|
-
row["Employer 2 Description"] = e2_desc
|
253
|
-
row["Employer 3 Title"] = e3_title
|
254
|
-
row["Employer Organization Name 3"] = e3_org
|
251
|
+
row["Employer 2 Location"] = e2_loc.slice(0, 254)
|
252
|
+
row["Employer 2 Description"] = e2_desc.slice(0, 31999)
|
253
|
+
row["Employer 3 Title"] = e3_title.slice(0, 31999)
|
254
|
+
row["Employer Organization Name 3"] = e3_org.slice(0, 254)
|
255
255
|
row["Employer 3 Start Date"] = format_date(e3_start) #format
|
256
256
|
row["Employer 3 End Date"] = format_date(e3_end) #format
|
257
|
-
row["Employer 3 Location"] = e3_loc
|
258
|
-
row["Employer 3 Description"] = e3_desc
|
259
|
-
row["License or Certification Name 1"] = c1_name
|
260
|
-
row["License or Certification Name 2"] = c2_name
|
261
|
-
row["License or Certification Credential Type"] = c_type
|
262
|
-
row["Education School 1"] = s1_name
|
263
|
-
row["Education Degree Name 1"] = s1_degree
|
257
|
+
row["Employer 3 Location"] = e3_loc.slice(0, 254)
|
258
|
+
row["Employer 3 Description"] = e3_desc.slice(0, 31999)
|
259
|
+
row["License or Certification Name 1"] = c1_name.slice(0, 254)
|
260
|
+
row["License or Certification Name 2"] = c2_name.slice(0, 254)
|
261
|
+
row["License or Certification Credential Type"] = c_type.slice(0, 254)
|
262
|
+
row["Education School 1"] = s1_name.slice(0, 124)
|
263
|
+
row["Education Degree Name 1"] = s1_degree.slice(0, 254)
|
264
264
|
row["Education Degree Date 1"] = format_date(s1_end)
|
265
|
-
row["Education School 2"] = s2_name
|
266
|
-
row["Education Degree Name 2"] = s2_degree
|
265
|
+
row["Education School 2"] = s2_name.slice(0, 124)
|
266
|
+
row["Education Degree Name 2"] = s2_degree.slice(0, 254)
|
267
267
|
row["Education Degree Date 2"] = format_date(s2_end)
|
268
|
-
row["Text Resume"] = text_resume
|
269
|
-
row["LinkedIn Profile"] = lin_profile
|
268
|
+
row["Text Resume"] = text_resume.slice(0, 31999)
|
269
|
+
row["LinkedIn Profile"] = lin_profile.slice(0, 254)
|
270
270
|
row["Resume Last Updated"] = Time.now.strftime('%Y-%m-%d %H:%M:%S')
|
271
271
|
row["LIN Import Date"] = Time.now.strftime('%Y-%m-%d')
|
272
272
|
row["CV Uploaded"] = "1"
|
@@ -281,7 +281,7 @@ module Parsers
|
|
281
281
|
|
282
282
|
schools.each do |school|
|
283
283
|
row = CSV::Row.new(@education_headers, [])
|
284
|
-
row["Contact"] = input_row["Contact ID"]
|
284
|
+
row["Contact ID"] = input_row["Contact ID"]
|
285
285
|
row["LIN ID"] = input_row["LIN ID"]
|
286
286
|
row["School Name"] = school.at_css(".item-title").text.slice(0, 149)
|
287
287
|
row["Major"] = school.at_css(".item-subtitle").text.slice(0, 254)
|
@@ -303,7 +303,7 @@ module Parsers
|
|
303
303
|
|
304
304
|
positions.each do |position|
|
305
305
|
row = CSV::Row.new(@employment_headers, [])
|
306
|
-
row["Contact"] = input_row["Contact ID"]
|
306
|
+
row["Contact ID"] = input_row["Contact ID"]
|
307
307
|
row["LIN ID"] = input_row["LIN ID"]
|
308
308
|
row["Job Title"] = position.at_css(".item-title").text.slice(0, 74)
|
309
309
|
row["Employer Name"] = position.at_css(".item-subtitle").text.slice(0, 149)
|
data/lib/linsc.rb
CHANGED
data/linsc.gemspec
CHANGED
@@ -4,7 +4,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
4
|
|
5
5
|
Gem::Specification.new do |spec|
|
6
6
|
spec.name = "linsc"
|
7
|
-
spec.version = "0.0.5"
|
7
|
+
spec.version = "0.0.6"
|
8
8
|
spec.authors = ["Dan Molloy"]
|
9
9
|
spec.email = ["danieljmolloy1@gmail.com"]
|
10
10
|
spec.date = '2016-03-31'
|