linsc 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/linsc/cross_ref.rb +5 -5
- data/lib/linsc/csv_handlers.rb +3 -3
- data/lib/linsc/duck.rb +5 -1
- data/lib/linsc/merger.rb +3 -3
- data/lib/linsc/parsers.rb +64 -64
- data/lib/linsc.rb +1 -1
- data/linsc.gemspec +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2a7c54e13a9f1ade26f5330d410af6e23d6fcb72
|
4
|
+
data.tar.gz: b5f5b89e4169506b415f8a3fceeaff678a0615ca
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e79102d02acfcec2064b49611aa352b193d89bb90b93ff126995ea0e5a5e5584da6d3624208ce7bafcd223f82ea6c5baed1964d2cd1fa204dfdee0e326e19015
|
7
|
+
data.tar.gz: 48b1af2a42a580a5ad495abe53dd6f550057c7280c6942bde0126fe6226d792dbdcba62016054f9bfdbf2639e0bf57086a478f67a04fa2dec2d3a16a5f0aa7a7
|
data/lib/linsc/cross_ref.rb
CHANGED
@@ -42,20 +42,20 @@ class CrossRef
|
|
42
42
|
b = y[@master_lookup_field]
|
43
43
|
a && b ? a <=> b : a ? -1 : 1
|
44
44
|
end
|
45
|
-
master_lookup_values = master_data.collect {|row| row[@master_lookup_field]
|
45
|
+
master_lookup_values = master_data.collect {|row| row[@master_lookup_field] && row[@master_lookup_field].downcase}
|
46
46
|
i = 0
|
47
47
|
CSV.foreach(@child_path, headers: true, encoding: 'utf-8') do |child_row|
|
48
48
|
i += 1
|
49
49
|
puts "email lookup - row: #{i}/#{@child_length}"
|
50
|
-
child_lookup_value = child_row[@child_lookup_field]
|
51
|
-
if child_lookup_value
|
50
|
+
child_lookup_value = child_row[@child_lookup_field].downcase if child_row[@child_lookup_field]
|
51
|
+
if (child_lookup_value && child_lookup_value.include?('@')) || !@email_key ## generalize this
|
52
52
|
match_index = master_lookup_values.bsearch_index do |master_lookup_value|
|
53
53
|
child_lookup_value && master_lookup_value ?
|
54
54
|
child_lookup_value <=> master_lookup_value : child_lookup_value ? -1 : 1
|
55
55
|
end
|
56
56
|
if !match_index
|
57
57
|
match_index = master_data.find_index do |master_row|
|
58
|
-
master_secondary_lookups = @master_secondary_lookups.collect{|x| x
|
58
|
+
master_secondary_lookups = @master_secondary_lookups.collect{|x| x && x.downcase}
|
59
59
|
master_secondary_lookups.include?(child_lookup_value)
|
60
60
|
end
|
61
61
|
end
|
@@ -94,7 +94,7 @@ class CrossRef
|
|
94
94
|
end
|
95
95
|
master_row_new = CSV::Row.new(@headers, [])
|
96
96
|
master_row.each do |key, value|
|
97
|
-
master_row_new[key] = value
|
97
|
+
master_row_new[key] = value.encode('utf-8', invalid: :replace, undef: :replace, replace: '#') if value
|
98
98
|
end
|
99
99
|
master_row_new
|
100
100
|
end
|
data/lib/linsc/csv_handlers.rb
CHANGED
@@ -3,7 +3,7 @@ module CSVHandlers
|
|
3
3
|
values = []
|
4
4
|
headers.each do |header|
|
5
5
|
if encoding
|
6
|
-
values << row[header]
|
6
|
+
values << row[header].encode(encoding) if row[header]
|
7
7
|
else
|
8
8
|
values << row[header]
|
9
9
|
end
|
@@ -33,7 +33,7 @@ module CSVHandlers
|
|
33
33
|
unless File.exist?(f)
|
34
34
|
FileUtils.touch(f)
|
35
35
|
csv = CSV.open(f, "w+")
|
36
|
-
csv << @headers.collect {|x| x
|
36
|
+
csv << @headers.collect {|x| x && x.encode('utf-8')}
|
37
37
|
csv.close
|
38
38
|
end
|
39
39
|
end
|
@@ -42,7 +42,7 @@ module CSVHandlers
|
|
42
42
|
unless File.exist?(f)
|
43
43
|
FileUtils.touch(f)
|
44
44
|
csv = CSV.open(f, "w+")
|
45
|
-
csv << headers.collect {|x| x
|
45
|
+
csv << headers.collect {|x| x && x.encode('utf-8')}
|
46
46
|
csv.close
|
47
47
|
end
|
48
48
|
end
|
data/lib/linsc/duck.rb
CHANGED
@@ -42,7 +42,11 @@ class DuckScraper
|
|
42
42
|
CSV.foreach(@input_file, headers: true) do |input_row|
|
43
43
|
count += 1
|
44
44
|
next if @start && @start >= count
|
45
|
-
|
45
|
+
if @proxies
|
46
|
+
tries = @proxies.length
|
47
|
+
else
|
48
|
+
tries = 3
|
49
|
+
end
|
46
50
|
puts "ddg #{count}/#{@input_length}"
|
47
51
|
begin
|
48
52
|
unless sufficient_data?(input_row)
|
data/lib/linsc/merger.rb
CHANGED
@@ -29,7 +29,7 @@ class Merger
|
|
29
29
|
clean_file = File.read(lin_file, encoding: 'windows-1252').strip
|
30
30
|
CSV.parse(clean_file, headers: true, encoding: 'windows-1252') do |row|
|
31
31
|
row["Recruiter"] = recruiter_name
|
32
|
-
email = row['E-mail Address']
|
32
|
+
email = row['E-mail Address'].downcase if row['E-mail Address']
|
33
33
|
if emails.has_key?(email)
|
34
34
|
emails[email] << row
|
35
35
|
else
|
@@ -56,10 +56,10 @@ class Merger
|
|
56
56
|
output_row = CSV::Row.new(@headers, [])
|
57
57
|
correct_row.each do |key, value|
|
58
58
|
if @mapping[key]
|
59
|
-
output_row[@mapping[key]] = value
|
59
|
+
output_row[@mapping[key]] = value.encode('utf-8') if value
|
60
60
|
end
|
61
61
|
end
|
62
|
-
output_row['Email'] = output_row['Email']
|
62
|
+
output_row['Email'] = output_row['Email'].downcase if output_row['Email']
|
63
63
|
else
|
64
64
|
output_row = create_row(correct_row, @headers, 'utf-8')
|
65
65
|
end
|
data/lib/linsc/parsers.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
module Parsers
|
2
2
|
def scrape_contact(input_row, page, mode)
|
3
3
|
row = CSV::Row.new(@headers, [])
|
4
|
-
name = page.at_css("#name")
|
4
|
+
name = page.at_css("#name").text.split
|
5
5
|
contact_id = input_row["Contact ID"]
|
6
6
|
lin_id = input_row["LIN ID"]
|
7
7
|
cv_tr = input_row["CV TR"]
|
@@ -11,53 +11,53 @@ module Parsers
|
|
11
11
|
lin_profile = input_row["Linkedin Profile"]
|
12
12
|
cand_id = input_row["Candidate ID"]
|
13
13
|
cand_source = input_row["LIN 1st Degree"]
|
14
|
-
title = page.at_css(".headline.title")
|
15
|
-
country = page.at_css("#demographics .locality")
|
16
|
-
sector = page.at_css("#demographics .descriptor:not(.adr)")
|
14
|
+
title = page.at_css(".headline.title").text
|
15
|
+
country = page.at_css("#demographics .locality").text
|
16
|
+
sector = page.at_css("#demographics .descriptor:not(.adr)").text
|
17
17
|
|
18
18
|
positions = page.css("#experience .positions .position")
|
19
19
|
if positions
|
20
|
-
e1_title = positions[0]
|
21
|
-
e1_org = positions[0]
|
22
|
-
e1_start = positions[0]
|
23
|
-
e1_end = positions[0]
|
24
|
-
e1_loc = positions[0]
|
25
|
-
e1_desc = positions[0]
|
26
|
-
e2_title = positions[1]
|
27
|
-
e2_org = positions[1]
|
28
|
-
e2_start = positions[1]
|
29
|
-
e2_end = positions[1]
|
30
|
-
e2_loc = positions[1]
|
31
|
-
e2_desc = positions[1]
|
32
|
-
e3_title = positions[2]
|
33
|
-
e3_org = positions[2]
|
34
|
-
e3_start = positions[2]
|
35
|
-
e3_end = positions[2]
|
36
|
-
e3_loc = positions[2]
|
37
|
-
e3_desc = positions[2]
|
20
|
+
e1_title = positions[0].at_css(".item-title").text
|
21
|
+
e1_org = positions[0].at_css(".item-subtitle").text
|
22
|
+
e1_start = positions[0].css(".date-range time")[0].text
|
23
|
+
e1_end = positions[0].css(".date-range time")[1].text
|
24
|
+
e1_loc = positions[0].at_css(".location").text
|
25
|
+
e1_desc = positions[0].at_css(".description").text
|
26
|
+
e2_title = positions[1].at_css(".item-title").text
|
27
|
+
e2_org = positions[1].at_css(".item-subtitle").text
|
28
|
+
e2_start = positions[1].css(".date-range time")[0].text
|
29
|
+
e2_end = positions[1].css(".date-range time")[1].text
|
30
|
+
e2_loc = positions[1].at_css(".location").text
|
31
|
+
e2_desc = positions[1].at_css(".description").text
|
32
|
+
e3_title = positions[2].at_css(".item-title").text
|
33
|
+
e3_org = positions[2].at_css(".item-subtitle").text
|
34
|
+
e3_start = positions[2].css(".date-range time")[0].text
|
35
|
+
e3_end = positions[2].css(".date-range time")[1].text
|
36
|
+
e3_loc = positions[2].at_css(".location").text
|
37
|
+
e3_desc = positions[2].at_css(".description").text
|
38
38
|
end
|
39
39
|
|
40
40
|
certs = page.css(".certifications .certification")
|
41
41
|
if certs
|
42
|
-
c1_name = certs[0]
|
43
|
-
c2_name = certs[1]
|
44
|
-
c_type = certs[0]
|
42
|
+
c1_name = certs[0].at_css(".item-title").text
|
43
|
+
c2_name = certs[1].at_css(".item-title").text
|
44
|
+
c_type = certs[0].at_css(".item-subtitle").text
|
45
45
|
end
|
46
46
|
|
47
47
|
schools = page.css("#education .schools .school")
|
48
48
|
if schools
|
49
|
-
s1_name = schools[0]
|
50
|
-
s2_name = schools[1]
|
51
|
-
s1_start = schools[0]
|
52
|
-
s2_start = schools[1]
|
53
|
-
s1_end = schools[0]
|
54
|
-
s2_end = schools[1]
|
55
|
-
s1_degree = schools[0]
|
56
|
-
s2_degree = schools[1]
|
49
|
+
s1_name = schools[0].at_css(".item-title").text
|
50
|
+
s2_name = schools[1].at_css(".item-title").text
|
51
|
+
s1_start = schools[0].css(".date-range time")[0].text
|
52
|
+
s2_start = schools[1].css(".date-range time")[0].text
|
53
|
+
s1_end = schools[0].css(".date-range time")[1].text
|
54
|
+
s2_end = schools[1].css(".date-range time")[1].text
|
55
|
+
s1_degree = schools[0].at_css(".item-subtitle").text
|
56
|
+
s2_degree = schools[1].at_css(".item-subtitle").text
|
57
57
|
end
|
58
58
|
|
59
59
|
summary = page.at_css("#summary .description")
|
60
|
-
summary
|
60
|
+
summary.css('br').each{|br| br.replace "\n"} if summary
|
61
61
|
|
62
62
|
text_resume = "\n\n***IMPORTED FROM LINKEDIN***\n#{lin_profile}\n\n"
|
63
63
|
text_resume += name.join(" ")
|
@@ -105,7 +105,7 @@ module Parsers
|
|
105
105
|
text_resume += "\nINTERESTS\n" if interests && interests.length > 0
|
106
106
|
ints = []
|
107
107
|
interests.each do |interest|
|
108
|
-
int = interest.at_css(".wrap")
|
108
|
+
int = interest.at_css(".wrap").text
|
109
109
|
if int
|
110
110
|
ints << int unless (int == "See less") || (int.match(/See \d+\+/))
|
111
111
|
end
|
@@ -115,7 +115,7 @@ module Parsers
|
|
115
115
|
text_resume += "\n\nSKILLS\n" if skills && skills.length > 0
|
116
116
|
sks = []
|
117
117
|
skills.each do |skill|
|
118
|
-
sk = skill.at_css(".wrap")
|
118
|
+
sk = skill.at_css(".wrap").text
|
119
119
|
if sk
|
120
120
|
sks << sk unless (sk == "See less") || (sk.match(/See \d+\+/))
|
121
121
|
end
|
@@ -125,7 +125,7 @@ module Parsers
|
|
125
125
|
text_resume += "\n\nLANGUAGES\n" if languages.length > 0
|
126
126
|
langs = []
|
127
127
|
languages.each do |language|
|
128
|
-
lang = language.at_css(".name")
|
128
|
+
lang = language.at_css(".name").text
|
129
129
|
prof = language.at_css(".proficiency")
|
130
130
|
lang += " (#{prof.text})" if prof && prof.text.length > 0
|
131
131
|
langs << lang if lang
|
@@ -230,43 +230,43 @@ module Parsers
|
|
230
230
|
row["CV TR"] = "1"
|
231
231
|
row["Account Name"] = acc_name
|
232
232
|
row["Linkedin Import Status"] = import_status
|
233
|
-
row["First Name"] = name[0]
|
234
|
-
row["Last Name"] = name[1..-1]
|
233
|
+
row["First Name"] = name[0].slice(0, 39)
|
234
|
+
row["Last Name"] = name[1..-1].join(" ").slice(0, 79)
|
235
235
|
row["Email"] = email
|
236
236
|
row["Candidate ID"] = cand_id
|
237
237
|
row["LIN 1st Degree"] = cand_source
|
238
|
-
row["Title"] = title
|
238
|
+
row["Title"] = title.slice(0, 127)
|
239
239
|
row["Contact Country"] = country
|
240
|
-
row["Contact LIN Sector"] = sector
|
241
|
-
row["Employer 1 Title"] = e1_title
|
242
|
-
row["Employer Organization Name 1"] = e1_org
|
240
|
+
row["Contact LIN Sector"] = sector.slice(0, 99)
|
241
|
+
row["Employer 1 Title"] = e1_title.slice(0, 31999)
|
242
|
+
row["Employer Organization Name 1"] = e1_org.slice(0, 254)
|
243
243
|
row["Employer 1 Start Date"] = format_date(e1_start) #format
|
244
244
|
row["Employer 1 End Date"] = format_date(e1_end) #format
|
245
|
-
row["Employer 1 Location"] = e1_loc
|
246
|
-
row["Employer 1 Description"] = e1_desc
|
247
|
-
row["Employer 2 Title"] = e2_title
|
248
|
-
row["Employer Organization Name 2"] = e2_org
|
245
|
+
row["Employer 1 Location"] = e1_loc.slice(0, 254)
|
246
|
+
row["Employer 1 Description"] = e1_desc.slice(0, 31999)
|
247
|
+
row["Employer 2 Title"] = e2_title.slice(0, 31999)
|
248
|
+
row["Employer Organization Name 2"] = e2_org.slice(0, 254)
|
249
249
|
row["Employer 2 Start Date"] = format_date(e2_start) #format
|
250
250
|
row["Employer 2 End Date"] = format_date(e2_end) #format
|
251
|
-
row["Employer 2 Location"] = e2_loc
|
252
|
-
row["Employer 2 Description"] = e2_desc
|
253
|
-
row["Employer 3 Title"] = e3_title
|
254
|
-
row["Employer Organization Name 3"] = e3_org
|
251
|
+
row["Employer 2 Location"] = e2_loc.slice(0, 254)
|
252
|
+
row["Employer 2 Description"] = e2_desc.slice(0, 31999)
|
253
|
+
row["Employer 3 Title"] = e3_title.slice(0, 31999)
|
254
|
+
row["Employer Organization Name 3"] = e3_org.slice(0, 254)
|
255
255
|
row["Employer 3 Start Date"] = format_date(e3_start) #format
|
256
256
|
row["Employer 3 End Date"] = format_date(e3_end) #format
|
257
|
-
row["Employer 3 Location"] = e3_loc
|
258
|
-
row["Employer 3 Description"] = e3_desc
|
259
|
-
row["License or Certification Name 1"] = c1_name
|
260
|
-
row["License or Certification Name 2"] = c2_name
|
261
|
-
row["License or Certification Credential Type"] = c_type
|
262
|
-
row["Education School 1"] = s1_name
|
263
|
-
row["Education Degree Name 1"] = s1_degree
|
257
|
+
row["Employer 3 Location"] = e3_loc.slice(0, 254)
|
258
|
+
row["Employer 3 Description"] = e3_desc.slice(0, 31999)
|
259
|
+
row["License or Certification Name 1"] = c1_name.slice(0, 254)
|
260
|
+
row["License or Certification Name 2"] = c2_name.slice(0, 254)
|
261
|
+
row["License or Certification Credential Type"] = c_type.slice(0, 254)
|
262
|
+
row["Education School 1"] = s1_name.slice(0, 124)
|
263
|
+
row["Education Degree Name 1"] = s1_degree.slice(0, 254)
|
264
264
|
row["Education Degree Date 1"] = format_date(s1_end)
|
265
|
-
row["Education School 2"] = s2_name
|
266
|
-
row["Education Degree Name 2"] = s2_degree
|
265
|
+
row["Education School 2"] = s2_name.slice(0, 124)
|
266
|
+
row["Education Degree Name 2"] = s2_degree.slice(0, 254)
|
267
267
|
row["Education Degree Date 2"] = format_date(s2_end)
|
268
|
-
row["Text Resume"] = text_resume
|
269
|
-
row["LinkedIn Profile"] = lin_profile
|
268
|
+
row["Text Resume"] = text_resume.slice(0, 31999)
|
269
|
+
row["LinkedIn Profile"] = lin_profile.slice(0, 254)
|
270
270
|
row["Resume Last Updated"] = Time.now.strftime('%Y-%m-%d %H:%M:%S')
|
271
271
|
row["LIN Import Date"] = Time.now.strftime('%Y-%m-%d')
|
272
272
|
row["CV Uploaded"] = "1"
|
@@ -281,7 +281,7 @@ module Parsers
|
|
281
281
|
|
282
282
|
schools.each do |school|
|
283
283
|
row = CSV::Row.new(@education_headers, [])
|
284
|
-
row["Contact"] = input_row["Contact ID"]
|
284
|
+
row["Contact ID"] = input_row["Contact ID"]
|
285
285
|
row["LIN ID"] = input_row["LIN ID"]
|
286
286
|
row["School Name"] = school.at_css(".item-title").text.slice(0, 149)
|
287
287
|
row["Major"] = school.at_css(".item-subtitle").text.slice(0, 254)
|
@@ -303,7 +303,7 @@ module Parsers
|
|
303
303
|
|
304
304
|
positions.each do |position|
|
305
305
|
row = CSV::Row.new(@employment_headers, [])
|
306
|
-
row["Contact"] = input_row["Contact ID"]
|
306
|
+
row["Contact ID"] = input_row["Contact ID"]
|
307
307
|
row["LIN ID"] = input_row["LIN ID"]
|
308
308
|
row["Job Title"] = position.at_css(".item-title").text.slice(0, 74)
|
309
309
|
row["Employer Name"] = position.at_css(".item-subtitle").text.slice(0, 149)
|
data/lib/linsc.rb
CHANGED
data/linsc.gemspec
CHANGED
@@ -4,7 +4,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
|
4
4
|
|
5
5
|
Gem::Specification.new do |spec|
|
6
6
|
spec.name = "linsc"
|
7
|
-
spec.version = "0.0.5"
|
7
|
+
spec.version = "0.0.6"
|
8
8
|
spec.authors = ["Dan Molloy"]
|
9
9
|
spec.email = ["danieljmolloy1@gmail.com"]
|
10
10
|
spec.date = '2016-03-31'
|