linsc 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +40 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/linsc +5 -0
- data/bin/setup +8 -0
- data/data/agents.txt +10 -0
- data/data/proxies.txt +0 -0
- data/data/recruiters.txt +0 -0
- data/lib/linsc.rb +159 -0
- data/lib/linsc/cross_ref.rb +113 -0
- data/lib/linsc/csv_handlers.rb +53 -0
- data/lib/linsc/duck.rb +179 -0
- data/lib/linsc/lin.rb +303 -0
- data/lib/linsc/merger.rb +70 -0
- data/lib/linsc/parsers.rb +320 -0
- data/lib/linsc/proxy.rb +30 -0
- data/lib/linsc/proxy_handler.rb +42 -0
- data/linsc-0.0.1.gem +0 -0
- data/linsc.gemspec +31 -0
- metadata +140 -0
data/lib/linsc/duck.rb
ADDED
@@ -0,0 +1,179 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'mechanize'
|
3
|
+
require 'csv'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'i18n'
|
6
|
+
require_relative 'proxy'
|
7
|
+
require_relative 'proxy_handler'
|
8
|
+
require_relative 'csv_handlers'
|
9
|
+
|
10
|
+
# Core extension: String#alnum replaces every character that is neither
# alphanumeric nor whitespace with a single space, preserving length-ish
# word boundaries for later `split`/`strip` calls.
# TODO(review): move into a namespaced core_extensions module (or a
# refinement) instead of reopening String globally.
class String
  def alnum
    gsub(/[^\p{Alnum}\p{Space}]/u, ' ')
  end
end
|
16
|
+
|
17
|
+
# Scrapes DuckDuckGo's HTML search for likely LinkedIn profile URLs for each
# contact row of the input CSV, appending the candidate URLs (or a status
# explaining their absence) to the output CSV. Supports resuming a partial
# run and an optional rotating proxy pool.
class DuckScraper

  include CSVHandlers

  # working_dir - base directory (kept for parity with the other scrapers).
  # input_file  - contacts CSV; expected to carry name/employer/title columns.
  # output_file - CSV that receives each input row plus status and URLs.
  # options     - Hash; :noproxy disables the proxy pool and rate-limits with
  #               a sleep instead.
  def initialize(working_dir, input_file, output_file, options)
    @working_dir, @input_file, @output_file, @noproxy =
      working_dir, input_file, output_file, options[:noproxy]

    @headers = get_headers(@input_file)
    @headers << "Linkedin Import Status" unless @headers.include?("Linkedin Import Status")
    @headers << "Urls" unless @headers.include?("Urls")
    # Row count via `wc -l`, minus the header row. NOTE(review): assumes a
    # POSIX shell with wc is available.
    @input_length = %x(wc -l "#{@input_file}").split[0].to_i - 1
    if File.exist?(@output_file)
      # Resume support: rows already written in a previous run are skipped.
      @start = CSV.read(@output_file, headers: true).length
      puts "resuming from row #{@start}"
    else
      create_file(@output_file)
    end
    @cooldown = 5
    @proxies = ProxyHandler.new(@cooldown) unless @noproxy
  end

  # Iterates the input CSV and writes one output row per contact: either the
  # candidate LinkedIn URLs found on DDG, or a status describing the failure.
  def find_profiles
    count = 0

    CSV.foreach(@input_file, headers: true) do |input_row|
      count += 1
      next if @start && @start >= count
      # One attempt per available proxy, or 3 attempts when proxyless.
      tries = @proxies&.length || 3
      puts "ddg #{count}/#{@input_length}"
      begin
        unless sufficient_data?(input_row)
          puts "Insufficient data, skipping"
          append_ddg_row(input_row, "Insufficient Data", nil)
          next
        end
        agent = Mechanize.new

        unless @noproxy
          proxy = @proxies.get_proxy
          agent.set_proxy(proxy.ip, proxy.port, proxy.username, proxy.password)
          agent.user_agent = proxy.user_agent
          puts "proxy: #{proxy.ip}"
        end
        sleep(@cooldown) if @noproxy
        query_string = create_query(input_row)
        puts "query string: #{query_string}"
        ddg_page = agent.get('https://www.duckduckgo.com/html')
        search_form = ddg_page.form_with(id: 'search_form_homepage')
        search_form.q = query_string
        results_page = agent.submit(search_form)
        urls = find_results(results_page, input_row)
        if urls.length > 0
          puts "Success! #{urls.length} possible urls found"
          append_ddg_row(input_row, "DDG results found", urls.join(', '))
        else
          puts "no results found"
          append_ddg_row(input_row, "No DDG results found", nil)
        end
        proxy.good if proxy

      rescue StandardError => msg
        tries -= 1
        if tries > 0
          puts "\n\n"
          puts msg
          puts 'RETRYING'
          puts "\n\n"
          proxy.used if proxy
          retry
        else
          # Out of retries: record the error message as the import status.
          append_ddg_row(input_row, msg, nil)
          puts msg
        end
      end
    end
  end

  # Appends `row` to the output CSV with the given status and URL list.
  def append_ddg_row(row, status, urls)
    row << ["Linkedin Import Status", status]
    row << ["Urls", urls]
    output_row = create_row(row, @headers)
    append_to_csv(@output_file, output_row)
  end

  # True when all four fields required to build a meaningful search query are
  # present and non-blank after punctuation stripping.
  def sufficient_data?(row)
    ["First Name", "Last Name",
     "Employer Organization Name 1", "Employer 1 Title"].all? do |field|
      row[field] && row[field].alnum.strip != ""
    end
  end

  # Filters the DDG results page down to plausible LinkedIn profile URLs for
  # this contact. URLs whose snippet also mentions the contact's title and
  # employer are ranked first. Returns an array of URL strings (may be empty).
  def find_results(page, row)
    matches = []
    full_name = "#{row['First Name']} #{row['Last Name']}"
    # BUG FIX: the original called gsub with a nil pattern when the Email
    # column was absent, raising TypeError and burning every retry.
    full_name = full_name.gsub(row["Email"], ' ') if row["Email"]
    full_name = full_name.alnum.strip
    # NOTE: a NodeSet is always truthy, so the original `if page.css(...)`
    # guard could never short-circuit; iterating a possibly-empty set is the
    # equivalent, explicit behavior.
    results = page.css("#links .results_links_deep")
    results.each do |result|
      next unless result.at_css("a.result__a")

      url_text = result.css("a.result__a").text.alnum
      url = result.at_css('a.result__a')['href']
      bio = result.css("a.result__snippet").text.alnum
      short_title = row["Employer 1 Title"].alnum.split.first(2)
      short_employer = row["Employer Organization Name 1"].alnum.split.first

      # Reject LinkedIn directory pages and any non-profile URL.
      valid_url = !result.css("a.large").text.include?("profiles | LinkedIn") &&
                  url.include?("linkedin") &&
                  (url.include?("/in/") || url.include?("/pub/"))
      next unless valid_url && name_check(url_text, full_name)

      if bio.downcase.include?(short_title.join(' ').downcase) &&
         bio.downcase.include?(short_employer.to_s.downcase)
        matches.unshift(url) # strong match: title + employer in snippet
      else
        matches.push(url)
      end
    end
    matches
  end

  # True when every whitespace-separated chunk of csv_name appears somewhere
  # in lin_name (case-insensitive containment, order-insensitive).
  def name_check(lin_name, csv_name)
    lin_array = lin_name.downcase.split(" ")
    csv_name.downcase.split(" ").all? { |chunk| lin_array.include?(chunk) }
  end

  # Builds the query "linkedin <first> <last> <title> <employer>", scrubbing
  # the contact's email address out of those fields first.
  def create_query(row)
    query_parts = [row["First Name"], row["Last Name"], row["Employer 1 Title"],
                   row["Employer Organization Name 1"]]
    query_parts.collect! do |part|
      # NOTE(review): gsub! intentionally mutates the row's cell in place
      # (kept from the original so downstream output is unchanged).
      # BUG FIX: guard against a nil Email cell, which made gsub! raise.
      part.gsub!(row["Email"], ' ') if row["Email"]
      part.downcase.alnum.strip
    end
    "linkedin #{query_parts.join(' ')}"
  end

end
|
data/lib/linsc/lin.rb
ADDED
@@ -0,0 +1,303 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'mechanize'
|
3
|
+
require 'csv'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'i18n'
|
6
|
+
require_relative 'proxy'
|
7
|
+
require_relative 'proxy_handler'
|
8
|
+
require_relative 'csv_handlers'
|
9
|
+
require_relative 'parsers'
|
10
|
+
|
11
|
+
# Core extension: String#alnum turns every character that is neither
# alphanumeric nor whitespace into a single space.
# TODO(review): duplicated in duck.rb — extract into a shared
# core_extensions module (or a refinement) instead of reopening String.
class String
  def alnum
    gsub(/[^\p{Alnum}\p{Space}]/u, ' ')
  end
end
|
17
|
+
|
18
|
+
# Nil-safe stubs so the scraping code can chain Nokogiri-style calls
# (`page.at_css(...).text`, `...['href']`, `gsub`, `slice`) on a missing node
# without raising NoMethodError.
# NOTE(review): monkey-patching NilClass globally hides genuine nil bugs in
# unrelated code; safe navigation (`&.`) would be the safer idiom.
class NilClass
  # Nokogiri-ish accessors: a missing node has no text, attributes, or
  # children, so each quietly yields nil.
  def text
    nil
  end

  def [](_options = {})
    nil
  end

  def css(_options = {})
    nil
  end

  def at_css(_options = {})
    nil
  end

  # String-ish operations on a missing value are no-ops returning nil.
  def gsub(_pattern, _replacement)
    nil
  end

  def gsub!(_pattern, _replacement)
    nil
  end

  def slice(_start, _length, _options = {})
    nil
  end

  # nil never contains anything.
  def include?(_item)
    false
  end

end
|
45
|
+
|
46
|
+
# Visits each candidate LinkedIn URL produced by the DDG pass, validates that
# the profile matches the contact, and writes contact / employment / education
# CSVs split into "update" (existing Contact ID) and "insert" (new) sets.
# Scraping of page sections is delegated to the Parsers mixin
# (scrape_contact / scrape_employment / scrape_education — defined elsewhere;
# see parsers.rb).
class LinScraper
  include CSVHandlers
  include Parsers

  # working_dir - directory where the six output CSVs are created.
  # input_file  - CSV produced by the DDG pass (must have a 'Urls' column).
  # options     - Hash with :update / :insert / :noproxy flags.
  def initialize(working_dir, input_file, options)
    @working_dir, @input_file, @options = working_dir, input_file, options
    # One output trio per mode; nil when that mode is disabled.
    @output_update = @working_dir + "contact_update.csv" if @options[:update]
    @output_insert = @working_dir + "contact_insert.csv" if @options[:insert]
    @output_employment_update = @working_dir + "contact_employment_update.csv" if @options[:update]
    @output_employment_insert = @working_dir + "contact_employment_insert.csv" if @options[:insert]
    @output_education_update = @working_dir + "contact_education_update.csv" if @options[:update]
    @output_education_insert = @working_dir + "contact_education_insert.csv" if @options[:insert]

    # Seconds between requests when proxyless; also the proxy pool cooldown.
    @cooldown = 20
    @noproxy = options[:noproxy]
    @proxies = ProxyHandler.new(@cooldown) unless @options[:noproxy]
    @headers = get_headers(@input_file)
    # Full output schema for the contact CSVs; appended to the input headers
    # below without duplicating any that already exist.
    @new_headers = ["Contact ID", "LIN ID", "CV TR", "Account Name", "Linkedin Import Status", "First Name", "Last Name", "Email", "LinkedIn Profile", "Candidate ID",
                    "LIN 1st Degree", "Title", "Contact Country", "Contact LIN Sector", "Resume Last Updated", "LIN Import Date", "CV Uploaded",
                    "Employer 1 Title", "Employer Organization Name 1", "Employer 1 Start Date",
                    "Employer 1 End Date", "Employer 1 Location", "Employer 1 Description",
                    "Employer 2 Title", "Employer Organization Name 2", "Employer 2 Start Date",
                    "Employer 2 End Date", "Employer 2 Location", "Employer 2 Description",
                    "Employer 3 Title", "Employer Organization Name 3", "Employer 3 Start Date",
                    "Employer 3 End Date", "Employer 3 Location", "Employer 3 Description",
                    "License or Certification Name 1", "License or Certification Name 2",
                    "License or Certification Credential Type", "Education School 1",
                    "Education Degree Name 1", "Education Degree Date 1",
                    "Education School 2", "Education Degree Name 2",
                    "Education Degree Date 2", "Text Resume"]
    @new_headers.each do |header|
      @headers << header unless @headers.include?(header)
    end
    # 'Urls' was only an intermediate column from the DDG pass.
    @headers.delete('Urls')
    @employment_headers = ["Contact ID", "Employer Name", "Job Title", "Start Date", "End Date", "Location", "LIN ID"]
    @education_headers = ["Contact ID", "School Name", "Major", "Graduation Year", "LIN ID"]
    # Row count via `wc -l`, minus header; assumes a POSIX shell is present.
    @input_length = %x(wc -l "#{@input_file}").split[0].to_i - 1
    I18n.available_locales = [:en]
    # Resume support: skip as many input rows as were already written across
    # both contact output files in a previous partial run.
    if (@output_update && File.exist?(@output_update)) || (@output_insert && File.exist?(@output_insert))
      if @output_update
        update_length = CSV.read(@output_update, headers: true).length
      else
        update_length = 0
      end
      if @output_insert
        insert_length = CSV.read(@output_insert, headers: true).length
      else
        insert_length = 0
      end
      @start = update_length + insert_length
    end
    [@output_insert, @output_update].each do |file|
      if file
        create_file(file) unless File.exist?(file)
      end
    end
    # NOTE(review): unlike the contact files above, these are created
    # unconditionally — presumably create_file_with_headers is a no-op or
    # overwrite for existing files; verify against csv_handlers.rb.
    [@output_employment_update, @output_employment_insert].each do |file|
      if file
        create_file_with_headers(file, @employment_headers)
      end
    end
    [@output_education_update, @output_education_insert].each do |file|
      if file
        create_file_with_headers(file, @education_headers)
      end
    end
  end

  # True when every whitespace-separated chunk of csv_name also appears in
  # lin_name (case-insensitive, punctuation stripped via String#alnum).
  def name_check(lin_name, csv_name)
    csv_array = csv_name.downcase.alnum.split(" ")
    lin_array = lin_name.downcase.alnum.split(" ")
    match = true
    csv_array.each do |chunk|
      unless lin_array.include?(chunk)
        match = false
      end
    end
    return match
  end



  # Normalizes a LinkedIn date string ("2015" or "January 2015") to
  # "YYYY-MM-DD". On a parse failure for a two-word date (e.g. an abbreviated
  # month like "Jan 2015" that %B rejects), the rescue retries with just the
  # year. Returns nil for nil input or anything unparseable.
  def format_date(input_date)
    if input_date.nil?
      return nil
    end
    begin
      date_arr = input_date.split(" ")
      if date_arr.length == 1
        output_date = Date.strptime(input_date, "%Y")
        return output_date.strftime("%Y-%m-%d")
      elsif date_arr.length == 2
        output_date = Date.strptime(input_date, "%B %Y")
        return output_date.strftime("%Y-%m-%d")
      else
        return nil
      end
    rescue
      if date_arr.length == 2
        return format_date(date_arr[1])
      else
        return nil
      end
    end
  end

  # Fetches `url` and checks that the page really belongs to this contact:
  # the profile name must contain the CSV name, and at least one listed
  # position must match the CSV's first employer/title pair.
  # Returns [url, page] on a match, false otherwise.
  # NOTE(review): the rescue paths call `retry` with no attempt counter, so a
  # persistently failing URL can loop forever (999 = LinkedIn's bot-detection
  # status rotates the proxy; 404/403 give up).
  def validate(url, row)
    puts "checking url: #{url}"
    begin
      agent = Mechanize.new

      unless @noproxy
        proxy = @proxies.get_proxy
        agent.set_proxy(proxy.ip, proxy.port, proxy.username, proxy.password)
        agent.user_agent = proxy.user_agent
        puts "proxy: #{proxy.ip}"
      end
      sleep(@cooldown) if @noproxy
      page = agent.get(url)
      puts 'ACCESS GRANTED'

      return false unless page.at_css("#name") && page.css("#experience .positions .position")
      return false unless name_check(page.at_css("#name").text, "#{row['First Name']} #{row['Last Name']}")
      positions = page.css("#experience .positions .position")

      match = false
      positions.each do |position|
        if position.at_css("header .item-title a") && position.at_css("header .item-subtitle")
          # Transliterate to ASCII so accented characters compare equal.
          profile_title = I18n.transliterate(position.at_css("header .item-title a").text).alnum
          profile_employer = I18n.transliterate(position.at_css("header .item-subtitle").text).alnum
          title = I18n.transliterate(row['Employer 1 Title']).alnum
          employer = I18n.transliterate(row['Employer Organization Name 1']).alnum
          if name_check(profile_title, title) && name_check(profile_employer, employer)
            match = true
          end
        end
      end
      proxy.good if proxy
      if match
        return [url, page]
      else
        return false
      end
    rescue StandardError => e
      puts e
      if e.to_s.start_with?('999')
        # LinkedIn flagged this proxy; drop it and retry with another.
        proxy.dead if proxy
        retry
      elsif e.to_s.start_with?('404') || e.to_s.start_with?('403')
        proxy.good if proxy
        return false
      else
        puts e.backtrace
        proxy.used if proxy
        retry
      end
    end
  end

  # Main loop: for each input row, validate its candidate URLs in order and
  # write the scraped (or "not found") result to the update/insert CSVs.
  def start
    count = 0
    CSV.foreach(@input_file, headers: true) do |input_row|
      count += 1
      next if @start && @start >= count
      # NOTE(review): `tries` is only consumed by the commented-out rescue
      # block at the bottom; currently unused.
      tries = @proxies.length unless @noproxy
      puts "lin #{count}/#{@input_length}"
      # begin
      urls = input_row['Urls']
      if urls && urls.include?('http')
        urls = urls.split(', ')
        correct_url, correct_page = nil
        # First URL that validates wins (candidates are ranked by the DDG pass).
        urls.each do |url|
          correct_url, correct_page = validate(url, input_row)
          break if correct_url && correct_page
        end
        if correct_url
          puts "correct page"
          input_row << ["Linkedin Profile", correct_url]
          input_row["Linkedin Import Status"] = 'Profile imported'
          input_row.delete('Urls')
          # Existing Contact ID => "update" outputs; otherwise "insert".
          if input_row['Contact ID'] && input_row['Contact ID'].strip.length > 0
            if @options[:update]
              new_row = scrape_contact(input_row, correct_page, 'update')
              append_to_csv(@output_update, new_row)
              scrape_employment(input_row, correct_page).each do |emp_row|
                append_to_csv(@output_employment_update, emp_row)
              end
              scrape_education(input_row, correct_page).each do |ed_row|
                append_to_csv(@output_education_update, ed_row)
              end
            end
          else
            if @options[:insert]
              new_row = scrape_contact(input_row, correct_page, 'insert')
              append_to_csv(@output_insert, new_row)
              scrape_employment(input_row, correct_page).each do |emp_row|
                append_to_csv(@output_employment_insert, emp_row)
              end
              scrape_education(input_row, correct_page).each do |ed_row|
                append_to_csv(@output_education_insert, ed_row)
              end
            end
          end
        else
          # Had candidate URLs but none validated.
          if @options[:update] && input_row['Contact ID'] && input_row['Contact ID'].strip.length > 0
            input_row << ["Linkedin Profile", nil]
            input_row.delete('Urls')
            input_row["Linkedin Import Status"] = 'Profile not found'
            output_row = create_row(input_row, @headers)
            puts input_row["Linkedin Import Status"]
            append_to_csv(@output_update, output_row)
          elsif @options [:insert]
            input_row << ["Linkedin Profile", nil]
            input_row.delete('Urls')
            input_row["Linkedin Import Status"] = 'Profile not found'
            puts input_row["Linkedin Import Status"]
            output_row = create_row(input_row, @headers)
            append_to_csv(@output_insert, output_row)
          end
        end
      else
        # No candidate URLs at all: pass the row through, keeping whatever
        # status the DDG pass already recorded.
        if @options[:update] && input_row['Contact ID'] && input_row['Contact ID'].strip.length > 0
          input_row << ["Linkedin Profile", nil]
          input_row.delete('Urls')
          puts input_row["Linkedin Import Status"]
          output_row = create_row(input_row, @headers)
          append_to_csv(@output_update, output_row)
        elsif @options [:insert]
          input_row << ["Linkedin Profile", nil]
          input_row.delete('Urls')
          puts input_row["Linkedin Import Status"]
          output_row = create_row(input_row, @headers)
          append_to_csv(@output_insert, output_row)
        end
      end
      # Disabled retry wrapper, kept for reference. NOTE(review): it rescued
      # Exception (too broad — swallows SignalException/SystemExit); if
      # revived, rescue StandardError instead.
      # rescue Exception => msg
      #   tries -= 1
      #   if tries > 0
      #     puts "\n\n"
      #     puts msg
      #     puts 'RETRYING'
      #     puts "\n\n"
      #     if msg.to_s.start_with?("999")
      #       proxy.dead
      #     else
      #       proxy.used
      #     end
      #     retry
      #   else
      #     #append_ddg_row(input_row, msg, nil)
      #     puts msg
      #   end
      # end
    end

  end

end
|