linsc 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +9 -0
- data/.rspec +2 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +21 -0
- data/README.md +40 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/linsc +5 -0
- data/bin/setup +8 -0
- data/data/agents.txt +10 -0
- data/data/proxies.txt +0 -0
- data/data/recruiters.txt +0 -0
- data/lib/linsc.rb +159 -0
- data/lib/linsc/cross_ref.rb +113 -0
- data/lib/linsc/csv_handlers.rb +53 -0
- data/lib/linsc/duck.rb +179 -0
- data/lib/linsc/lin.rb +303 -0
- data/lib/linsc/merger.rb +70 -0
- data/lib/linsc/parsers.rb +320 -0
- data/lib/linsc/proxy.rb +30 -0
- data/lib/linsc/proxy_handler.rb +42 -0
- data/linsc-0.0.1.gem +0 -0
- data/linsc.gemspec +31 -0
- metadata +140 -0
data/lib/linsc/duck.rb
ADDED
@@ -0,0 +1,179 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'mechanize'
|
3
|
+
require 'csv'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'i18n'
|
6
|
+
require_relative 'proxy'
|
7
|
+
require_relative 'proxy_handler'
|
8
|
+
require_relative 'csv_handlers'
|
9
|
+
|
10
|
+
# Core extension used throughout the scrapers.
# TODO(review): move into a namespaced core_extensions module (or a
# refinement) instead of monkey-patching String globally.
class String
  # Returns a copy of the string in which every character that is neither
  # alphanumeric nor whitespace has been replaced by a single space.
  # The receiver is not modified.
  def alnum
    gsub(/[^\p{Alnum}\p{Space}]/u, ' ')
  end
end
|
16
|
+
|
17
|
+
class DuckScraper
|
18
|
+
|
19
|
+
include CSVHandlers
|
20
|
+
|
21
|
+
  # Sets up a DuckDuckGo scraping run.
  #
  # working_dir - run directory (stored for later use)
  # input_file  - CSV of contacts to look up; must have a header row
  # output_file - CSV results are appended to; if it already exists the
  #               run resumes after its last written row
  # options     - hash; only :noproxy is read here (truthy disables proxies)
  def initialize(working_dir, input_file, output_file, options)
    @working_dir, @input_file, @output_file, @noproxy =
    working_dir, input_file, output_file, options[:noproxy]

    # Ensure the two columns this scraper writes are present in the header set.
    @headers = get_headers(@input_file)
    @headers << "Linkedin Import Status" unless @headers.include?("Linkedin Import Status")
    @headers << "Urls" unless @headers.include?("Urls")
    # Data-row count via `wc -l`, minus the header line.
    # NOTE(review): shells out, so this assumes a Unix-like environment and
    # a trailing newline in the file — confirm for target platforms.
    @input_length = %x(wc -l "#{@input_file}").split[0].to_i - 1
    if File.exist?(@output_file)
      # Resume support: rows already written are skipped in find_profiles.
      @start = CSV.read(@output_file, headers: true).length
      puts "resuming from row #{@start}"
    else
      create_file(@output_file)
    end
    @cooldown = 5 # seconds; used as request pause (noproxy) or proxy cooldown
    @proxies = ProxyHandler.new(@cooldown) unless @noproxy
  end
|
38
|
+
|
39
|
+
  # Main loop: for each input CSV row, searches DuckDuckGo for candidate
  # LinkedIn profile URLs and appends an outcome row to the output CSV.
  # Skips rows already written in a previous (resumed) run and rows with
  # insufficient data. Retries a failed row up to `tries` times, rotating
  # proxies between attempts.
  def find_profiles
    count = 0

    CSV.foreach(@input_file, headers: true) do |input_row|
      count += 1
      # Resume support: @start is the number of rows already in the output.
      next if @start && @start >= count
      # One attempt per available proxy, or 3 attempts without proxies.
      tries = @proxies&.length || 3
      puts "ddg #{count}/#{@input_length}"
      begin
        unless sufficient_data?(input_row)
          puts "Insufficient data, skipping"
          append_ddg_row(input_row, "Insufficient Data", nil)
          next
        end
        agent = Mechanize.new

        unless @noproxy
          proxy = @proxies.get_proxy
          agent.set_proxy(proxy.ip, proxy.port, proxy.username, proxy.password)
          agent.user_agent = proxy.user_agent
          puts "proxy: #{proxy.ip}"
        end
        # Without proxies, throttle by sleeping between requests instead.
        sleep(@cooldown) if @noproxy
        query_string = create_query(input_row)
        puts "query string: #{query_string}"
        # Submit the query through DDG's HTML (non-JS) endpoint.
        ddg_page = agent.get('https://www.duckduckgo.com/html')
        search_form = ddg_page.form_with(id: 'search_form_homepage')
        search_form.q = query_string
        results_page = agent.submit(search_form)
        urls = find_results(results_page, input_row)
        if urls.length > 0
          puts "Success! #{urls.length} possible urls found"
          append_ddg_row(input_row, "DDG results found", urls.join(', '))
        else
          puts "no results found"
          append_ddg_row(input_row, "No DDG results found", nil)
        end
        # Mark the proxy healthy after a successful round trip.
        proxy.good if proxy

      rescue StandardError => msg
        tries -= 1
        if tries > 0
          puts "\n\n"
          puts msg
          puts 'RETRYING'
          puts "\n\n"
          proxy.used if proxy
          retry
        else
          # Out of retries: the exception object itself is written into the
          # status column (CSV will stringify it via to_s).
          append_ddg_row(input_row, msg, nil)
          puts msg
        end
      end
    end
  end
|
94
|
+
|
95
|
+
def append_ddg_row(row, status, urls)
|
96
|
+
row << ["Linkedin Import Status", status]
|
97
|
+
row << ["Urls", urls]
|
98
|
+
output_row = create_row(row, @headers)
|
99
|
+
append_to_csv(@output_file, output_row)
|
100
|
+
end
|
101
|
+
|
102
|
+
def sufficient_data?(row)
|
103
|
+
data_presence = 0
|
104
|
+
if row["First Name"] && row["First Name"].alnum.strip != ""
|
105
|
+
data_presence += 1
|
106
|
+
end
|
107
|
+
if row["Last Name"] && row["Last Name"].alnum.strip != ""
|
108
|
+
data_presence += 1
|
109
|
+
end
|
110
|
+
if row["Employer Organization Name 1"] && row["Employer Organization Name 1"].alnum.strip != ""
|
111
|
+
data_presence += 1
|
112
|
+
end
|
113
|
+
if row["Employer 1 Title"] && row["Employer 1 Title"].alnum.strip != ""
|
114
|
+
data_presence += 1
|
115
|
+
end
|
116
|
+
data_presence == 4 ? true : false
|
117
|
+
end
|
118
|
+
|
119
|
+
def find_results(page, row)
|
120
|
+
matches = []
|
121
|
+
full_name = "#{row['First Name']} #{row['Last Name']}".gsub(row["Email"], ' ').alnum.strip
|
122
|
+
if page.css("#links .results_links_deep")
|
123
|
+
results = page.css("#links .results_links_deep")
|
124
|
+
else
|
125
|
+
return matches
|
126
|
+
end
|
127
|
+
results.each do |result|
|
128
|
+
if result.at_css("a.result__a")
|
129
|
+
|
130
|
+
url_text = result.css("a.result__a").text.alnum
|
131
|
+
url = result.at_css('a.result__a')['href']
|
132
|
+
bio = result.css("a.result__snippet").text.alnum || ""
|
133
|
+
valid_url = true
|
134
|
+
short_title = row["Employer 1 Title"].alnum.split.first(2)
|
135
|
+
short_employer = row["Employer Organization Name 1"].alnum.split.first
|
136
|
+
|
137
|
+
if result.css("a.large").text.include?("profiles | LinkedIn")
|
138
|
+
valid_url = false
|
139
|
+
end
|
140
|
+
unless url.include?("linkedin") && (url.include?("/in/") || url.include?("/pub/"))
|
141
|
+
valid_url = false
|
142
|
+
end
|
143
|
+
|
144
|
+
if valid_url && name_check(url_text, full_name)
|
145
|
+
if bio.downcase.include?(short_title.join(' ').downcase) && bio.downcase.include?(short_employer.to_s.downcase)
|
146
|
+
matches.unshift(url)
|
147
|
+
else
|
148
|
+
matches.push(url)
|
149
|
+
end
|
150
|
+
else
|
151
|
+
end
|
152
|
+
end
|
153
|
+
end
|
154
|
+
matches
|
155
|
+
end
|
156
|
+
|
157
|
+
def name_check(lin_name, csv_name)
|
158
|
+
csv_array = csv_name.downcase.split(" ")
|
159
|
+
lin_array = lin_name.downcase.split(" ")
|
160
|
+
match = true
|
161
|
+
csv_array.each do |chunk|
|
162
|
+
unless lin_array.include?(chunk)
|
163
|
+
match = false
|
164
|
+
end
|
165
|
+
end
|
166
|
+
return match
|
167
|
+
end
|
168
|
+
|
169
|
+
def create_query(row)
|
170
|
+
query_parts = [row["First Name"], row["Last Name"], row["Employer 1 Title"],
|
171
|
+
row["Employer Organization Name 1"]]
|
172
|
+
query_parts.collect! do |part|
|
173
|
+
part.gsub!(row["Email"], ' ')
|
174
|
+
part.downcase.alnum.strip
|
175
|
+
end
|
176
|
+
"linkedin #{query_parts.join(' ')}"
|
177
|
+
end
|
178
|
+
|
179
|
+
end
|
data/lib/linsc/lin.rb
ADDED
@@ -0,0 +1,303 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'mechanize'
|
3
|
+
require 'csv'
|
4
|
+
require 'fileutils'
|
5
|
+
require 'i18n'
|
6
|
+
require_relative 'proxy'
|
7
|
+
require_relative 'proxy_handler'
|
8
|
+
require_relative 'csv_handlers'
|
9
|
+
require_relative 'parsers'
|
10
|
+
|
11
|
+
# Core extension duplicated from duck.rb.
# TODO(review): extract into a single shared core_extensions module rather
# than repeating this global String monkey patch in both files.
class String
  # Non-destructive: yields a new string with every non-alphanumeric,
  # non-whitespace character replaced by a space.
  def alnum
    gsub(/[^\p{Alnum}\p{Space}]/u, ' ')
  end
end
|
17
|
+
|
18
|
+
# Null-object monkey patch: lets scraping code call common String and
# Nokogiri-node methods on nil without raising NoMethodError, so a missing
# DOM node propagates as nil (or false for predicates) instead of crashing.
# NOTE(review): this global patch can hide genuine nil bugs elsewhere in
# the process; safe navigation (&.) would be the safer modern idiom.
class NilClass
  # Nokogiri-ish readers — a missing node yields another "missing" value.
  def text; nil; end
  def [](options = {}); nil; end
  def css(options = {}); nil; end
  def at_css(options = {}); nil; end

  # String-ish operations — all no-ops on nil.
  def gsub(a, b); nil; end
  def gsub!(a, b); nil; end
  def slice(a, b, options = {}); nil; end
  def include?(a); false; end
end
|
45
|
+
|
46
|
+
class LinScraper
|
47
|
+
include CSVHandlers
|
48
|
+
include Parsers
|
49
|
+
|
50
|
+
  # Sets up a LinkedIn scraping run: derives the six possible output CSV
  # paths from the requested modes, normalizes the header set, and computes
  # the resume offset from any partially written output files.
  #
  # working_dir - directory output files are created in (path prefix)
  # input_file  - CSV produced by the DDG pass (contains a "Urls" column)
  # options     - hash with :update, :insert and :noproxy flags
  def initialize(working_dir, input_file, options)
    @working_dir, @input_file, @options = working_dir, input_file, options
    # Contact / employment / education outputs, split by update vs insert mode.
    @output_update = @working_dir + "contact_update.csv" if @options[:update]
    @output_insert = @working_dir + "contact_insert.csv" if @options[:insert]
    @output_employment_update = @working_dir + "contact_employment_update.csv" if @options[:update]
    @output_employment_insert = @working_dir + "contact_employment_insert.csv" if @options[:insert]
    @output_education_update = @working_dir + "contact_education_update.csv" if @options[:update]
    @output_education_insert = @working_dir + "contact_education_insert.csv" if @options[:insert]

    @cooldown = 20 # seconds; LinkedIn is throttled far harder than DDG
    @noproxy = options[:noproxy]
    @proxies = ProxyHandler.new(@cooldown) unless @options[:noproxy]
    @headers = get_headers(@input_file)
    # Full set of columns the contact output files must contain.
    @new_headers = ["Contact ID", "LIN ID", "CV TR", "Account Name", "Linkedin Import Status", "First Name", "Last Name", "Email", "LinkedIn Profile", "Candidate ID",
    "LIN 1st Degree", "Title", "Contact Country", "Contact LIN Sector", "Resume Last Updated", "LIN Import Date", "CV Uploaded",
    "Employer 1 Title", "Employer Organization Name 1", "Employer 1 Start Date",
    "Employer 1 End Date", "Employer 1 Location", "Employer 1 Description",
    "Employer 2 Title", "Employer Organization Name 2", "Employer 2 Start Date",
    "Employer 2 End Date", "Employer 2 Location", "Employer 2 Description",
    "Employer 3 Title", "Employer Organization Name 3", "Employer 3 Start Date",
    "Employer 3 End Date", "Employer 3 Location", "Employer 3 Description",
    "License or Certification Name 1", "License or Certification Name 2",
    "License or Certification Credential Type", "Education School 1",
    "Education Degree Name 1", "Education Degree Date 1",
    "Education School 2", "Education Degree Name 2",
    "Education Degree Date 2", "Text Resume"]
    # Merge without duplicating columns already present in the input.
    @new_headers.each do |header|
      @headers << header unless @headers.include?(header)
    end
    # "Urls" was only an intermediate column from the DDG pass.
    @headers.delete('Urls')
    @employment_headers = ["Contact ID", "Employer Name", "Job Title", "Start Date", "End Date", "Location", "LIN ID"]
    @education_headers = ["Contact ID", "School Name", "Major", "Graduation Year", "LIN ID"]
    # Data-row count via `wc -l`, minus the header line.
    # NOTE(review): assumes a Unix-like system and a trailing newline.
    @input_length = %x(wc -l "#{@input_file}").split[0].to_i - 1
    I18n.available_locales = [:en]
    # Resume support: every input row lands in exactly one of the two
    # contact outputs, so their combined length is the resume offset.
    if (@output_update && File.exist?(@output_update)) || (@output_insert && File.exist?(@output_insert))
      if @output_update
        update_length = CSV.read(@output_update, headers: true).length
      else
        update_length = 0
      end
      if @output_insert
        insert_length = CSV.read(@output_insert, headers: true).length
      else
        insert_length = 0
      end
      @start = update_length + insert_length
    end
    # Contact files are only created when absent (to preserve resume data)...
    [@output_insert, @output_update].each do |file|
      if file
        create_file(file) unless File.exist?(file)
      end
    end
    # ...whereas the employment/education files are (re)created each run.
    # NOTE(review): no File.exist? guard here — confirm whether truncating
    # these on resume is intentional.
    [@output_employment_update, @output_employment_insert].each do |file|
      if file
        create_file_with_headers(file, @employment_headers)
      end
    end
    [@output_education_update, @output_education_insert].each do |file|
      if file
        create_file_with_headers(file, @education_headers)
      end
    end
  end
|
113
|
+
|
114
|
+
def name_check(lin_name, csv_name)
|
115
|
+
csv_array = csv_name.downcase.alnum.split(" ")
|
116
|
+
lin_array = lin_name.downcase.alnum.split(" ")
|
117
|
+
match = true
|
118
|
+
csv_array.each do |chunk|
|
119
|
+
unless lin_array.include?(chunk)
|
120
|
+
match = false
|
121
|
+
end
|
122
|
+
end
|
123
|
+
return match
|
124
|
+
end
|
125
|
+
|
126
|
+
|
127
|
+
|
128
|
+
def format_date(input_date)
|
129
|
+
if input_date.nil?
|
130
|
+
return nil
|
131
|
+
end
|
132
|
+
begin
|
133
|
+
date_arr = input_date.split(" ")
|
134
|
+
if date_arr.length == 1
|
135
|
+
output_date = Date.strptime(input_date, "%Y")
|
136
|
+
return output_date.strftime("%Y-%m-%d")
|
137
|
+
elsif date_arr.length == 2
|
138
|
+
output_date = Date.strptime(input_date, "%B %Y")
|
139
|
+
return output_date.strftime("%Y-%m-%d")
|
140
|
+
else
|
141
|
+
return nil
|
142
|
+
end
|
143
|
+
rescue
|
144
|
+
if date_arr.length == 2
|
145
|
+
return format_date(date_arr[1])
|
146
|
+
else
|
147
|
+
return nil
|
148
|
+
end
|
149
|
+
end
|
150
|
+
end
|
151
|
+
|
152
|
+
  # Fetches a candidate profile URL and decides whether it belongs to the
  # contact in `row` by comparing the page's name and position entries
  # against the row's name, title and employer.
  #
  # Returns [url, page] on a confirmed match, false otherwise (including
  # on 404/403). On other errors it rotates/penalizes the proxy and
  # retries.
  # NOTE(review): the retry in the rescue clause is unbounded — a URL that
  # persistently errors (other than 404/403/999 handling below) will loop
  # forever. Consider a bounded attempt counter as in DuckScraper.
  def validate(url, row)
    puts "checking url: #{url}"
    begin
      agent = Mechanize.new

      unless @noproxy
        proxy = @proxies.get_proxy
        agent.set_proxy(proxy.ip, proxy.port, proxy.username, proxy.password)
        agent.user_agent = proxy.user_agent
        puts "proxy: #{proxy.ip}"
      end
      sleep(@cooldown) if @noproxy
      page = agent.get(url)
      puts 'ACCESS GRANTED'

      # NOTE(review): css() returns a NodeSet, which is truthy even when
      # empty, so only the at_css("#name") half of this guard can fail.
      return false unless page.at_css("#name") && page.css("#experience .positions .position")
      return false unless name_check(page.at_css("#name").text, "#{row['First Name']} #{row['Last Name']}")
      positions = page.css("#experience .positions .position")

      # Any one position matching the row's title AND employer confirms it.
      match = false
      positions.each do |position|
        if position.at_css("header .item-title a") && position.at_css("header .item-subtitle")
          # Transliterate to ASCII so accented names compare equal.
          profile_title = I18n.transliterate(position.at_css("header .item-title a").text).alnum
          profile_employer = I18n.transliterate(position.at_css("header .item-subtitle").text).alnum
          title = I18n.transliterate(row['Employer 1 Title']).alnum
          employer = I18n.transliterate(row['Employer Organization Name 1']).alnum
          if name_check(profile_title, title) && name_check(profile_employer, employer)
            match = true
          end
        end
      end
      proxy.good if proxy
      if match
        return [url, page]
      else
        return false
      end
    rescue StandardError => e
      puts e
      if e.to_s.start_with?('999')
        # LinkedIn's 999 status means the proxy is burned: discard it.
        proxy.dead if proxy
        retry
      elsif e.to_s.start_with?('404') || e.to_s.start_with?('403')
        # Page genuinely unavailable — not the proxy's fault.
        proxy.good if proxy
        return false
      else
        puts e.backtrace
        proxy.used if proxy
        retry
      end
    end
  end
|
204
|
+
|
205
|
+
  # Main loop: for each input row, validates the candidate URLs produced by
  # the DDG pass, scrapes the confirmed profile, and appends contact /
  # employment / education rows to the update or insert outputs. Rows with
  # a non-empty "Contact ID" are routed to the update files; all others to
  # the insert files.
  # NOTE(review): the two "no profile" arms (validation failed vs no URLs
  # at all) are near-duplicates and could be extracted into a helper; the
  # error-handling rescue block is commented out, so `tries` is currently
  # unused and any scrape error aborts the whole run.
  def start
    count = 0
    CSV.foreach(@input_file, headers: true) do |input_row|
      count += 1
      # Resume support: skip rows accounted for in existing output files.
      next if @start && @start >= count
      tries = @proxies.length unless @noproxy
      puts "lin #{count}/#{@input_length}"
      # begin
      urls = input_row['Urls']
      if urls && urls.include?('http')
        urls = urls.split(', ')
        # Try candidates in order; validate returns [url, page] or false.
        correct_url, correct_page = nil
        urls.each do |url|
          correct_url, correct_page = validate(url, input_row)
          break if correct_url && correct_page
        end
        if correct_url
          puts "correct page"
          input_row << ["Linkedin Profile", correct_url]
          input_row["Linkedin Import Status"] = 'Profile imported'
          input_row.delete('Urls')
          # Existing Contact ID => update outputs, otherwise insert outputs.
          if input_row['Contact ID'] && input_row['Contact ID'].strip.length > 0
            if @options[:update]
              new_row = scrape_contact(input_row, correct_page, 'update')
              append_to_csv(@output_update, new_row)
              scrape_employment(input_row, correct_page).each do |emp_row|
                append_to_csv(@output_employment_update, emp_row)
              end
              scrape_education(input_row, correct_page).each do |ed_row|
                append_to_csv(@output_education_update, ed_row)
              end
            end
          else
            if @options[:insert]
              new_row = scrape_contact(input_row, correct_page, 'insert')
              append_to_csv(@output_insert, new_row)
              scrape_employment(input_row, correct_page).each do |emp_row|
                append_to_csv(@output_employment_insert, emp_row)
              end
              scrape_education(input_row, correct_page).each do |ed_row|
                append_to_csv(@output_education_insert, ed_row)
              end
            end
          end
        else
          # URLs were present but none validated as this contact.
          if @options[:update] && input_row['Contact ID'] && input_row['Contact ID'].strip.length > 0
            input_row << ["Linkedin Profile", nil]
            input_row.delete('Urls')
            input_row["Linkedin Import Status"] = 'Profile not found'
            output_row = create_row(input_row, @headers)
            puts input_row["Linkedin Import Status"]
            append_to_csv(@output_update, output_row)
          elsif @options [:insert]
            input_row << ["Linkedin Profile", nil]
            input_row.delete('Urls')
            input_row["Linkedin Import Status"] = 'Profile not found'
            puts input_row["Linkedin Import Status"]
            output_row = create_row(input_row, @headers)
            append_to_csv(@output_insert, output_row)
          end
        end
      else
        # No candidate URLs at all: pass the row through with its existing
        # status (set by the DDG pass) and no profile.
        if @options[:update] && input_row['Contact ID'] && input_row['Contact ID'].strip.length > 0
          input_row << ["Linkedin Profile", nil]
          input_row.delete('Urls')
          puts input_row["Linkedin Import Status"]
          output_row = create_row(input_row, @headers)
          append_to_csv(@output_update, output_row)
        elsif @options [:insert]
          input_row << ["Linkedin Profile", nil]
          input_row.delete('Urls')
          puts input_row["Linkedin Import Status"]
          output_row = create_row(input_row, @headers)
          append_to_csv(@output_insert, output_row)
        end
      end
      # rescue Exception => msg
      #   tries -= 1
      #   if tries > 0
      #     puts "\n\n"
      #     puts msg
      #     puts 'RETRYING'
      #     puts "\n\n"
      #     if msg.to_s.start_with?("999")
      #       proxy.dead
      #     else
      #       proxy.used
      #     end
      #     retry
      #   else
      #     #append_ddg_row(input_row, msg, nil)
      #     puts msg
      #   end
      # end
    end

  end
|
302
|
+
|
303
|
+
end
|