linsc 0.0.2

@@ -0,0 +1,179 @@
+ require 'rubygems'
+ require 'mechanize'
+ require 'csv'
+ require 'fileutils'
+ require 'i18n'
+ require_relative 'proxy'
+ require_relative 'proxy_handler'
+ require_relative 'csv_handlers'
+
+ # tuck this away into a core_extensions module (see the sketch after this file)
+ class String
+   def alnum
+     gsub(/[^\p{Alnum}\p{Space}]/u, ' ')
+   end
+ end
+
+ class DuckScraper
+
+   include CSVHandlers
+
+   def initialize(working_dir, input_file, output_file, options)
+     @working_dir, @input_file, @output_file, @noproxy =
+       working_dir, input_file, output_file, options[:noproxy]
+
+     @headers = get_headers(@input_file)
+     @headers << "Linkedin Import Status" unless @headers.include?("Linkedin Import Status")
+     @headers << "Urls" unless @headers.include?("Urls")
+     @input_length = %x(wc -l "#{@input_file}").split[0].to_i - 1
+     if File.exist?(@output_file)
+       @start = CSV.read(@output_file, headers: true).length
+       puts "resuming from row #{@start}"
+     else
+       create_file(@output_file)
+     end
+     @cooldown = 5
+     @proxies = ProxyHandler.new(@cooldown) unless @noproxy
+   end
+
+   def find_profiles
+     count = 0
+
+     CSV.foreach(@input_file, headers: true) do |input_row|
+       count += 1
+       next if @start && @start >= count
+       tries = @proxies&.length || 3
+       puts "ddg #{count}/#{@input_length}"
+       begin
+         unless sufficient_data?(input_row)
+           puts "Insufficient data, skipping"
+           append_ddg_row(input_row, "Insufficient Data", nil)
+           next
+         end
+         agent = Mechanize.new
+
+         unless @noproxy
+           proxy = @proxies.get_proxy
+           agent.set_proxy(proxy.ip, proxy.port, proxy.username, proxy.password)
+           agent.user_agent = proxy.user_agent
+           puts "proxy: #{proxy.ip}"
+         end
+         sleep(@cooldown) if @noproxy
+         query_string = create_query(input_row)
+         puts "query string: #{query_string}"
+         ddg_page = agent.get('https://www.duckduckgo.com/html')
+         search_form = ddg_page.form_with(id: 'search_form_homepage')
+         search_form.q = query_string
+         results_page = agent.submit(search_form)
+         urls = find_results(results_page, input_row)
+         if urls.length > 0
+           puts "Success! #{urls.length} possible urls found"
+           append_ddg_row(input_row, "DDG results found", urls.join(', '))
+         else
+           puts "no results found"
+           append_ddg_row(input_row, "No DDG results found", nil)
+         end
+         proxy.good if proxy
+
+       rescue StandardError => msg
+         tries -= 1
+         if tries > 0
+           puts "\n\n"
+           puts msg
+           puts 'RETRYING'
+           puts "\n\n"
+           proxy.used if proxy
+           retry
+         else
+           append_ddg_row(input_row, msg, nil)
+           puts msg
+         end
+       end
+     end
+   end
+
+   def append_ddg_row(row, status, urls)
+     row << ["Linkedin Import Status", status]
+     row << ["Urls", urls]
+     output_row = create_row(row, @headers)
+     append_to_csv(@output_file, output_row)
+   end
+
+   def sufficient_data?(row)
+     ["First Name", "Last Name", "Employer Organization Name 1",
+      "Employer 1 Title"].all? do |field|
+       row[field] && row[field].alnum.strip != ""
+     end
+   end
+
+   def find_results(page, row)
+     matches = []
+     full_name = "#{row['First Name']} #{row['Last Name']}"
+     # row["Email"] may be nil, which String#gsub rejects, so guard the scrub
+     full_name = full_name.gsub(row["Email"], ' ') if row["Email"]
+     full_name = full_name.alnum.strip
+     # NodeSets are always truthy, so test for emptiness rather than nil
+     results = page.css("#links .results_links_deep")
+     return matches if results.empty?
+     results.each do |result|
+       next unless result.at_css("a.result__a")
+
+       url_text = result.css("a.result__a").text.alnum
+       url = result.at_css('a.result__a')['href']
+       bio = result.css("a.result__snippet").text.alnum
+       valid_url = true
+       short_title = row["Employer 1 Title"].alnum.split.first(2)
+       short_employer = row["Employer Organization Name 1"].alnum.split.first
+
+       if result.css("a.large").text.include?("profiles | LinkedIn")
+         valid_url = false
+       end
+       unless url.include?("linkedin") && (url.include?("/in/") || url.include?("/pub/"))
+         valid_url = false
+       end
+
+       if valid_url && name_check(url_text, full_name)
+         # results whose snippet matches title and employer go to the front
+         if bio.downcase.include?(short_title.join(' ').downcase) && bio.downcase.include?(short_employer.to_s.downcase)
+           matches.unshift(url)
+         else
+           matches.push(url)
+         end
+       end
+     end
+     matches
+   end
+
+   def name_check(lin_name, csv_name)
+     csv_array = csv_name.downcase.split(" ")
+     lin_array = lin_name.downcase.split(" ")
+     csv_array.all? { |chunk| lin_array.include?(chunk) }
+   end
+
+   def create_query(row)
+     query_parts = [row["First Name"], row["Last Name"], row["Employer 1 Title"],
+                    row["Employer Organization Name 1"]]
+     query_parts.collect! do |part|
+       # row["Email"] may be nil, which String#gsub rejects, so guard the scrub
+       part = part.gsub(row["Email"], ' ') if row["Email"]
+       part.downcase.alnum.strip
+     end
+     "linkedin #{query_parts.join(' ')}"
+   end
+
+ end
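
The comment at the top of this file suggests moving the String monkey-patch into a core_extensions module. A minimal sketch of that refactor, assuming a hypothetical core_extensions.rb file and module name (neither appears in this gem):

module CoreExtensions
  module StringAlnum
    # Replace every character that is not alphanumeric or whitespace with a space.
    def alnum
      gsub(/[^\p{Alnum}\p{Space}]/u, ' ')
    end
  end
end

# Mix the extension in once at boot instead of reopening String in every file.
String.include CoreExtensions::StringAlnum

"O'Neill-Smith".alnum  # => "O Neill Smith"

Both files could then replace their inline class String blocks with a single require_relative 'core_extensions'.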
@@ -0,0 +1,303 @@
+ require 'rubygems'
+ require 'mechanize'
+ require 'csv'
+ require 'fileutils'
+ require 'i18n'
+ require_relative 'proxy'
+ require_relative 'proxy_handler'
+ require_relative 'csv_handlers'
+ require_relative 'parsers'
+
+ # tuck this away into a core_extensions module
+ class String
+   def alnum
+     gsub(/[^\p{Alnum}\p{Space}]/u, ' ')
+   end
+ end
+
+ # Null-object treatment for nil: the scraping code chains calls such as
+ # page.at_css(...)['href'] on nodes that may be missing, so nil learns to
+ # answer those messages with nil/false instead of raising NoMethodError.
+ class NilClass
+   def text
+     nil
+   end
+
+   def [](options = {})
+     nil
+   end
+
+   def css(options = {})
+     nil
+   end
+
+   def gsub(a, b)
+     nil
+   end
+
+   def at_css(options = {})
+     nil
+   end
+
+   def slice(a, b, options = {})
+     nil
+   end
+
+   def include?(a)
+     false
+   end
+
+   def gsub!(a, b)
+     nil
+   end
+ end
+
+ class LinScraper
+   include CSVHandlers
+   include Parsers
+
+   def initialize(working_dir, input_file, options)
+     @working_dir, @input_file, @options = working_dir, input_file, options
+     @output_update = @working_dir + "contact_update.csv" if @options[:update]
+     @output_insert = @working_dir + "contact_insert.csv" if @options[:insert]
+     @output_employment_update = @working_dir + "contact_employment_update.csv" if @options[:update]
+     @output_employment_insert = @working_dir + "contact_employment_insert.csv" if @options[:insert]
+     @output_education_update = @working_dir + "contact_education_update.csv" if @options[:update]
+     @output_education_insert = @working_dir + "contact_education_insert.csv" if @options[:insert]
+
+     @cooldown = 20
+     @noproxy = @options[:noproxy]
+     @proxies = ProxyHandler.new(@cooldown) unless @noproxy
+     @headers = get_headers(@input_file)
+     @new_headers = ["Contact ID", "LIN ID", "CV TR", "Account Name", "Linkedin Import Status", "First Name", "Last Name", "Email", "LinkedIn Profile", "Candidate ID",
+                     "LIN 1st Degree", "Title", "Contact Country", "Contact LIN Sector", "Resume Last Updated", "LIN Import Date", "CV Uploaded",
+                     "Employer 1 Title", "Employer Organization Name 1", "Employer 1 Start Date",
+                     "Employer 1 End Date", "Employer 1 Location", "Employer 1 Description",
+                     "Employer 2 Title", "Employer Organization Name 2", "Employer 2 Start Date",
+                     "Employer 2 End Date", "Employer 2 Location", "Employer 2 Description",
+                     "Employer 3 Title", "Employer Organization Name 3", "Employer 3 Start Date",
+                     "Employer 3 End Date", "Employer 3 Location", "Employer 3 Description",
+                     "License or Certification Name 1", "License or Certification Name 2",
+                     "License or Certification Credential Type", "Education School 1",
+                     "Education Degree Name 1", "Education Degree Date 1",
+                     "Education School 2", "Education Degree Name 2",
+                     "Education Degree Date 2", "Text Resume"]
+     @new_headers.each do |header|
+       @headers << header unless @headers.include?(header)
+     end
+     @headers.delete('Urls')
+     @employment_headers = ["Contact ID", "Employer Name", "Job Title", "Start Date", "End Date", "Location", "LIN ID"]
+     @education_headers = ["Contact ID", "School Name", "Major", "Graduation Year", "LIN ID"]
+     @input_length = %x(wc -l "#{@input_file}").split[0].to_i - 1
+     I18n.available_locales = [:en]
+     if (@output_update && File.exist?(@output_update)) || (@output_insert && File.exist?(@output_insert))
+       update_length = @output_update ? CSV.read(@output_update, headers: true).length : 0
+       insert_length = @output_insert ? CSV.read(@output_insert, headers: true).length : 0
+       @start = update_length + insert_length
+     end
+     [@output_insert, @output_update].each do |file|
+       create_file(file) if file && !File.exist?(file)
+     end
+     [@output_employment_update, @output_employment_insert].each do |file|
+       create_file_with_headers(file, @employment_headers) if file
+     end
+     [@output_education_update, @output_education_insert].each do |file|
+       create_file_with_headers(file, @education_headers) if file
+     end
+   end
+
+   def name_check(lin_name, csv_name)
+     csv_array = csv_name.downcase.alnum.split(" ")
+     lin_array = lin_name.downcase.alnum.split(" ")
+     csv_array.all? { |chunk| lin_array.include?(chunk) }
+   end
+
+   def format_date(input_date)
+     return nil if input_date.nil?
+     begin
+       date_arr = input_date.split(" ")
+       if date_arr.length == 1
+         Date.strptime(input_date, "%Y").strftime("%Y-%m-%d")
+       elsif date_arr.length == 2
+         Date.strptime(input_date, "%B %Y").strftime("%Y-%m-%d")
+       else
+         nil
+       end
+     rescue
+       # when the month token can't be parsed, fall back to the year alone
+       date_arr.length == 2 ? format_date(date_arr[1]) : nil
+     end
+   end
+
+   def validate(url, row)
+     puts "checking url: #{url}"
+     begin
+       agent = Mechanize.new
+
+       unless @noproxy
+         proxy = @proxies.get_proxy
+         agent.set_proxy(proxy.ip, proxy.port, proxy.username, proxy.password)
+         agent.user_agent = proxy.user_agent
+         puts "proxy: #{proxy.ip}"
+       end
+       sleep(@cooldown) if @noproxy
+       page = agent.get(url)
+       puts 'ACCESS GRANTED'
+
+       # NodeSets are always truthy, so use at_css to check that at least
+       # one position is actually present
+       return false unless page.at_css("#name") && page.at_css("#experience .positions .position")
+       return false unless name_check(page.at_css("#name").text, "#{row['First Name']} #{row['Last Name']}")
+       positions = page.css("#experience .positions .position")
+
+       match = false
+       positions.each do |position|
+         if position.at_css("header .item-title a") && position.at_css("header .item-subtitle")
+           profile_title = I18n.transliterate(position.at_css("header .item-title a").text).alnum
+           profile_employer = I18n.transliterate(position.at_css("header .item-subtitle").text).alnum
+           title = I18n.transliterate(row['Employer 1 Title']).alnum
+           employer = I18n.transliterate(row['Employer Organization Name 1']).alnum
+           if name_check(profile_title, title) && name_check(profile_employer, employer)
+             match = true
+           end
+         end
+       end
+       proxy.good if proxy
+       match ? [url, page] : false
+     rescue StandardError => e
+       puts e
+       if e.to_s.start_with?('999')
+         # LinkedIn answers blocked requests with status 999; retire the proxy
+         proxy.dead if proxy
+         retry
+       elsif e.to_s.start_with?('404') || e.to_s.start_with?('403')
+         proxy.good if proxy
+         false
+       else
+         puts e.backtrace
+         proxy.used if proxy
+         retry
+       end
+     end
+   end
+
+   def start
+     count = 0
+     CSV.foreach(@input_file, headers: true) do |input_row|
+       count += 1
+       next if @start && @start >= count
+       tries = @proxies.length unless @noproxy
+       puts "lin #{count}/#{@input_length}"
+       # begin
+       urls = input_row['Urls']
+       if urls && urls.include?('http')
+         urls = urls.split(', ')
+         correct_url = correct_page = nil
+         urls.each do |url|
+           correct_url, correct_page = validate(url, input_row)
+           break if correct_url && correct_page
+         end
+         if correct_url
+           puts "correct page"
+           input_row << ["LinkedIn Profile", correct_url]
+           input_row["Linkedin Import Status"] = 'Profile imported'
+           input_row.delete('Urls')
+           if input_row['Contact ID'] && input_row['Contact ID'].strip.length > 0
+             if @options[:update]
+               new_row = scrape_contact(input_row, correct_page, 'update')
+               append_to_csv(@output_update, new_row)
+               scrape_employment(input_row, correct_page).each do |emp_row|
+                 append_to_csv(@output_employment_update, emp_row)
+               end
+               scrape_education(input_row, correct_page).each do |ed_row|
+                 append_to_csv(@output_education_update, ed_row)
+               end
+             end
+           else
+             if @options[:insert]
+               new_row = scrape_contact(input_row, correct_page, 'insert')
+               append_to_csv(@output_insert, new_row)
+               scrape_employment(input_row, correct_page).each do |emp_row|
+                 append_to_csv(@output_employment_insert, emp_row)
+               end
+               scrape_education(input_row, correct_page).each do |ed_row|
+                 append_to_csv(@output_education_insert, ed_row)
+               end
+             end
+           end
+         else
+           if @options[:update] && input_row['Contact ID'] && input_row['Contact ID'].strip.length > 0
+             input_row << ["LinkedIn Profile", nil]
+             input_row.delete('Urls')
+             input_row["Linkedin Import Status"] = 'Profile not found'
+             output_row = create_row(input_row, @headers)
+             puts input_row["Linkedin Import Status"]
+             append_to_csv(@output_update, output_row)
+           elsif @options[:insert]
+             input_row << ["LinkedIn Profile", nil]
+             input_row.delete('Urls')
+             input_row["Linkedin Import Status"] = 'Profile not found'
+             puts input_row["Linkedin Import Status"]
+             output_row = create_row(input_row, @headers)
+             append_to_csv(@output_insert, output_row)
+           end
+         end
+       else
+         if @options[:update] && input_row['Contact ID'] && input_row['Contact ID'].strip.length > 0
+           input_row << ["LinkedIn Profile", nil]
+           input_row.delete('Urls')
+           puts input_row["Linkedin Import Status"]
+           output_row = create_row(input_row, @headers)
+           append_to_csv(@output_update, output_row)
+         elsif @options[:insert]
+           input_row << ["LinkedIn Profile", nil]
+           input_row.delete('Urls')
+           puts input_row["Linkedin Import Status"]
+           output_row = create_row(input_row, @headers)
+           append_to_csv(@output_insert, output_row)
+         end
+       end
+       # rescue Exception => msg
+       #   tries -= 1
+       #   if tries > 0
+       #     puts "\n\n"
+       #     puts msg
+       #     puts 'RETRYING'
+       #     puts "\n\n"
+       #     if msg.to_s.start_with?("999")
+       #       proxy.dead
+       #     else
+       #       proxy.used
+       #     end
+       #     retry
+       #   else
+       #     # append_ddg_row(input_row, msg, nil)
+       #     puts msg
+       #   end
+       # end
+     end
+   end
+
+ end
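
For orientation, here is a sketch of how the two stages might be driven end to end. The file names, paths, and options hash are illustrative assumptions; linsc's actual entry point is not part of this diff:

require_relative 'duck_scraper'  # assumed file name for DuckScraper
require_relative 'lin_scraper'   # assumed file name for LinScraper

# working_dir needs a trailing slash: the output paths above are built by
# plain string concatenation (@working_dir + "contact_update.csv").
working_dir = './run/'
options = { update: true, insert: true, noproxy: false }

# Stage 1: query DuckDuckGo for candidate LinkedIn URLs per CSV row.
DuckScraper.new(working_dir, working_dir + 'contacts.csv',
                working_dir + 'ddg_results.csv', options).find_profiles

# Stage 2: validate each candidate URL and scrape the matching profile.
LinScraper.new(working_dir, working_dir + 'ddg_results.csv', options).start

LinScraper consumes the 'Urls' column that DuckScraper writes, so stage 2 reads stage 1's output file.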