linsc 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,179 @@
1
+ require 'rubygems'
2
+ require 'mechanize'
3
+ require 'csv'
4
+ require 'fileutils'
5
+ require 'i18n'
6
+ require_relative 'proxy'
7
+ require_relative 'proxy_handler'
8
+ require_relative 'csv_handlers'
9
+
10
# tuck this away into a core_extensions module
class String
  # Return a copy of the receiver in which every character that is neither
  # alphanumeric nor whitespace has been replaced by a single space.
  # Unicode-aware: accented letters count as alphanumeric.
  def alnum
    gsub(/[^\p{Alnum}\p{Space}]/u, ' ')
  end
end
16
+
17
# Searches DuckDuckGo's HTML endpoint for likely LinkedIn profile URLs for
# each contact in an input CSV and appends a status row (plus any candidate
# URLs) to an output CSV. Supports resuming a partially completed run and
# optional proxy rotation via ProxyHandler.
class DuckScraper

  include CSVHandlers

  # working_dir - base directory of the run (kept for symmetry with LinScraper)
  # input_file  - contacts CSV; must contain name / employer / title columns
  # output_file - results CSV; if it already exists the scrape resumes after
  #               its last written row
  # options     - hash; :noproxy => true disables proxies and sleeps between
  #               requests instead
  def initialize(working_dir, input_file, output_file, options)
    @working_dir, @input_file, @output_file, @noproxy =
      working_dir, input_file, output_file, options[:noproxy]

    @headers = get_headers(@input_file)
    @headers << "Linkedin Import Status" unless @headers.include?("Linkedin Import Status")
    @headers << "Urls" unless @headers.include?("Urls")
    # `wc -l` avoids loading the whole file into memory; minus 1 for the header row.
    @input_length = %x(wc -l "#{@input_file}").split[0].to_i - 1
    if File.exist?(@output_file)
      @start = CSV.read(@output_file, headers: true).length
      puts "resuming from row #{@start}"
    else
      create_file(@output_file)
    end
    @cooldown = 5
    @proxies = ProxyHandler.new(@cooldown) unless @noproxy
  end

  # Main loop: run one DuckDuckGo search per input row, retrying failures up
  # to the number of available proxies (or 3 times when running proxyless).
  def find_profiles
    count = 0

    CSV.foreach(@input_file, headers: true) do |input_row|
      count += 1
      next if @start && @start >= count
      tries = @proxies&.length || 3
      puts "ddg #{count}/#{@input_length}"
      begin
        unless sufficient_data?(input_row)
          puts "Insufficient data, skipping"
          append_ddg_row(input_row, "Insufficient Data", nil)
          next
        end
        agent = Mechanize.new

        unless @noproxy
          proxy = @proxies.get_proxy
          agent.set_proxy(proxy.ip, proxy.port, proxy.username, proxy.password)
          agent.user_agent = proxy.user_agent
          puts "proxy: #{proxy.ip}"
        end
        sleep(@cooldown) if @noproxy
        query_string = create_query(input_row)
        puts "query string: #{query_string}"
        ddg_page = agent.get('https://www.duckduckgo.com/html')
        search_form = ddg_page.form_with(id: 'search_form_homepage')
        search_form.q = query_string
        results_page = agent.submit(search_form)
        urls = find_results(results_page, input_row)
        if urls.length > 0
          puts "Success! #{urls.length} possible urls found"
          append_ddg_row(input_row, "DDG results found", urls.join(', '))
        else
          puts "no results found"
          append_ddg_row(input_row, "No DDG results found", nil)
        end
        proxy.good if proxy

      rescue StandardError => msg
        tries -= 1
        if tries > 0
          puts "\n\n"
          puts msg
          puts 'RETRYING'
          puts "\n\n"
          proxy.used if proxy
          retry
        else
          # Out of retries: record the error itself as the import status.
          append_ddg_row(input_row, msg, nil)
          puts msg
        end
      end
    end
  end

  # Append one output row tagged with the import status and any found URLs.
  def append_ddg_row(row, status, urls)
    row << ["Linkedin Import Status", status]
    row << ["Urls", urls]
    output_row = create_row(row, @headers)
    append_to_csv(@output_file, output_row)
  end

  # True only when all four fields needed to build a useful query are present
  # and contain at least one alphanumeric character.
  def sufficient_data?(row)
    ["First Name", "Last Name",
     "Employer Organization Name 1", "Employer 1 Title"].all? do |field|
      row[field] && row[field].alnum.strip != ""
    end
  end

  # Scan a DuckDuckGo results page for LinkedIn profile links matching the
  # contact's name. Strong matches (title AND employer appear in the result
  # snippet) are placed at the front of the returned array.
  def find_results(page, row)
    matches = []
    full_name = "#{row['First Name']} #{row['Last Name']}"
    # Some exports leak the email address into the name fields; scrub it.
    # Guarded on presence: gsub(nil, ' ') raises TypeError.
    full_name = full_name.gsub(row["Email"], ' ') if row["Email"]
    full_name = full_name.alnum.strip
    # css never returns nil (an empty NodeSet iterates as a no-op), so the
    # old `if page.css(...)` / else-return was dead code; iterate directly.
    page.css("#links .results_links_deep").each do |result|
      next unless result.at_css("a.result__a")

      url_text = result.css("a.result__a").text.alnum
      url = result.at_css('a.result__a')['href']
      bio = result.css("a.result__snippet").text.alnum
      short_title = row["Employer 1 Title"].alnum.split.first(2)
      short_employer = row["Employer Organization Name 1"].alnum.split.first

      # Reject directory listing pages and anything that isn't a profile URL.
      valid_url = !result.css("a.large").text.include?("profiles | LinkedIn") &&
                  url.include?("linkedin") &&
                  (url.include?("/in/") || url.include?("/pub/"))
      next unless valid_url && name_check(url_text, full_name)

      if bio.downcase.include?(short_title.join(' ').downcase) &&
         bio.downcase.include?(short_employer.to_s.downcase)
        matches.unshift(url)
      else
        matches.push(url)
      end
    end
    matches
  end

  # Every word of the CSV name must appear in the scraped link text.
  def name_check(lin_name, csv_name)
    lin_words = lin_name.downcase.split(" ")
    csv_name.downcase.split(" ").all? { |chunk| lin_words.include?(chunk) }
  end

  # Build the "linkedin first last title employer" search string. Uses a
  # non-destructive gsub so the CSV row's own strings are not mutated, and
  # skips the email scrub when no email is present (the original
  # gsub!(nil, ' ') raised TypeError).
  def create_query(row)
    email = row["Email"]
    query_parts = [row["First Name"], row["Last Name"], row["Employer 1 Title"],
                   row["Employer Organization Name 1"]]
    query_parts.collect! do |part|
      part = part.gsub(email, ' ') if email
      part.downcase.alnum.strip
    end
    "linkedin #{query_parts.join(' ')}"
  end

end
@@ -0,0 +1,303 @@
1
+ require 'rubygems'
2
+ require 'mechanize'
3
+ require 'csv'
4
+ require 'fileutils'
5
+ require 'i18n'
6
+ require_relative 'proxy'
7
+ require_relative 'proxy_handler'
8
+ require_relative 'csv_handlers'
9
+ require_relative 'parsers'
10
+
11
# tuck this away into a core_extensions module
class String
  # Non-destructive scrub: every character outside the Unicode alphanumeric
  # and whitespace classes becomes a single space.
  def alnum
    gsub(/[^\p{Alnum}\p{Space}]/u) { ' ' }
  end
end
17
+
18
# WARNING: global monkey-patch of NilClass. Scraped pages frequently yield
# nil where a node or string is expected; these stubs let nil flow through
# the same text/css/gsub call chains without raising NoMethodError
# (`include?` returns false so membership checks on nil simply fail).
# Argument lists mirror the original definitions so arity stays the same.
# Longer term this should be replaced by explicit nil checks or safe
# navigation (&.) at the call sites.
class NilClass
  def text
    nil
  end

  def [](options = {})
    nil
  end

  def css(options = {})
    nil
  end

  def at_css(options = {})
    nil
  end

  def slice(a, b, options = {})
    nil
  end

  def gsub(a, b)
    nil
  end

  def gsub!(a, b)
    nil
  end

  def include?(a)
    false
  end
end
45
+
46
# Visits candidate LinkedIn profile URLs (produced by DuckScraper), validates
# that a page really belongs to the contact, and scrapes contact, employment
# and education data into separate update/insert CSVs depending on whether
# the row already carries a Contact ID.
class LinScraper
  include CSVHandlers
  include Parsers

  # working_dir - directory the output CSVs are written into
  # input_file  - CSV produced by DuckScraper (contains a 'Urls' column)
  # options     - :update / :insert toggle which output files are produced;
  #               :noproxy disables proxy rotation and sleeps instead
  def initialize(working_dir, input_file, options)
    @working_dir, @input_file, @options = working_dir, input_file, options
    @output_update = @working_dir + "contact_update.csv" if @options[:update]
    @output_insert = @working_dir + "contact_insert.csv" if @options[:insert]
    @output_employment_update = @working_dir + "contact_employment_update.csv" if @options[:update]
    @output_employment_insert = @working_dir + "contact_employment_insert.csv" if @options[:insert]
    @output_education_update = @working_dir + "contact_education_update.csv" if @options[:update]
    @output_education_insert = @working_dir + "contact_education_insert.csv" if @options[:insert]

    @cooldown = 20
    @noproxy = options[:noproxy]
    @proxies = ProxyHandler.new(@cooldown) unless @options[:noproxy]
    @headers = get_headers(@input_file)
    @new_headers = ["Contact ID", "LIN ID", "CV TR", "Account Name", "Linkedin Import Status", "First Name", "Last Name", "Email", "LinkedIn Profile", "Candidate ID",
                    "LIN 1st Degree", "Title", "Contact Country", "Contact LIN Sector", "Resume Last Updated", "LIN Import Date", "CV Uploaded",
                    "Employer 1 Title", "Employer Organization Name 1", "Employer 1 Start Date",
                    "Employer 1 End Date", "Employer 1 Location", "Employer 1 Description",
                    "Employer 2 Title", "Employer Organization Name 2", "Employer 2 Start Date",
                    "Employer 2 End Date", "Employer 2 Location", "Employer 2 Description",
                    "Employer 3 Title", "Employer Organization Name 3", "Employer 3 Start Date",
                    "Employer 3 End Date", "Employer 3 Location", "Employer 3 Description",
                    "License or Certification Name 1", "License or Certification Name 2",
                    "License or Certification Credential Type", "Education School 1",
                    "Education Degree Name 1", "Education Degree Date 1",
                    "Education School 2", "Education Degree Name 2",
                    "Education Degree Date 2", "Text Resume"]
    @new_headers.each do |header|
      @headers << header unless @headers.include?(header)
    end
    @headers.delete('Urls')
    @employment_headers = ["Contact ID", "Employer Name", "Job Title", "Start Date", "End Date", "Location", "LIN ID"]
    @education_headers = ["Contact ID", "School Name", "Major", "Graduation Year", "LIN ID"]
    @input_length = %x(wc -l "#{@input_file}").split[0].to_i - 1
    I18n.available_locales = [:en]
    # Resume support: rows already written across both output files.
    # Each length is only read when that file actually exists — the original
    # read @output_update unconditionally and crashed (Errno::ENOENT) when
    # only the insert file was present.
    if (@output_update && File.exist?(@output_update)) || (@output_insert && File.exist?(@output_insert))
      update_length =
        (@output_update && File.exist?(@output_update)) ? CSV.read(@output_update, headers: true).length : 0
      insert_length =
        (@output_insert && File.exist?(@output_insert)) ? CSV.read(@output_insert, headers: true).length : 0
      @start = update_length + insert_length
    end
    [@output_insert, @output_update].each do |file|
      create_file(file) if file && !File.exist?(file)
    end
    # NOTE(review): employment/education files are (re)created every run with
    # no existence check — confirm create_file_with_headers preserves rows on
    # resume, otherwise these lose data when restarting.
    [@output_employment_update, @output_employment_insert].each do |file|
      create_file_with_headers(file, @employment_headers) if file
    end
    [@output_education_update, @output_education_insert].each do |file|
      create_file_with_headers(file, @education_headers) if file
    end
  end

  # Every word of csv_name must appear in lin_name (case-insensitive,
  # punctuation stripped).
  def name_check(lin_name, csv_name)
    lin_words = lin_name.downcase.alnum.split(" ")
    csv_name.downcase.alnum.split(" ").all? { |chunk| lin_words.include?(chunk) }
  end

  # Normalise a LinkedIn date string ("2014" or "March 2014") to YYYY-MM-DD.
  # Two-word dates with an unparseable month (e.g. non-English month names)
  # fall back to the year alone; anything else yields nil.
  def format_date(input_date)
    return nil if input_date.nil?
    begin
      date_arr = input_date.split(" ")
      case date_arr.length
      when 1
        Date.strptime(input_date, "%Y").strftime("%Y-%m-%d")
      when 2
        Date.strptime(input_date, "%B %Y").strftime("%Y-%m-%d")
      end
    rescue StandardError
      date_arr.length == 2 ? format_date(date_arr[1]) : nil
    end
  end

  # Fetch `url` and decide whether the page belongs to the contact in `row`.
  # Returns [url, page] on a confirmed match, false otherwise.
  # 999 responses mean LinkedIn banned the proxy: mark it dead and rotate.
  # 404/403 mean the profile is gone: the proxy is fine, give up on the URL.
  def validate(url, row)
    puts "checking url: #{url}"
    # Bound generic retries so a persistent non-HTTP error (e.g. a parsing
    # bug) cannot loop forever — the original retried unconditionally.
    # 999 retries stay unbounded because each one shrinks the proxy pool.
    tries = @noproxy ? 3 : @proxies.length
    begin
      agent = Mechanize.new

      unless @noproxy
        proxy = @proxies.get_proxy
        agent.set_proxy(proxy.ip, proxy.port, proxy.username, proxy.password)
        agent.user_agent = proxy.user_agent
        puts "proxy: #{proxy.ip}"
      end
      sleep(@cooldown) if @noproxy
      page = agent.get(url)
      puts 'ACCESS GRANTED'

      return false unless page.at_css("#name") && page.css("#experience .positions .position")
      return false unless name_check(page.at_css("#name").text, "#{row['First Name']} #{row['Last Name']}")

      match = page.css("#experience .positions .position").any? do |position|
        position_matches?(position, row)
      end
      proxy.good if proxy
      match ? [url, page] : false
    rescue StandardError => e
      puts e
      if e.to_s.start_with?('999')
        proxy.dead if proxy
        retry
      elsif e.to_s.start_with?('404') || e.to_s.start_with?('403')
        proxy.good if proxy
        false
      else
        puts e.backtrace
        proxy.used if proxy
        tries -= 1
        retry if tries > 0
        false
      end
    end
  end

  # Main loop: for each input row try every candidate URL until one
  # validates, then route scraped output to the update or insert CSVs.
  def start
    count = 0
    CSV.foreach(@input_file, headers: true) do |input_row|
      count += 1
      next if @start && @start >= count
      puts "lin #{count}/#{@input_length}"

      urls = input_row['Urls']
      unless urls && urls.include?('http')
        # No candidate URLs from the DDG pass; preserve the status it wrote.
        append_unmatched(input_row)
        next
      end

      correct_url = correct_page = nil
      urls.split(', ').each do |candidate|
        correct_url, correct_page = validate(candidate, input_row)
        break if correct_url && correct_page
      end

      if correct_url
        record_match(input_row, correct_url, correct_page)
      else
        append_unmatched(input_row, 'Profile not found')
      end
    end
  end

  private

  # True when a single profile <position> block matches the row's first
  # employer title and organisation (after transliteration to ASCII).
  def position_matches?(position, row)
    return false unless position.at_css("header .item-title a") && position.at_css("header .item-subtitle")
    profile_title = I18n.transliterate(position.at_css("header .item-title a").text).alnum
    profile_employer = I18n.transliterate(position.at_css("header .item-subtitle").text).alnum
    title = I18n.transliterate(row['Employer 1 Title']).alnum
    employer = I18n.transliterate(row['Employer Organization Name 1']).alnum
    name_check(profile_title, title) && name_check(profile_employer, employer)
  end

  # Does this row refer to a contact that already exists in the CRM?
  def existing_contact?(row)
    row['Contact ID'] && row['Contact ID'].strip.length > 0
  end

  # Write contact + employment + education rows for a validated profile,
  # routed to the update files for existing contacts and the insert files
  # otherwise (each path only when its option is enabled).
  def record_match(input_row, url, page)
    puts "correct page"
    input_row << ["Linkedin Profile", url]
    input_row["Linkedin Import Status"] = 'Profile imported'
    input_row.delete('Urls')
    if existing_contact?(input_row)
      if @options[:update]
        write_scraped(input_row, page, 'update',
                      @output_update, @output_employment_update, @output_education_update)
      end
    elsif @options[:insert]
      write_scraped(input_row, page, 'insert',
                    @output_insert, @output_employment_insert, @output_education_insert)
    end
  end

  # Append the scraped contact row plus all employment and education rows.
  def write_scraped(input_row, page, mode, contact_file, employment_file, education_file)
    append_to_csv(contact_file, scrape_contact(input_row, page, mode))
    scrape_employment(input_row, page).each { |emp_row| append_to_csv(employment_file, emp_row) }
    scrape_education(input_row, page).each { |ed_row| append_to_csv(education_file, ed_row) }
  end

  # Pass a row through with no confirmed profile. `status`, when given,
  # overwrites the import status; otherwise the DDG status is preserved.
  def append_unmatched(input_row, status = nil)
    input_row << ["Linkedin Profile", nil]
    input_row.delete('Urls')
    input_row["Linkedin Import Status"] = status if status
    puts input_row["Linkedin Import Status"]
    output_row = create_row(input_row, @headers)
    if @options[:update] && existing_contact?(input_row)
      append_to_csv(@output_update, output_row)
    elsif @options[:insert]
      append_to_csv(@output_insert, output_row)
    end
  end
end