linsc 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 340aa6ad3472d2fe72211eaa0dba3a3bede9d01b
4
+ data.tar.gz: 6907b89561b075e9d82a15446cb0ca5ad88816c2
5
+ SHA512:
6
+ metadata.gz: 5fc1790ddf980b7f7068e7bfd41bfa1fe70d183a4c5a5209f7782d8a8aec721843c08b8d2436866a59803d3f4f98ebbfcf7aafad6adb4b83c99d02ee6650d9b2
7
+ data.tar.gz: 9f41a2f305f1b0b170d7411d2fd81230c57869d0b43d6dd8598a4e23f48e9e903371e947dbd91ef128270ed676f520b81db37ce798049c86da9488b5c7b7c3da
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
@@ -0,0 +1,4 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.3.0
4
+ before_install: gem install bundler -v 1.11.2
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in linsc.gemspec
4
+ gemspec
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Dan Molloy
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,40 @@
1
+ # Linsc
2
+
3
+ Welcome to your new gem! In this directory, you'll find the files you need to be able to package up your Ruby library into a gem. Put your Ruby code in the file `lib/linsc.rb`. To experiment with that code, run `bin/console` for an interactive prompt.
4
+
5
+ TODO: Delete this and the text above, and describe your gem
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application's Gemfile:
10
+
11
+ ```ruby
12
+ gem 'linsc'
13
+ ```
14
+
15
+ And then execute:
16
+
17
+ $ bundle
18
+
19
+ Or install it yourself as:
20
+
21
+ $ gem install linsc
22
+
23
+ ## Usage
24
+
25
+ TODO: Write usage instructions here
26
+
27
+ ## Development
28
+
29
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
30
+
31
+ To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
32
+
33
+ ## Contributing
34
+
35
+ Bug reports and pull requests are welcome on GitHub at https://github.com/danmolloy/linsc.
36
+
37
+
38
+ ## License
39
+
40
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
# Register the `spec` task, which runs the RSpec suite.
RSpec::Core::RakeTask.new(:spec)

# A bare `rake` invocation runs the specs.
task default: :spec
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "linsc"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'linsc'
4
+
5
+ Linsc.new
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,10 @@
1
+ Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML like Gecko) Chrome/41.0.2227.1 Safari/537.36
2
+ Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML like Gecko) Chrome/41.0.2228.0 Safari/537.36
3
+ Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246
4
+ Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML like Gecko) Version/7.0.3 Safari/7046A194A
5
+ Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1
6
+ Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML like Gecko) Chrome/40.0.2214.93 Safari/537.36
7
+ Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)
8
+ Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)
9
+ Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0
10
+ Mozilla/5.0 (X11; OpenBSD amd64; rv:28.0) Gecko/20100101 Firefox/28.0
File without changes
File without changes
@@ -0,0 +1,159 @@
1
+ require 'mechanize'
2
+ require 'i18n'
3
+ require 'fileutils'
4
+ require 'csv'
5
+ require 'optparse'
6
+ require 'pathname'
7
+ require_relative './linsc/merger'
8
+ require_relative './linsc/cross_ref'
9
+ require_relative './linsc/csv_handlers'
10
+ require_relative './linsc/duck'
11
+ require_relative './linsc/lin'
12
+
13
+
14
# Command-line entry point for the linsc pipeline.
#
# Constructing an instance does all the work: it parses ARGV, checks the
# Salesforce reference sheet (sf_ref.csv) in the current directory, decides
# whether this is a fresh or a resumed run, and then runs the merge,
# cross-reference, DuckScraper and LinScraper stages. Merger, CrossRef,
# DuckScraper and LinScraper are defined in other files of the gem.
class Linsc
  include CSVHandlers

  # Runs Merger over the working directory, writing to merged.csv.
  # The map pairs header names; which side is the source and which the
  # target is decided by Merger (not visible here).
  def merge
    merge_map = {'First Name' => 'First Name', 'Last Name' => 'Last Name', 'E-mail Address' => 'Email',
                 'Company' => 'Employer Organization Name 1', 'Job Title' => 'Employer 1 Title',
                 'Recruiter' => 'LIN 1st Degree'}
    Merger.new(@working_dir, @merge_path, merge_map).merge
  end

  # Cross-references merged.csv (child) against sf_ref.csv (master) and
  # writes crossref.csv. CrossRef does its work inside its constructor.
  def crossref
    CrossRef.new(input_dir: @working_dir, child_path: @merge_path,
                 master_path: @sf_path, output_path: @crossref_path, options: @options)
  end

  # Hands crossref.csv and ddg.csv to DuckScraper#find_profiles.
  def duck
    DuckScraper.new(@working_dir, @crossref_path, @ddg_path, @options).find_profiles
  end

  # Hands ddg.csv to LinScraper#start.
  def lin
    LinScraper.new(@working_dir, @ddg_path, @options).start
  end

  # Joins the employment and education insert sheets against history_ref.csv
  # on the 'LIN ID' column, writing *_with_ids.csv files, then exits the
  # process. Invoked directly from the -e/--history option callback.
  def map_history_ids
    puts "Mapping ids to history"
    %w[employment education].each do |kind|
      CrossRef.new(input_dir: @working_dir,
                   child_path: @working_dir + "contact_#{kind}_insert.csv",
                   master_path: @working_dir + "history_ref.csv",
                   output_path: @working_dir + "contact_#{kind}_insert_with_ids.csv",
                   options: {:noproxy => false, :update => true, :insert => false},
                   master_lookup_field: 'LIN ID', child_lookup_field: 'LIN ID',
                   master_secondary_lookups: nil, static_values: nil)
    end
    exit
  end

  # Asks the user to confirm a project restart and keeps asking until the
  # answer is y or n (case-insensitive).
  #
  # @param first [Boolean] true shows the full warning, false the short
  #   "unknown input" re-prompt
  # @return [Boolean] true for "y"; false for "n" or when stdin is closed
  def confirm_restart(first=true)
    prompt = if first
               "Are you sure you want to restart the project? This will delete all data except the original inputs.\n(y/n)"
             else
               "Unknown input. Please enter (y/n)"
             end
    loop do
      puts prompt
      # Read from $stdin explicitly: Kernel#gets reads from ARGF, which would
      # try to open any option still left in ARGV (e.g. "-u") as a file,
      # because this runs in the middle of option parsing.
      input = $stdin.gets
      # EOF: never delete data without an explicit "y".
      return false if input.nil?

      case input.chomp.downcase
      when 'y' then return true
      when 'n' then return false
      end
      prompt = "Unknown input. Please enter (y/n)"
    end
  end

  # Deletes every generated file of the project (intermediate sheets and the
  # contact/employment/education update and insert sheets) that exists.
  def restart_project
    files = [@merge_path, @crossref_path, @ddg_path, @working_dir + "contact_update.csv",
             @working_dir + "contact_insert.csv", @working_dir + "contact_employment_update.csv",
             @working_dir + "contact_employment_insert.csv", @working_dir + "contact_education_update.csv",
             @working_dir + "contact_education_insert.csv"]
    files.each do |f|
      File.delete(f) if File.exist?(f)
    end
  end

  # Sets up paths relative to the current directory, parses ARGV and runs
  # the pipeline. Several paths through here terminate the process via exit.
  def initialize
    @options = {:noproxy => false, :update => false, :insert => false}
    @working_dir = Pathname.pwd
    @merge_path = @working_dir + 'merged.csv'
    @sf_path = @working_dir + 'sf_ref.csv'
    @crossref_path = @working_dir + 'crossref.csv'
    @ddg_path = @working_dir + 'ddg.csv'

    parse_options
    verify_sf_headers
    resolve_modes

    # An existing ddg.csv means an earlier run already got past these stages.
    unless File.exist?(@ddg_path)
      merge
      crossref
    end
    duck
    lin
  end

  private

  # Parses ARGV in place, filling @options. The -e, -r and -h callbacks act
  # immediately while parsing is still in progress.
  def parse_options
    OptionParser.new do |opts|
      opts.banner = "Must specify update or insert (or both)"
      opts.on('-u', '--update', 'Tell scraper to fetch fresh data for existing Salesforce records') do
        @options[:update] = true
      end

      opts.on('-i', '--insert', 'Tell scraper to fetch data for new connections not yet in Salesforce') do
        @options[:insert] = true
      end

      opts.on('-n', '--noproxy', 'Do not use any proxies') do
        @options[:noproxy] = true
      end

      opts.on('-e', '--history', 'Map Contact IDs to education/employment histories for new connections') do
        map_history_ids
      end

      opts.on('-r', '--restart', 'Restart the project from beginning with the same inputs. WARNING: This will delete all scraped data.') do
        if confirm_restart(true)
          restart_project
          puts "project files deleted"
        else
          puts "exiting"
          exit
        end
      end

      opts.on('-h', '--help', 'Displays Help') do
        puts opts
        exit
      end
    end.parse!
  end

  # Exits with a message unless sf_ref.csv exists and has every required
  # column.
  def verify_sf_headers
    # Without this guard a missing sheet surfaces as a raw Errno::ENOENT.
    unless File.exist?(@sf_path)
      puts "The SF reference sheet could not be found at #{@sf_path}."
      exit
    end

    sf_headers = get_headers(@sf_path)
    ['LIN ID', 'Email', 'Contact ID'].each do |field|
      next if sf_headers.include?(field)

      puts "The SF reference sheet must include the #{field} field."
      exit
    end
  end

  # Decides the insert/update modes. When ddg.csv exists the run is a resume
  # and the modes are derived from crossref.csv, overriding the command-line
  # flags: blank Contact IDs imply insert, non-blank ones imply update.
  # Otherwise at least one of the flags must have been given.
  def resolve_modes
    if File.exist?(@ddg_path)
      ids = []
      CSV.foreach(@crossref_path, headers: true) { |row| ids << row['Contact ID'] }
      @options[:insert] = ids.any? { |id| id.nil? || id.empty? }
      @options[:update] = ids.any? { |id| id && !id.empty? }
      puts "\nResuming previous scraping. insert: #{@options[:insert]}, update: #{@options[:update]}, using proxies? #{!@options[:noproxy]}"
    else
      unless @options[:update] || @options[:insert]
        puts "Must specify insert or update. See help for details with -h"
        exit
      end
      puts "\nStarting new project. insert: #{@options[:insert]}, update: #{@options[:update]}, using proxies? #{!@options[:noproxy]}"
    end
  end
end
@@ -0,0 +1,113 @@
1
+ require_relative 'csv_handlers'
2
+ require 'securerandom'
3
+
4
# Joins a "child" CSV against a "master" CSV on a lookup column and writes
# the result to +output_path+.
#
# All the work happens in the constructor: the output file is recreated with
# the union of master, child and static-value headers, then every child row
# is looked up in the master sheet. Matched rows are written (merged with the
# master row) when options[:update] is set; unmatched rows are written as new
# rows when options[:insert] is set.
class CrossRef
  include CSVHandlers
  include SecureRandom

  # @param input_dir [Pathname, String] stored but not otherwise used here
  # @param child_path [Pathname, String] CSV whose rows are looked up
  # @param master_path [Pathname, String] CSV that is searched
  # @param output_path [Pathname, String] result CSV; deleted and recreated
  # @param master_lookup_field [String] master column holding the primary key
  # @param child_lookup_field [String] child column holding the key; when it
  #   is 'Email', child values without an '@' are skipped
  # @param master_secondary_lookups [Array<String>, nil] extra master columns
  #   tried when the primary key finds nothing; nil means none
  # @param static_values [Hash, nil] header => value pairs stamped onto
  #   inserted rows; nil means none
  # @param options [Hash] reads :update and :insert
  def initialize(input_dir:, child_path:, master_path:, output_path:,
                 master_lookup_field: 'Email', child_lookup_field: 'Email',
                 master_secondary_lookups: ['Email 2', 'Email 3'],
                 static_values: {'Account Name' => 'Candidates'}, options:)
    @input_dir, @child_path, @master_path, @output_path, @options =
      input_dir, child_path, master_path, output_path, options
    @master_lookup_field = master_lookup_field
    @child_lookup_field = child_lookup_field
    # Callers pass nil for "none"; normalise once so later code can iterate
    # without nil checks.
    @master_secondary_lookups = Array(master_secondary_lookups)
    @static_values = static_values || {}
    @email_key = child_lookup_field == 'Email'

    @headers = get_headers(@master_path)
    @child_headers = get_headers(@child_path)
    (@child_headers + @static_values.keys).each do |header|
      @headers << header unless @headers.include?(header)
    end

    @child_length = count_data_lines(@child_path)
    File.delete(@output_path) if File.exist?(@output_path)
    create_file(@output_path)
    cross_ref
  end

  # Walks the child sheet and appends update/insert rows to the output file,
  # printing progress for every row.
  def cross_ref
    master_data = CSV.read(@master_path, headers: true)
    puts "sorting lookup values"
    # Sort on the same downcased key that is binary-searched below; rows
    # without a key go last. Sorting on the raw (case-sensitive) value would
    # leave the downcased key list unsorted and make bsearch miss matches.
    master_data = master_data.sort_by do |row|
      key = row[@master_lookup_field]&.downcase
      [key ? 0 : 1, key.to_s]
    end
    master_lookup_values = master_data.map { |row| row[@master_lookup_field]&.downcase }

    i = 0
    CSV.foreach(@child_path, headers: true, encoding: 'utf-8') do |child_row|
      i += 1
      puts "email lookup - row: #{i}/#{@child_length}"
      child_lookup_value = child_row[@child_lookup_field]&.downcase
      unless child_lookup_value&.include?('@') || !@email_key ## generalize this
        puts "missing lookup value"
        next
      end

      match_index = find_match_index(master_data, master_lookup_values, child_lookup_value)
      if match_index
        append_to_csv(@output_path, splice_rows(master_data[match_index], child_row)) if @options[:update]
      elsif @options[:insert]
        append_to_csv(@output_path, convert_row(child_row))
      end
    end
  end

  # Merges +child_row+ into +master_row+ (mutating it) and returns a new row
  # laid out on the output headers with every value re-encoded to UTF-8.
  #
  # Non-blank child values overwrite master values; columns the master row
  # lacks are appended. A master 'LIN ID' of 20 characters or fewer (after
  # stripping) is replaced by a fresh random 32-character hex id.
  #
  # @param master_row [CSV::Row]
  # @param child_row [CSV::Row]
  # @return [CSV::Row]
  def splice_rows(master_row, child_row)
    unless master_row['LIN ID'] && master_row['LIN ID'].strip.length > 20
      master_row['LIN ID'] = SecureRandom.hex(16)
    end

    child_row.each do |child_key, child_value|
      if master_row.has_key?(child_key)
        # Keep the master value when the child has nothing to offer.
        master_row[child_key] = child_value if child_value && child_value.strip.length > 0
      else
        master_row << [child_key, child_value]
      end
    end

    master_row_new = CSV::Row.new(@headers, [])
    master_row.each do |key, value|
      master_row_new[key] = value&.encode('utf-8', invalid: :replace, undef: :replace, replace: '#')
    end
    master_row_new
  end

  # Builds a brand-new output row from +child_row+: a fresh random 'LIN ID',
  # the child's values for known headers, then the static values.
  #
  # @param child_row [CSV::Row]
  # @return [CSV::Row]
  def convert_row(child_row)
    master_row = CSV::Row.new(@headers, [])
    master_row['LIN ID'] = SecureRandom.hex(16)
    child_row.each do |child_key, child_value|
      master_row[child_key] = child_value if master_row.has_key?(child_key)
    end
    @static_values.each do |static_key, static_value|
      master_row[static_key] = static_value if master_row.has_key?(static_key)
    end
    master_row
  end

  private

  # Number of lines in +path+ minus the header line, used only for the
  # progress display. Done in Ruby rather than by shelling out to `wc`, so it
  # works without a POSIX shell and the path never reaches a shell.
  def count_data_lines(path)
    [File.foreach(path).count - 1, 0].max
  end

  # Index into +master_data+ of the row matching +child_lookup_value+, or nil.
  # Tries the primary key by binary search first, then scans the secondary
  # lookup columns of every master row.
  def find_match_index(master_data, master_lookup_values, child_lookup_value)
    return nil unless child_lookup_value

    primary = master_lookup_values.bsearch_index do |master_lookup_value|
      # nil keys are sorted last, so steer the search left when one is hit.
      master_lookup_value ? child_lookup_value <=> master_lookup_value : -1
    end
    return primary if primary

    master_data.find_index do |master_row|
      # Compare against the row's values in those columns, not the column
      # names themselves.
      @master_secondary_lookups.any? do |field|
        master_row[field]&.downcase == child_lookup_value
      end
    end
  end
end
@@ -0,0 +1,53 @@
1
# CSV helpers mixed into the pipeline classes. The including file is expected
# to have loaded the +csv+ standard library.
module CSVHandlers
  # Builds a CSV::Row containing +row+'s values for +headers+, in that order.
  #
  # @param row [#[]] anything indexable by header name (CSV::Row, Hash, ...)
  # @param headers [Array<String>] headers of the row to build
  # @param encoding [String, Encoding, nil] when given, every non-nil value
  #   is passed through String#encode with this encoding
  # @return [CSV::Row]
  def create_row(row, headers, encoding = nil)
    values = headers.map do |header|
      value = row[header]
      encoding ? value&.encode(encoding) : value
    end
    CSV::Row.new(headers, values)
  end

  # Appends +row+ to +file+ with every field quoted. The write is attempted
  # up to three times; if all attempts fail a hint is printed and the process
  # exits.
  #
  # @param file [String, Pathname] CSV file to append to
  # @param row [CSV::Row] row to write
  # @return [nil]
  def append_to_csv(file, row)
    tries = 3
    begin
      # Block form closes the handle even when the write raises, so a failed
      # attempt cannot leave the file open for the retry.
      CSV.open(file, "a+", headers: row.headers, force_quotes: true) do |csv|
        csv << row
      end
    rescue StandardError
      tries -= 1
      retry if tries > 0

      puts "Unable to write to file #{file}"
      puts "Make sure the file exists and is not open in any other programs and try again. If that does not work try restarting your computer, or restarting the project with the -r flag."
      exit
    end
    nil
  end

  # Creates +f+ with the including object's @headers as its header line,
  # unless the file already exists.
  #
  # @param f [String, Pathname]
  # @return [nil]
  def create_file(f)
    create_file_with_headers(f, @headers)
  end

  # Creates +f+ containing only a header line (values encoded as UTF-8),
  # unless the file already exists.
  #
  # @param f [String, Pathname]
  # @param headers [Array<String, nil>]
  # @return [nil]
  def create_file_with_headers(f, headers)
    return if File.exist?(f)

    # "w+" creates the file itself, so no separate touch is needed.
    CSV.open(f, "w+") do |csv|
      csv << headers.map { |header| header&.encode('utf-8') }
    end
    nil
  end

  # Reads the header line of +file+.
  #
  # @param file [String, Pathname]
  # @return [Array<String>] the headers, or an empty array for an empty file
  def get_headers(file)
    CSV.open(file, headers: true, return_headers: true) do |csv|
      header_row = csv.shift
      header_row ? header_row.headers : []
    end
  end
end