linkedin_scraper 0.1.0

@@ -0,0 +1,46 @@
+ = Linkedin-Scraper
+
+ Linkedin Scraper is a gem for finding LinkedIn public profiles. You give it the name and company of the person you are looking for, and it finds the matching LinkedIn profile and scrapes its title, name, location, connections, etc.
+
+ = Installation
+
+ Install the gem from RubyGems:
+
+   gem install linkedin_scraper
+
+ This gem is tested on Ruby versions 1.8.7, 1.9.2 and 1.9.3.
+
+ = Usage
+
+ First, initialize an instance of the Linkedin class like this:
+
+   require 'linkedin_scraper'
+
+   linkedin = LinkedinScraper::Linkedin.new
+
+ This sets up the Mechanize object used for scraping. Now you can feed it a name and a company:
+
+   profile = linkedin.get_profile_data(name: some_name, company: some_company)
+
+ A third option is country: some_country (see the list of supported countries and the example below).
+
+ == List of supported countries
+ 1) United States - country: "us"
+ 2) United Kingdom - country: "uk"
+ 3) Canada - country: "ca"
+ 4) India - country: "in"
+
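+ For example, to look for a profile in a specific country (the name and company here are just placeholder values):
+
+   profile = linkedin.get_profile_data(name: "John Smith", company: "Acme", country: "us")
+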
+ Then you can see the scraped data like this:
+
+   profile.full_name      # the full name of the profile
+   profile.current_title  # the current LinkedIn job title
+   profile.location       # the location of the profile
+   profile.connection     # the number of connections of the profile
+   profile.linkedin_url   # the URL of the profile
+
+ Copyright (c) 2012 Bhushan Lodha, released under the MIT license.
@@ -0,0 +1,42 @@
+ module LinkedinScraper
+
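+   # Mechanize user-agent aliases; one is picked at random for each new agent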
+   USER_AGENTS = ["Linux Firefox", "Linux Konqueror", "Linux Mozilla", "Mac FireFox", "Mac Mozilla", "Mac Safari", "Windows Mozilla"]
+
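+   # Small DuckDuckGo search client used to locate public LinkedIn profile URLs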
+   class DuckDuckGo
+
+     def initialize(options)
+       raise TypeError, "Invalid Arguments" unless options.is_a?(Hash)
+       @query = options[:query]
+       @agent = Mechanize.new { |agent| agent.user_agent_alias = USER_AGENTS.sample }
+     end
+
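+     # Runs the query against DuckDuckGo's d.js endpoint and returns an array of result URLs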
+     def search
+       query_to_url
+       page = @agent.get("http://duckduckgo.com/d.js?q=#{@query}&l=us-en&p=1&s=0")
+       content = page.content.match(/\[.*\]/).to_s
+       raise "No Profile, Profile not found or does not exist" if content.empty?
+       # Each result's 'c' field holds the result URL; drop blanks and Google redirect links
+       output_arr = JSON.parse(content).map { |f| f['c'] }.flatten.uniq
+       output_arr.reject { |f| f.nil? || f.include?("https://encrypted.google.com") }
+     end
+
+     private
+
+     # Percent-encodes the query so it can be embedded in the request URL
+     def query_to_url
+       char_hash = {' ' => '%20', '$' => '%24', '&' => '%26', '`' => '%60', ':' => '%3A', '<' => '%3C', '>' => '%3E', '[' => '%5B', ']' => '%5D', '{' => '%7B', '}' => '%7D', '"' => '%22', '+' => '%2B', '@' => '%40', '/' => '%2F', ';' => '%3B', '=' => '%3D', '?' => '%3F', '\\' => '%5C', '^' => '%5E', '|' => '%7C', '~' => '%7E', '\'' => '%27', ',' => '%2C'}
+       @query = @query.gsub(/%/, '%25')
+       char_hash.each { |k, v| @query = @query.gsub(k, v) }
+     end
+
+   end
+
+ end
@@ -0,0 +1,249 @@
+ module LinkedinScraper
+
+   class Linkedin
+     # Interface for scraping a LinkedIn public profile
+
+     # Full name of the profile, e.g. "John Smith"
+     attr_accessor :full_name
+     # LinkedIn URL of the profile, e.g. "http://www.linkedin.com/pub/in/john+smith"
+     attr_accessor :linkedin_url
+     # Current title of the profile, e.g. "CEO"
+     attr_accessor :current_title
+     # Past title of the profile, e.g. "VP Business"
+     attr_accessor :past_title
+     # Current company of the profile, e.g. "Pajama Labs"
+     attr_accessor :current_company
+     # Current job description as available on LinkedIn
+     attr_accessor :current_job_description
+     # Profile's summary as available on LinkedIn
+     attr_accessor :summary
+     # Total number of the profile's connections, e.g. "44"
+     attr_accessor :connection
+     # Current location of the profile, e.g. "San Francisco"
+     attr_accessor :location
+     # Total number of the profile's recommendations, e.g. "44"
+     attr_accessor :recommendation
+     # All companies, current and past
+     attr_accessor :industry
+     # Experience description, if mentioned
+     attr_accessor :experience
+     # Groups the profile is associated with
+     attr_accessor :groups
+     # Past companies of the profile, e.g. "Google, Inc"
+     attr_accessor :past_company
+     # Schools the profile has attended
+     attr_accessor :education
+     # Purposes for which the profile can be contacted (if provided)
+     attr_accessor :contact_for
+     # Websites associated with the profile
+     attr_accessor :websites
+
+     # Initializes the Mechanize agent
+     def initialize
+       @agent = Mechanize.new { |agent| agent.user_agent_alias = USER_AGENTS.sample }
+     end
+
+     # Get data of the required profile
+     #
+     # _options_ must contain the key :name *Required
+     #   :name - name of the profile/contact, e.g. "John Smith"
+     # _options_ must contain the key :company *Required
+     #   :company - current or past company of the profile/contact, e.g. "Google"
+     # _options_ may contain the key :country *Optional (see the list of supported countries in the Readme)
+     #   :country - preferred country in which to look for the profile
+     #
+     # Raises an error if the profile is not found, does not exist on LinkedIn, or is not a public profile
+     # Raises an error if :name or :company (or both) is missing
+     # Returns _self_
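+     #
+     # Example (the name and company values below are just illustrative):
+     #   scraper = LinkedinScraper::Linkedin.new
+     #   profile = scraper.get_profile_data(name: "John Smith", company: "Acme", country: "us")
+     #   profile.current_title  # scraped current title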
+     def get_profile_data(options)
+       @options = options
+       raise TypeError, "Invalid Arguments" unless options.is_a?(Hash)
+       argument_error("name") unless options.has_key?(:name)
+       argument_error("company") unless options.has_key?(:company)
+       query = build_query
+       duck = LinkedinScraper::DuckDuckGo.new(query: query)
+       results = duck.search
+       raise "No Profile, Profile not found or does not exist" if results.empty?
+       page = get_li_page(results[0])
+       raise "No Profile, Profile not found or does not exist" unless page
+       if verify_profile(@options[:name], @options[:company], page)
+         self.data(page)
+         return self
+       else
+         raise "No Profile, Profile not found or does not exist"
+       end
+     end
+
+     def data(page)
+       self.full_name = li_full_name(page)
+       self.linkedin_url = page.uri.to_s
+       self.current_title = li_current_title(page)
+       self.past_title = li_past_title(page)
+       self.current_company = li_current_companies(page)
+       self.current_job_description = li_current_job_description(page)
+       self.summary = li_summary(page)
+       self.connection = li_connection(page)
+       self.location = li_location(page)
+       self.recommendation = li_recommendation(page)
+       self.industry = li_current_companies(page) + li_past_companies(page)
+       self.experience = li_experience(page)
+       self.groups = li_groups(page)
+       self.past_company = li_past_companies(page)
+       self.education = li_education(page)
+       self.contact_for = li_contact_for(page)
+       self.websites = li_websites(page)
+     end
+
+     private
+
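+     # Builds the site-restricted search query,
+     # e.g. site:us.linkedin.com "John Smith" + "at Acme" for country: "us"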
+     def build_query
+       name = @options[:name] if @options.has_key? :name
+       #title = @options[:title] if @options.has_key? :title
+       company = @options[:company] if @options.has_key? :company
+       subdomain = @options[:country] || "www"
+       if name && company
+         return "site:#{subdomain}.linkedin.com \"#{name}\" + \"at #{company}\""
+       end
+     end
+
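+     # Fetches the candidate URL. If the page is not a public profile (no "member-1" title),
+     # it follows the first link in LinkedIn's "result-set" listing, or returns nil if there is none.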
+     def get_li_page(url)
+       page = nil
+       begin
+         page = @agent.get(url)
+         if page
+           if page.parser.xpath('//*[(@id = "member-1")]//*[contains(concat( " ", @class, " " ), concat( " ", "title", " " ))]').text.empty?
+             unless page.parser.xpath('//*[(@id = "result-set")]//*[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//strong//a').text.empty?
+               return @agent.click(page.link_with(:text => page.parser.xpath('//*[(@id = "result-set")]//*[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//strong//a').text))
+             else
+               return nil
+             end
+           end
+         end
+       rescue StandardError
+         # Network or parsing errors: fall through and return whatever was fetched (possibly nil)
+       end
+       page
+     end
+
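+     # Case-insensitive check that the scraped full name and companies match the requested name and company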
+     def verify_profile(name, company, page)
+       full_name = li_full_name(page)
+       companies = li_current_companies(page) + ' ' + li_past_companies(page)
+       return true if full_name =~ /#{name}/i && companies =~ /#{company}/i
+     end
+
+     def li_past_title(page)
+       stack = []
+       page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "past", " " ))]//li').each do |past|
+         stack << past.text.gsub(/\s+/, " ").downcase.split(' at ')[0]
+       end
+       stack.join(",")
+     end
+
+     def li_full_profile(page)
+       page.parser.xpath('//*[(@id = "content")]').text.gsub(/\s+/, " ").downcase
+     end
+
+     def li_current_job_description(page)
+       page.parser.xpath('//*[(@id = "profile-experience")]//*[contains(concat( " ", @class, " " ), concat( " ", "first", " " ))]').text.gsub(/\s+/, " ").downcase
+     end
+
+     def li_full_name(page)
+       page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "full-name", " " ))]').text.downcase
+     end
+
+     def li_current_companies(page)
+       stack = []
+       page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "current", " " ))]//li').each do |com|
+         stack << com.text.gsub(/\s+/, " ").downcase.split(' at ')[-1]
+       end
+       stack.join(",")
+     end
+
+     def li_current_title(page)
+       stack = []
+       page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "current", " " ))]//li').each do |current|
+         stack << current.text.gsub(/\s+/, " ").downcase.split(' at ')[0]
+       end
+       stack.join(",")
+     end
+
+     def li_past_companies(page)
+       stack = []
+       page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "past", " " ))]//li').each do |com|
+         stack << com.text.gsub(/\s+/, " ").downcase.split(' at ')[-1]
+       end
+       stack.join(",")
+     end
+
+     def li_colleges(page)
+       stack = []
+       page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "summary-education", " " ))]//li').each do |com|
+         stack << com.text.gsub(/\s+/, " ").downcase
+       end
+       stack.join(",")
+     end
+
+     def li_recommendation(page)
+       page.parser.xpath('//dd[(((count(preceding-sibling::*) + 1) = 8) and parent::*)]//strong').text.strip
+     end
+
+     def li_summary(page)
+       page.parser.xpath('//*[(@id = "profile-summary")]').text.gsub(/\s+/, " ").downcase
+     end
+
+     def li_experience(page)
+       page.parser.xpath('//*[(@id = "profile-experience")]').text.gsub(/\s+/, " ").downcase
+     end
+
+     def li_education(page)
+       page.parser.xpath('//*[(@id = "profile-education")]').text.gsub(/\s+/, " ").downcase
+     end
+
+     def li_additional_info(page)
+       page.parser.xpath('//*[(@id = "profile-additional")]').text.gsub(/\s+/, " ").downcase
+     end
+
+     def li_contact_for(page)
+       page.parser.xpath('//*[(@id = "profile-contact")]').text.gsub(/\s+/, " ").downcase
+     end
+
+     def li_connection(page)
+       page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "overview-connections", " " ))]//p').text.gsub(/\s+/, " ").downcase
+     end
+
+     def li_location(page)
+       page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "locality", " " ))]').text.gsub(/\s+/, " ").downcase
+     end
+
+     def li_groups(page)
+       page.parser.xpath('//*[(@id = "pubgroups")]//*[contains(concat( " ", @class, " " ), concat( " ", "org", " " ))]').text.gsub(/\s+/, " ").downcase
+     end
+
+     def li_websites(page)
+       websites = []
+       if page.search(".website").first
+         page.search(".website").each do |site|
+           url = site.at("a")["href"]
+           url = "http://www.linkedin.com" + url
+           # LinkedIn lists these as redirect links; pull the target out of the "url" query parameter
+           url = CGI.parse(URI.parse(url).query)["url"]
+           websites << url
+         end
+         return websites.flatten
+       end
+     end
+
+     def argument_error(argument)
+       raise ArgumentError, "Missing argument :#{argument}"
+     end
+
+   end
+
+ end
@@ -0,0 +1,3 @@
+ module LinkedinScraper
+   VERSION = '0.1.0'
+ end
@@ -0,0 +1,8 @@
+ require 'rubygems'
+ require 'mechanize'
+ require 'json'
+ require 'cgi'
+ require 'uri'
+
+ require "#{File.dirname(File.expand_path(__FILE__))}/linkedin/duck.rb"
+ require "#{File.dirname(File.expand_path(__FILE__))}/linkedin/linkedin.rb"
+ require "#{File.dirname(File.expand_path(__FILE__))}/linkedin/version.rb"
metadata ADDED
@@ -0,0 +1,82 @@
+ --- !ruby/object:Gem::Specification
+ name: linkedin_scraper
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+   prerelease:
+ platform: ruby
+ authors:
+ - Bhushan Lodha
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-08-31 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: mechanize
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: json
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Find linkedin profile based on name and company of profile and scrapes
+   data if profile is found
+ email: bhushanlodha@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - README.rdoc
+ - lib/linkedin/duck.rb
+ - lib/linkedin/linkedin.rb
+ - lib/linkedin/version.rb
+ - lib/linkedin_scraper.rb
+ homepage: https://github.com/bhushanlodha/linkedin_scraper
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.24
+ signing_key:
+ specification_version: 3
+ summary: Find linkedin profiles and scrapes data
+ test_files: []