linkedin_scraper 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
README.rdoc ADDED
@@ -0,0 +1,46 @@
+ = Linkedin-Scraper
+
+ Linkedin Scraper is a gem for finding LinkedIn public profiles. You give it the name and company of the person you are looking for, and it finds the matching LinkedIn profile along with its title, name, location, connections, etc.
+
+ = Installation
+
+ Install the gem from RubyGems:
+
+   gem install linkedin_scraper
+
+ This gem is tested on Ruby versions 1.8.7, 1.9.2 and 1.9.3.
+
+ = Usage
+
+ First, initialize an instance of the Linkedin class like this:
+
+   require 'linkedin_scraper'
+
+   linkedin = LinkedinScraper::Linkedin.new
+
+ This sets up the Mechanize agent used for scraping. Now you can feed it a name and company:
+
+   profile = linkedin.get_profile_data(name: some_name, company: some_company)
+
+ An optional third key is country: some_country (see the list of supported countries below).
+
+ == Supported countries
+ 1) United States - country: "us"
+ 2) United Kingdom - country: "uk"
+ 3) Canada - country: "ca"
+ 4) India - country: "in"
+
+ Then you can read the scraped data like this:
+
+
+   profile.full_name      # the full name of the profile
+
+   profile.current_title  # the current LinkedIn job title
+
+   profile.location       # the location of the profile
+
+   profile.connection     # the number of connections of the profile
+
+   profile.linkedin_url   # the URL of the profile
+
+ Copyright (c) 2012 Bhushan Lodha, released under the MIT license.
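
Putting the README snippets together, a complete lookup reads roughly as follows (a sketch only: "John Smith", "Acme Corp" and the country are placeholder values, the call raises a RuntimeError when no matching public profile is found, and scraped values come back downcased by the scraper):

    require 'linkedin_scraper'

    linkedin = LinkedinScraper::Linkedin.new
    profile  = linkedin.get_profile_data(name: "John Smith", company: "Acme Corp", country: "us")

    puts profile.full_name      # e.g. "john smith"
    puts profile.current_title
    puts profile.location
    puts profile.connection
    puts profile.linkedin_url
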
lib/linkedin/duck.rb ADDED
@@ -0,0 +1,42 @@
+ module LinkedinScraper
+
+   USER_AGENTS = ["Linux Firefox", "Linux Konqueror", "Linux Mozilla", "Mac FireFox", "Mac Mozilla", "Mac Safari", "Windows Mozilla"]
+
+   # Thin wrapper around a DuckDuckGo search, used to find candidate LinkedIn profile URLs
+   class DuckDuckGo
+
+     def initialize(options)
+       raise TypeError, "Invalid Arguments" unless options.is_a? Hash
+       @query = options[:query]
+       @agent = Mechanize.new { |agent| agent.user_agent_alias = USER_AGENTS.sample }
+     end
+
+     # Runs the search and returns an array of result URLs
+     def search
+       query_to_url
+       page = @agent.get("http://duckduckgo.com/d.js?q=#{@query}&l=us-en&p=1&s=0")
+       # The d.js endpoint returns JavaScript; extract the embedded JSON array of results
+       content = page.content.match(/\[.*\]/).to_s
+       raise "No Profile, Profile not found or does not exist" if content.empty?
+       output_arr = JSON.parse(content).map { |f| f['c'] }
+       output_arr = output_arr.flatten.uniq.compact
+       # Drop redirect entries that are not profile links (reject, rather than
+       # deleting from the array while iterating over it)
+       output_arr.reject { |f| f.include?("https://encrypted.google.com") }
+     end
+
+     private
+
+     # Percent-encodes @query in place ('%' first, then the remaining reserved characters)
+     def query_to_url
+       char_hash = {' ' => '%20', '$' => '%24', '&' => '%26', '`' => '%60', ':' => '%3A', '<' => '%3C', '>' => '%3E', '[' => '%5B', ']' => '%5D', '{' => '%7B', '}' => '%7D', '"' => '%22', '+' => '%2B', '@' => '%40', '/' => '%2F', ';' => '%3B', '=' => '%3D', '?' => '%3F', '\\' => '%5C', '^' => '%5E', '|' => '%7C', '~' => '%7E', '\'' => '%27', ',' => '%2C'}
+       @query = @query.gsub(/%/, '%25')
+       char_hash.each { |k, v| @query = @query.gsub(k, v) }
+     end
+
+   end
+
+ end
+
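
The query_to_url helper above percent-encodes the query by hand, escaping '%' first so the later substitutions are not double-encoded. On the Ruby versions this gem supports, the standard library can do the same job; a sketch using CGI.escape (not what the gem uses), which encodes spaces as '+' rather than '%20':

    require 'cgi'

    query = 'site:www.linkedin.com "John Smith" + "at Acme Corp"'
    CGI.escape(query)
    # => "site%3Awww.linkedin.com+%22John+Smith%22+%2B+%22at+Acme+Corp%22"
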
lib/linkedin/linkedin.rb ADDED
@@ -0,0 +1,249 @@
+ module LinkedinScraper
+
+   class Linkedin
+     # Interface for scraping a LinkedIn public profile
+
+     # Full name of the profile, e.g. "John Smith"
+     attr_accessor :full_name
+     # LinkedIn URL of the profile, e.g. "http://www.linkedin.com/pub/in/john+smith"
+     attr_accessor :linkedin_url
+     # Current title of the profile, e.g. "CEO"
+     attr_accessor :current_title
+     # Past title of the profile, e.g. "VP Business"
+     attr_accessor :past_title
+     # Current company of the profile, e.g. "Pajama Labs"
+     attr_accessor :current_company
+     # Current job description as available on LinkedIn
+     attr_accessor :current_job_description
+     # Profile's summary as available on LinkedIn
+     attr_accessor :summary
+     # Total number of the profile's connections, e.g. "44"
+     attr_accessor :connection
+     # Current location of the profile, e.g. "San Francisco"
+     attr_accessor :location
+     # Total number of the profile's recommendations, e.g. "44"
+     attr_accessor :recommendation
+     # All companies, including current and past
+     attr_accessor :industry
+     # Experience description, if mentioned
+     attr_accessor :experience
+     # Groups the profile is associated with
+     attr_accessor :groups
+     # Past companies of the profile, e.g. "Google, Inc"
+     attr_accessor :past_company
+     # Schools the profile has attended
+     attr_accessor :education
+     # Purposes for which the profile can be contacted (if provided)
+     attr_accessor :contact_for
+     # Websites associated with the profile
+     attr_accessor :websites
+
+     # Initializes the Mechanize agent with a random user agent
+     def initialize
+       @agent = Mechanize.new { |agent| agent.user_agent_alias = USER_AGENTS.sample }
+     end
+
+     # Gets the data of the required profile
+     #
+     # _options_ must contain the key :name *Required
+     #   :name - name of the profile/contact, e.g. "John Smith"
+     # _options_ must contain the key :company *Required
+     #   :company - current or past company of the profile/contact, e.g. "Google"
+     # _options_ may contain the key :country *Optional (see the list of supported countries in the README)
+     #   :country - preferred country for the profile to be found in
+     #
+     # Raises an error if the profile is not found, does not exist on LinkedIn, or is not a public profile
+     # Raises an error if :name or :company or both are not given
+     # Returns _self_
+     def get_profile_data(options)
+       @options = options
+       raise TypeError, "Invalid Arguments" unless options.is_a?(Hash)
+       argument_error("name") unless options.has_key?(:name)
+       argument_error("company") unless options.has_key?(:company)
+       query = build_query
+       duck = LinkedinScraper::DuckDuckGo.new(query: query)
+       results = duck.search
+       raise "No Profile, Profile not found or does not exist" if results.empty?
+       page = get_li_page(results[0])
+       raise "No Profile, Profile not found or does not exist" unless page
+       if verify_profile(@options[:name], @options[:company], page)
+         self.data page
+         return self
+       else
+         raise "No Profile, Profile not found or does not exist"
+       end
+     end
+
+     # Populates the accessors from the scraped profile page
+     def data(page)
+       self.full_name = li_full_name(page)
+       self.linkedin_url = page.uri.to_s
+       self.current_title = li_current_title(page)
+       self.past_title = li_past_title(page)
+       self.current_company = li_current_companies(page)
+       self.current_job_description = li_current_job_description(page)
+       self.summary = li_summary(page)
+       self.connection = li_connection(page)
+       self.location = li_location(page)
+       self.recommendation = li_recommendation(page)
+       self.industry = li_current_companies(page) + li_past_companies(page)
+       self.experience = li_experience(page)
+       self.groups = li_groups(page)
+       self.past_company = li_past_companies(page)
+       self.education = li_education(page)
+       self.contact_for = li_contact_for(page)
+       self.websites = li_websites(page)
+     end
+
+     private
+
+     # Builds the DuckDuckGo query, e.g. site:www.linkedin.com "John Smith" + "at Google"
+     def build_query
+       name = @options[:name] if @options.has_key? :name
+       #title = @options[:title] if @options.has_key? :title
+       company = @options[:company] if @options.has_key? :company
+       subdomain = @options[:country] || "www"
+       if name && company
+         return "site:#{subdomain}.linkedin.com \"#{name}\" + \"at #{company}\""
+       end
+     end
+
+     # Fetches the LinkedIn page for a result URL; returns nil if no profile page can be reached
+     def get_li_page(url)
+       page = nil
+       begin
+         page = @agent.get(url)
+         if page
+           # If the page is not a profile (no member title), follow the first link in a result set
+           if page.parser.xpath('//*[(@id = "member-1")]//*[contains(concat( " ", @class, " " ), concat( " ", "title", " " ))]').text.empty?
+             unless page.parser.xpath('//*[(@id = "result-set")]//*[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//strong//a').text.empty?
+               return @agent.click(page.link_with(:text => page.parser.xpath('//*[(@id = "result-set")]//*[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//strong//a').text))
+             else
+               return nil
+             end
+           end
+         end
+       rescue StandardError
+         # Any fetch error is treated as "profile page not found"
+       end
+       return page
+     end
+
+     # Checks that the scraped page actually matches the requested name and company
+     def verify_profile(name, company, page)
+       full_name = li_full_name(page)
+       companies = li_current_companies(page) + ' ' + li_past_companies(page)
+       return true if full_name =~ /#{Regexp.escape(name)}/i && companies =~ /#{Regexp.escape(company)}/i
+     end
+
+     def li_past_title(page)
+       stack = []
+       page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "past", " " ))]//li').each do |past|
+         stack << past.text.gsub(/\s+/, " ").downcase.split(' at ')[0]
+       end
+       stack.join(",")
+     end
+
+     def li_full_profile(page)
+       page.parser.xpath('//*[(@id = "content")]').text.gsub(/\s+/, " ").downcase
+     end
+
+     def li_current_job_description(page)
+       page.parser.xpath('//*[(@id = "profile-experience")]//*[contains(concat( " ", @class, " " ),
+         concat( " ", "first", " " ))]').text.gsub(/\s+/, " ").downcase
+     end
+
+     def li_full_name(page)
+       page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "full-name", " " ))]').text.downcase
+     end
+
+     def li_current_companies(page)
+       stack = []
+       page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "current", " " ))]//li').each do |com|
+         stack << com.text.gsub(/\s+/, " ").downcase.split(' at ')[-1]
+       end
+       stack.join(",")
+     end
+
+     def li_current_title(page)
+       stack = []
+       page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "current", " " ))]//li').each do |current|
+         stack << current.text.gsub(/\s+/, " ").downcase.split(' at ')[0]
+       end
+       stack.join(",")
+     end
+
+     def li_past_companies(page)
+       stack = []
+       page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "past", " " ))]//li').each do |com|
+         stack << com.text.gsub(/\s+/, " ").downcase.split(' at ')[-1]
+       end
+       stack.join(",")
+     end
+
+     def li_colleges(page)
+       stack = []
+       page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "summary-education", " " ))]//li').each do |com|
+         stack << com.text.gsub(/\s+/, " ").downcase
+       end
+       stack.join(",")
+     end
+
+     def li_recommendation(page)
+       page.parser.xpath('//dd[(((count(preceding-sibling::*) + 1) = 8) and parent::*)]//strong').text.strip
+     end
+
+     def li_summary(page)
+       page.parser.xpath('//*[(@id = "profile-summary")]').text.gsub(/\s+/, " ").downcase
+     end
+
+     def li_experience(page)
+       page.parser.xpath('//*[(@id = "profile-experience")]').text.gsub(/\s+/, " ").downcase
+     end
+
+     def li_education(page)
+       page.parser.xpath('//*[(@id = "profile-education")]').text.gsub(/\s+/, " ").downcase
+     end
+
+     def li_additional_info(page)
+       page.parser.xpath('//*[(@id = "profile-additional")]').text.gsub(/\s+/, " ").downcase
+     end
+
+     def li_contact_for(page)
+       page.parser.xpath('//*[(@id = "profile-contact")]').text.gsub(/\s+/, " ").downcase
+     end
+
+     def li_connection(page)
+       page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "overview-connections", " " ))]//p').text.gsub(/\s+/, " ").downcase
+     end
+
+     def li_location(page)
+       page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "locality", " " ))]').text.gsub(/\s+/, " ").downcase
+     end
+
+     def li_groups(page)
+       page.parser.xpath('//*[(@id = "pubgroups")]//*[contains(concat( " ", @class, " " ), concat( " ", "org", " " ))]').text.gsub(/\s+/, " ").downcase
+     end
+
+     # Extracts profile websites; LinkedIn wraps them in redirect links, so pull the
+     # real URL out of the redirect's "url" query parameter
+     def li_websites(page)
+       websites = []
+       if page.search(".website").first
+         page.search(".website").each do |site|
+           url = site.at("a")["href"]
+           url = "http://www.linkedin.com" + url
+           url = CGI.parse(URI.parse(url).query)["url"]
+           websites << url
+         end
+         return websites.flatten
+       end
+     end
+
+     def argument_error(argument)
+       raise "Argument Error, missing argument :#{argument}"
+     end
+
+   end
+
+ end
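
Every failure the gem itself signals is a raise: a TypeError for a non-Hash argument, and a RuntimeError for a missing :name/:company key or when no matching public profile is found. Callers therefore typically wrap the lookup; a minimal sketch, assuming the gem and its dependencies are loaded and using placeholder values:

    begin
      profile = LinkedinScraper::Linkedin.new.get_profile_data(name: "John Smith", company: "Acme Corp")
      puts "#{profile.full_name} - #{profile.current_title}"
    rescue TypeError, RuntimeError => e
      warn "LinkedIn lookup failed: #{e.message}"
    end
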
lib/linkedin/version.rb ADDED
@@ -0,0 +1,3 @@
+ module LinkedinScraper
+   VERSION = '0.1.0'
+ end
lib/linkedin_scraper.rb ADDED
@@ -0,0 +1,8 @@
+ require 'rubygems'
+ require 'mechanize'
+ require 'json'
+ require 'cgi'
+
+ require "#{File.dirname(File.expand_path(__FILE__))}/linkedin/duck.rb"
+ require "#{File.dirname(File.expand_path(__FILE__))}/linkedin/linkedin.rb"
+ require "#{File.dirname(File.expand_path(__FILE__))}/linkedin/version.rb"
metadata ADDED
@@ -0,0 +1,82 @@
+ --- !ruby/object:Gem::Specification
+ name: linkedin_scraper
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ prerelease:
+ platform: ruby
+ authors:
+ - Bhushan Lodha
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2012-08-31 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: mechanize
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ - !ruby/object:Gem::Dependency
+   name: json
+   requirement: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     none: false
+     requirements:
+     - - ! '>='
+       - !ruby/object:Gem::Version
+         version: '0'
+ description: Find linkedin profile based on name and company of profile and scrapes
+   data if profile is found
+ email: bhushanlodha@gmail.com
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - README.rdoc
+ - lib/linkedin/duck.rb
+ - lib/linkedin/linkedin.rb
+ - lib/linkedin/version.rb
+ - lib/linkedin_scraper.rb
+ homepage: https://github.com/bhushanlodha/linkedin_scraper
+ licenses: []
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   none: false
+   requirements:
+   - - ! '>='
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 1.8.24
+ signing_key:
+ specification_version: 3
+ summary: Find linkedin profiles and scrapes data
+ test_files: []
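
The packaged .gemspec file itself is not part of this diff. A sketch of a specification consistent with the metadata above (field values are taken from the metadata; the filename and exact layout are assumptions):

    # linkedin_scraper.gemspec (hypothetical reconstruction from the gem metadata)
    Gem::Specification.new do |s|
      s.name        = 'linkedin_scraper'
      s.version     = '0.1.0'
      s.authors     = ['Bhushan Lodha']
      s.email       = 'bhushanlodha@gmail.com'
      s.homepage    = 'https://github.com/bhushanlodha/linkedin_scraper'
      s.summary     = 'Find linkedin profiles and scrapes data'
      s.description = 'Find linkedin profile based on name and company of profile and scrapes data if profile is found'
      s.files       = ['README.rdoc', 'lib/linkedin/duck.rb', 'lib/linkedin/linkedin.rb',
                       'lib/linkedin/version.rb', 'lib/linkedin_scraper.rb']
      s.add_runtime_dependency 'mechanize'
      s.add_runtime_dependency 'json'
    end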