linkedin_scraper 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +46 -0
- data/lib/linkedin/duck.rb +42 -0
- data/lib/linkedin/linkedin.rb +249 -0
- data/lib/linkedin/version.rb +3 -0
- data/lib/linkedin_scraper.rb +8 -0
- metadata +82 -0
data/README.rdoc
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
= Linkedin-Scraper
|
2
|
+
|
3
|
+
Linkedin Scraper is a gem for finding public LinkedIn profiles. You give it the name and company of the person you are looking for; it finds the matching LinkedIn profile and scrapes its name, title, location, number of connections, etc.
|
4
|
+
|
5
|
+
= Installation
|
6
|
+
|
7
|
+
Install the gem from RubyGems:
|
8
|
+
|
9
|
+
gem install linkedin_scraper
|
10
|
+
|
11
|
+
This gem is tested on Ruby versions 1.8.7, 1.9.2 and 1.9.3.
|
12
|
+
|
13
|
+
= Usage
|
14
|
+
|
15
|
+
First, initialize an instance of the Linkedin class like this:
|
16
|
+
|
17
|
+
require 'linkedin_scraper'
|
18
|
+
|
19
|
+
linkedin = LinkedinScraper::Linkedin.new
|
20
|
+
|
21
|
+
This sets the mechanize object for scraping. Now you can feed name and companies:
|
22
|
+
|
23
|
+
profile = linkedin.get_profile_data(name: some_name, company: some_company)
|
24
|
+
|
25
|
+
An optional third key is country: some_country (see the list of supported countries below).
|
26
|
+
|
27
|
+
# List of supported countries
|
28
|
+
1) United States - country: "us"
|
29
|
+
2) United Kingdom - country: "uk"
|
30
|
+
3) Canada - country: "ca"
|
31
|
+
4) India - country: "in"
|
32
|
+
|
33
|
+
Then you can see the scraped data like this:
|
34
|
+
|
35
|
+
|
36
|
+
profile.full_name #the full name of the profile
|
37
|
+
|
38
|
+
profile.current_title #the current linkedin job title
|
39
|
+
|
40
|
+
profile.location #the location of the profile
|
41
|
+
|
42
|
+
profile.connection # number of connections of the profile
|
43
|
+
|
44
|
+
profile.linkedin_url #url of the profile
|
45
|
+
|
46
|
+
Copyright (c) 2012 Bhushan Lodha, released under the MIT license.
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module LinkedinScraper

  # Mechanize user-agent aliases. One is picked at random for every agent that
  # is created.
  USER_AGENTS = ["Linux Firefox", "Linux Konqueror", "Linux Mozilla", "Mac FireFox", "Mac Mozilla", "Mac Safari", "Windows Mozilla"].freeze

  # Thin client around DuckDuckGo's +d.js+ endpoint, used to turn a search
  # query into a list of result URLs.
  class DuckDuckGo

    # Characters that are percent-encoded before the query is placed in the
    # search URL. '%' is part of the table so that the whole query can be
    # encoded in a single pass without re-encoding already produced escapes.
    URL_ESCAPES = {
      '%' => '%25', ' ' => '%20', '$' => '%24', '&' => '%26', '`' => '%60',
      ':' => '%3A', '<' => '%3C', '>' => '%3E', '[' => '%5B', ']' => '%5D',
      '{' => '%7B', '}' => '%7D', '"' => '%22', '+' => '%2B', '@' => '%40',
      '/' => '%2F', ';' => '%3B', '=' => '%3D', '?' => '%3F', '\\' => '%5C',
      '^' => '%5E', '|' => '%7C', '~' => '%7E', '\'' => '%27', ',' => '%2C'
    }.freeze

    # Matches any single character listed in URL_ESCAPES.
    ESCAPE_PATTERN = Regexp.union(URL_ESCAPES.keys)

    # Links to this host are dropped from the result list.
    EXCLUDED_LINK = "https://encrypted.google.com"

    # _options_ must be a Hash; the search text is read from its :query key.
    #
    # Raises TypeError if _options_ is not a Hash.
    def initialize(options)
      raise TypeError, "Invalid Arguments" unless options.is_a?(Hash)
      @query = options[:query]
      @agent = Mechanize.new { |agent| agent.user_agent_alias = USER_AGENTS.sample }
    end

    # Runs the query and returns an Array of unique result URLs (the 'c'
    # field of every entry in the JSON array embedded in the response).
    # Entries without a URL and links to EXCLUDED_LINK are removed.
    #
    # Raises RuntimeError if the response contains no JSON array.
    def search
      page = @agent.get("http://duckduckgo.com/d.js?q=#{query_to_url}&l=us-en&p=1&s=0")
      # The payload is a JSON array wrapped in a JavaScript call; grab the
      # outermost [...] span. to_s turns a failed match (nil) into "".
      content = page.content.match(/\[.*\]/).to_s
      raise "No Profile, Profile not found or does not exist" if content.empty?
      links = JSON.parse(content).map { |result| result['c'] }.flatten.uniq
      # reject builds a new array, so no element is skipped the way it would
      # be when deleting from the array that is being iterated.
      links.reject { |link| link.nil? || link.include?(EXCLUDED_LINK) }
    end

    private

    # Returns the percent-encoded form of the query. @query itself is left
    # untouched so that calling #search repeatedly never double-encodes it.
    def query_to_url
      @query.to_s.gsub(ESCAPE_PATTERN) { |char| URL_ESCAPES[char] }
    end

  end

end
|
42
|
+
|
@@ -0,0 +1,249 @@
|
|
1
|
+
module LinkedinScraper
|
2
|
+
|
3
|
+
class Linkedin
|
4
|
+
# Interface for scraping linkedin public profile
|
5
|
+
|
6
|
+
# Full name of profile e.g "John Smith"
|
7
|
+
attr_accessor :full_name
|
8
|
+
# Linkedin url of profile e.g "http://www.linkedin.com/pub/in/john+smith"
|
9
|
+
attr_accessor :linkedin_url
|
10
|
+
# Current title of profile e.g "Ceo"
|
11
|
+
attr_accessor :current_title
|
12
|
+
# Past title of profile e.g "VP Business"
|
13
|
+
attr_accessor :past_title
|
14
|
+
# Current company of profile e.g "Pajama Labs"
|
15
|
+
attr_accessor :current_company
|
16
|
+
# Current job description as available on linkedin
|
17
|
+
attr_accessor :current_job_description
|
18
|
+
# Profile's summary as available on linkedin
|
19
|
+
attr_accessor :summary
|
20
|
+
# Total number of profile's connection e.g "44"
|
21
|
+
attr_accessor :connection
|
22
|
+
# Current location of profile "San Fransisco"
|
23
|
+
attr_accessor :location
|
24
|
+
# Total number of profile's recommendation e.g "44"
|
25
|
+
attr_accessor :recommendation
|
26
|
+
# All companies including current and past
|
27
|
+
attr_accessor :industry
|
28
|
+
# Experience description if mentioned
|
29
|
+
attr_accessor :experience
|
30
|
+
# Groups a profile is associated with
|
31
|
+
attr_accessor :groups
|
32
|
+
# Past companies of profile e.g "Google, Inc"
|
33
|
+
attr_accessor :past_company
|
34
|
+
# Schools profile has attended
|
35
|
+
attr_accessor :education
|
36
|
+
# Purposes for which profile can be contacted to (if provided)
|
37
|
+
attr_accessor :contact_for
|
38
|
+
# Websites associated with Profile
|
39
|
+
attr_accessor :websites
|
40
|
+
|
41
|
+
# Initializes mechnaize
|
42
|
+
def initialize
|
43
|
+
@agent = Mechanize.new { |agent| agent.user_agent_alias = USER_AGENTS.sample }
|
44
|
+
end
|
45
|
+
|
46
|
+
# Get data of the required profile
|
47
|
+
#
|
48
|
+
# _options_ contains the key :name *Required
|
49
|
+
# :name - name of profile/contact e.g "John Smith"
|
50
|
+
# _options_ contains the key :company *Required
|
51
|
+
# :company - current or past company of profile/contact e.g "Google"
|
52
|
+
# _options_ contains the key :country *Optional (see list of supported countries in Readme)
|
53
|
+
# :country - preferred country of profile to be found in
|
54
|
+
#
|
55
|
+
# Raises error if profile is not found or does not exist in linkedin or is not public profile
|
56
|
+
# Raises error if :name or :company or both are not defined
|
57
|
+
# Returns _self_
|
58
|
+
def get_profile_data options
|
59
|
+
@options = options
|
60
|
+
raise "TypeError", "Invalid Arguments" unless options.is_a?(Hash)
|
61
|
+
argument_error("name") unless options.has_key?(:name)
|
62
|
+
argument_error("company") unless options.has_key?(:company)
|
63
|
+
query = build_query
|
64
|
+
duck = LinkedinScraper::DuckDuckGo.new(query: query)
|
65
|
+
results = duck.search
|
66
|
+
raise "No Profile, Profile not found or does not exist" if results.empty?
|
67
|
+
page = get_li_page(results[0])
|
68
|
+
raise "No Profile, Profile not found or does not exist" unless page
|
69
|
+
if verify_profile(@options[:name], @options[:company], page)
|
70
|
+
self.data page
|
71
|
+
return self
|
72
|
+
else
|
73
|
+
raise "No Profile, Profile not found or does not exist"
|
74
|
+
end
|
75
|
+
end
|
76
|
+
|
77
|
+
def data page
|
78
|
+
self.full_name = li_full_name(page)
|
79
|
+
self.linkedin_url = page.uri.to_s
|
80
|
+
self.current_title = li_current_title(page)
|
81
|
+
self.past_title = li_past_title(page)
|
82
|
+
self.current_company = li_current_companies(page)
|
83
|
+
self.current_job_description = li_current_job_description(page)
|
84
|
+
self.summary = li_summary(page)
|
85
|
+
self.connection = li_connection(page)
|
86
|
+
self.location = li_location(page)
|
87
|
+
self.recommendation = li_recommendation(page)
|
88
|
+
self.industry = li_current_companies(page) + li_past_companies(page)
|
89
|
+
self.experience = li_experience(page)
|
90
|
+
self.groups = li_groups(page)
|
91
|
+
self.past_company = li_past_companies(page)
|
92
|
+
self.education = li_education(page)
|
93
|
+
self.contact_for = li_contact_for(page)
|
94
|
+
self.websites = li_websites(page)
|
95
|
+
end
|
96
|
+
|
97
|
+
private
|
98
|
+
|
99
|
+
def build_query
|
100
|
+
name = @options[:name] if @options.has_key? :name
|
101
|
+
#title = @options[:title] if @options.has_key? :title
|
102
|
+
company = @options[:company] if @options.has_key? :company
|
103
|
+
subdomain = @options[:country] || "www"
|
104
|
+
if name && company
|
105
|
+
return "site:#{subdomain}.linkedin.com \"#{name}\" + \"at #{company}\""
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
def get_li_page(url)
|
110
|
+
page = nil
|
111
|
+
begin
|
112
|
+
page = @agent.get(url)
|
113
|
+
if page
|
114
|
+
if page.parser.xpath('//*[(@id = "member-1")]//*[contains(concat( " ", @class, " " ), concat( " ", "title", " " ))]').text.empty?
|
115
|
+
unless page.parser.xpath('//*[(@id = "result-set")]//*[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//strong//a').text.empty?
|
116
|
+
return @agent.click(page.link_with(:text => page.parser.xpath('//*[(@id = "result-set")]//*[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//strong//a').text))
|
117
|
+
else
|
118
|
+
return nil
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
122
|
+
rescue Exception => e
|
123
|
+
end
|
124
|
+
return page
|
125
|
+
end
|
126
|
+
|
127
|
+
def verify_profile(name, company, page)
|
128
|
+
full_name = li_full_name(page)
|
129
|
+
companies = li_current_companies(page)+' '+li_past_companies(page)
|
130
|
+
return true if full_name=~/#{name}/i && companies=~/#{company}/i
|
131
|
+
end
|
132
|
+
|
133
|
+
def li_past_title(page)
|
134
|
+
stack = []
|
135
|
+
page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "past", " " ))]//li').each do |past|
|
136
|
+
stack << past.text.gsub(/\s+/, " ").downcase.split(' at ')[0]
|
137
|
+
end
|
138
|
+
stack.join(",")
|
139
|
+
end
|
140
|
+
|
141
|
+
def li_full_profile(page)
|
142
|
+
page.parser.xpath('//*[(@id = "content")]').text.gsub(/\s+/, " ").downcase
|
143
|
+
end
|
144
|
+
|
145
|
+
def li_current_job_description(page)
|
146
|
+
page.parser.xpath('//*[(@id = "profile-experience")]//*[contains(concat( " ", @class, " " ),
|
147
|
+
concat( " ", "first", " " ))]').text.gsub(/\s+/, " ").downcase
|
148
|
+
end
|
149
|
+
|
150
|
+
def li_full_name(page)
|
151
|
+
page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "full-name", " " ))]').text.downcase
|
152
|
+
end
|
153
|
+
|
154
|
+
def li_current_companies(page)
|
155
|
+
stack = []
|
156
|
+
page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "current", " " ))]//li').each do |com|
|
157
|
+
stack << com.text.gsub(/\s+/, " ").downcase.split(' at ')[-1]
|
158
|
+
end
|
159
|
+
stack.join(",")
|
160
|
+
end
|
161
|
+
|
162
|
+
def li_current_title(page)
|
163
|
+
stack = []
|
164
|
+
page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "current", " " ))]//li').each do |current|
|
165
|
+
stack << current.text.gsub(/\s+/, " ").downcase.split(' at ')[0]
|
166
|
+
end
|
167
|
+
stack.join(",")
|
168
|
+
end
|
169
|
+
|
170
|
+
def li_past_companies(page)
|
171
|
+
stack = []
|
172
|
+
page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "past", " " ))]//li').each do |com|
|
173
|
+
stack << com.text.gsub(/\s+/, " ").downcase.split(' at ')[-1]
|
174
|
+
end
|
175
|
+
stack.join(",")
|
176
|
+
end
|
177
|
+
|
178
|
+
def li_colleges(page)
|
179
|
+
stack = []
|
180
|
+
page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "summary-education", " " ))]//li').each do |com|
|
181
|
+
stack << com.text.gsub(/\s+/, " ").downcase
|
182
|
+
end
|
183
|
+
stack.join(",")
|
184
|
+
end
|
185
|
+
|
186
|
+
def li_recommendation(page)
|
187
|
+
page.parser.xpath('//dd[(((count(preceding-sibling::*) + 1) = 8) and parent::*)]//strong').text.strip
|
188
|
+
end
|
189
|
+
|
190
|
+
def li_websites(page)
|
191
|
+
websites = []
|
192
|
+
page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "websites", " " ))]//a').each do |web|
|
193
|
+
websites << web['href']
|
194
|
+
end
|
195
|
+
websites.join(",")
|
196
|
+
end
|
197
|
+
|
198
|
+
def li_summary(page)
|
199
|
+
page.parser.xpath('//*[(@id = "profile-summary")]').text.gsub(/\s+/, " ").downcase
|
200
|
+
end
|
201
|
+
|
202
|
+
def li_experience(page)
|
203
|
+
page.parser.xpath('//*[(@id = "profile-experience")]').text.gsub(/\s+/, " ").downcase
|
204
|
+
end
|
205
|
+
|
206
|
+
def li_education(page)
|
207
|
+
page.parser.xpath('//*[(@id = "profile-education")]').text.gsub(/\s+/, " ").downcase
|
208
|
+
end
|
209
|
+
|
210
|
+
def li_additional_info(page)
|
211
|
+
page.parser.xpath('//*[(@id = "profile-additional")]').text.gsub(/\s+/, " ").downcase
|
212
|
+
end
|
213
|
+
|
214
|
+
def li_contact_for(page)
|
215
|
+
page.parser.xpath('//*[(@id = "profile-contact")]').text.gsub(/\s+/, " ").downcase
|
216
|
+
end
|
217
|
+
|
218
|
+
def li_connection(page)
|
219
|
+
page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "overview-connections", " " ))]//p').text.gsub(/\s+/, " ").downcase
|
220
|
+
end
|
221
|
+
|
222
|
+
def li_location(page)
|
223
|
+
page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "locality", " " ))]').text.gsub(/\s+/, " ").downcase
|
224
|
+
end
|
225
|
+
|
226
|
+
def li_groups(page)
|
227
|
+
page.parser.xpath('//*[(@id = "pubgroups")]//*[contains(concat( " ", @class, " " ), concat( " ", "org", " " ))]').text.gsub(/\s+/, " ").downcase
|
228
|
+
end
|
229
|
+
|
230
|
+
def li_websites(page)
|
231
|
+
websites=[]
|
232
|
+
if page.search(".website").first
|
233
|
+
page.search(".website").each do |site|
|
234
|
+
url=site.at("a")["href"]
|
235
|
+
url="http://www.linkedin.com"+url
|
236
|
+
url=CGI.parse(URI.parse(url).query)["url"]
|
237
|
+
websites<<url
|
238
|
+
end
|
239
|
+
return websites.flatten!
|
240
|
+
end
|
241
|
+
end
|
242
|
+
|
243
|
+
def argument_error(argument)
|
244
|
+
raise "Argument Error, missing argument :#{argument}"
|
245
|
+
end
|
246
|
+
|
247
|
+
end
|
248
|
+
|
249
|
+
end
|
@@ -0,0 +1,8 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'mechanize'
|
3
|
+
require 'json'
|
4
|
+
require 'cgi'
|
5
|
+
|
6
|
+
require "#{File.dirname(File.expand_path(__FILE__))}/linkedin/duck.rb"
|
7
|
+
require "#{File.dirname(File.expand_path(__FILE__))}/linkedin/linkedin.rb"
|
8
|
+
require "#{File.dirname(File.expand_path(__FILE__))}/linkedin/version.rb"
|
metadata
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: linkedin_scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Bhushan Lodha
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-08-31 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: mechanize
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: json
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
description: Find linkedin profile based on name and company of profile and scrapes
|
47
|
+
data if profile is found
|
48
|
+
email: bhushanlodha@gmail.com
|
49
|
+
executables: []
|
50
|
+
extensions: []
|
51
|
+
extra_rdoc_files: []
|
52
|
+
files:
|
53
|
+
- README.rdoc
|
54
|
+
- lib/linkedin/duck.rb
|
55
|
+
- lib/linkedin/linkedin.rb
|
56
|
+
- lib/linkedin/version.rb
|
57
|
+
- lib/linkedin_scraper.rb
|
58
|
+
homepage: https://github.com/bhushanlodha/linkedin_scraper
|
59
|
+
licenses: []
|
60
|
+
post_install_message:
|
61
|
+
rdoc_options: []
|
62
|
+
require_paths:
|
63
|
+
- lib
|
64
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - ! '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
requirements: []
|
77
|
+
rubyforge_project:
|
78
|
+
rubygems_version: 1.8.24
|
79
|
+
signing_key:
|
80
|
+
specification_version: 3
|
81
|
+
summary: Find linkedin profiles and scrapes data
|
82
|
+
test_files: []
|