linkedin_scraper 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +46 -0
- data/lib/linkedin/duck.rb +42 -0
- data/lib/linkedin/linkedin.rb +249 -0
- data/lib/linkedin/version.rb +3 -0
- data/lib/linkedin_scraper.rb +8 -0
- metadata +82 -0
data/README.rdoc
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
= Linkedin-Scraper
|
2
|
+
|
3
|
+
Linkedin Scraper is a gem for finding public LinkedIn profiles. You give it the name and company of the person you are looking for, and it finds the matching LinkedIn profile and scrapes its title, name, area, connections, etc.
|
4
|
+
|
5
|
+
= Installation
|
6
|
+
|
7
|
+
Install the gem from RubyGems:
|
8
|
+
|
9
|
+
gem install linkedin_scraper
|
10
|
+
|
11
|
+
This gem is tested on Ruby versions 1.8.7, 1.9.2 and 1.9.3.
|
12
|
+
|
13
|
+
= Usage
|
14
|
+
|
15
|
+
First, initialize an instance of the Linkedin class like this:
|
16
|
+
|
17
|
+
require 'linkedin_scraper'
|
18
|
+
|
19
|
+
linkedin = LinkedinScraper::Linkedin.new
|
20
|
+
|
21
|
+
This sets the mechanize object for scraping. Now you can feed name and companies:
|
22
|
+
|
23
|
+
profile = linkedin.get_profile_data(name: some_name, company: some_company)
|
24
|
+
|
25
|
+
A third, optional key is country: some_country (see the list of supported countries below).
|
26
|
+
|
27
|
+
# List of supported countries
|
28
|
+
1) United States - country: "us"
|
29
|
+
2) United Kingdom - country: "uk"
|
30
|
+
3) Canada - country: "ca"
|
31
|
+
4) India - country: "in"
|
32
|
+
|
33
|
+
Then you can see the scraped data like this:
|
34
|
+
|
35
|
+
|
36
|
+
profile.full_name # the full name of the profile
|
37
|
+
|
38
|
+
profile.current_title # the current LinkedIn job title
|
39
|
+
|
40
|
+
profile.location # the location of the profile
|
41
|
+
|
42
|
+
profile.connection # number of connections of the profile
|
43
|
+
|
44
|
+
profile.linkedin_url #url of the profile
|
45
|
+
|
46
|
+
Copyright (c) 2012 Bhushan Lodha, released under the MIT license.
|
@@ -0,0 +1,42 @@
|
|
1
|
+
module LinkedinScraper

  # Mechanize user-agent aliases; one is picked at random for every new agent.
  USER_AGENTS = ["Linux Firefox", "Linux Konqueror", "Linux Mozilla", "Mac FireFox", "Mac Mozilla", "Mac Safari", "Windows Mozilla"]

  # Thin client around the DuckDuckGo "d.js" endpoint, used to look up
  # candidate profile URLs for a search query.
  class DuckDuckGo

    # Characters that are percent-encoded before the query is put in the URL.
    # '%' is part of the table so the whole query is encoded in a single pass.
    # '#' is encoded because an unescaped '#' would start a URL fragment and
    # cut the query short.
    URL_ESCAPES = {
      '%' => '%25', ' ' => '%20', '#' => '%23', '$' => '%24', '&' => '%26',
      '`' => '%60', ':' => '%3A', '<' => '%3C', '>' => '%3E', '[' => '%5B',
      ']' => '%5D', '{' => '%7B', '}' => '%7D', '"' => '%22', '+' => '%2B',
      '@' => '%40', '/' => '%2F', ';' => '%3B', '=' => '%3D', '?' => '%3F',
      '\\' => '%5C', '^' => '%5E', '|' => '%7C', '~' => '%7E', '\'' => '%27',
      ',' => '%2C'
    }.freeze

    # Matches any single character listed in URL_ESCAPES.
    URL_ESCAPE_PATTERN = Regexp.union(URL_ESCAPES.keys)

    # _options_ must be a Hash; the search text is read from its :query key.
    #
    # Raises TypeError ("Invalid Arguments") if _options_ is not a Hash.
    def initialize options
      raise TypeError, "Invalid Arguments" unless options.is_a? Hash
      @query = options[:query]
      @agent = Mechanize.new { |agent| agent.user_agent_alias = USER_AGENTS.sample }
    end

    # Runs the query and returns the unique, non-nil 'c' values found in the
    # JSON array embedded in the response, excluding entries that point at
    # https://encrypted.google.com.
    #
    # Raises RuntimeError if the response contains no JSON array.
    def search
      page = @agent.get("http://duckduckgo.com/d.js?q=#{query_to_url}&l=us-en&p=1&s=0")
      # The payload is wrapped in non-JSON text; keep only the bracketed array.
      content = page.content.match(/\[.*\]/).to_s
      raise "No Profile, Profile not found or does not exist" if content.empty?
      links = JSON.parse(content).map { |entry| entry['c'] }.flatten.compact.uniq
      # Build a new array instead of deleting while iterating, which skips elements.
      links.reject { |link| link.include? "https://encrypted.google.com" }
    end

    private

    # Returns the percent-encoded form of the query.
    # @query itself is left untouched so repeated searches never double-encode it.
    def query_to_url
      @query.to_s.gsub(URL_ESCAPE_PATTERN, URL_ESCAPES)
    end

  end

end
|
42
|
+
|
@@ -0,0 +1,249 @@
|
|
1
|
+
module LinkedinScraper

  # Interface for scraping a linkedin public profile.
  #
  # A profile URL is looked up through LinkedinScraper::DuckDuckGo, fetched
  # with Mechanize, checked against the requested name and company, and then
  # its fields are copied into the accessors below.
  class Linkedin

    # Message raised whenever no matching public profile can be produced.
    NOT_FOUND_MESSAGE = "No Profile, Profile not found or does not exist".freeze

    # First link inside the "result-set" element of a listing page.
    FIRST_RESULT_LINK_XPATH = '//*[(@id = "result-set")]//*[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//strong//a'.freeze

    # <strong> inside the <dd> that is the eighth child of its parent.
    RECOMMENDATION_XPATH = '//dd[(((count(preceding-sibling::*) + 1) = 8) and parent::*)]//strong'.freeze

    # Prefix put in front of website hrefs before they are parsed.
    WEBSITE_LINK_BASE = "http://www.linkedin.com".freeze

    # Full name of profile e.g "John Smith"
    attr_accessor :full_name
    # Linkedin url of profile e.g "http://www.linkedin.com/pub/in/john+smith"
    attr_accessor :linkedin_url
    # Current title of profile e.g "Ceo"
    attr_accessor :current_title
    # Past title of profile e.g "VP Business"
    attr_accessor :past_title
    # Current company of profile e.g "Pajama Labs"
    attr_accessor :current_company
    # Current job description as available on linkedin
    attr_accessor :current_job_description
    # Profile's summary as available on linkedin
    attr_accessor :summary
    # Total number of profile's connections e.g "44"
    attr_accessor :connection
    # Current location of profile e.g "San Francisco"
    attr_accessor :location
    # Total number of profile's recommendations e.g "44"
    attr_accessor :recommendation
    # All companies including current and past, comma separated
    attr_accessor :industry
    # Experience description if mentioned
    attr_accessor :experience
    # Groups a profile is associated with
    attr_accessor :groups
    # Past companies of profile e.g "Google, Inc"
    attr_accessor :past_company
    # Schools profile has attended
    attr_accessor :education
    # Purposes for which profile can be contacted to (if provided)
    attr_accessor :contact_for
    # Websites associated with profile (Array of urls, empty when none)
    attr_accessor :websites

    # Initializes the mechanize agent with a randomly chosen user agent alias
    def initialize
      @agent = Mechanize.new { |agent| agent.user_agent_alias = USER_AGENTS.sample }
    end

    # Get data of the required profile
    #
    # _options_ contains the key :name *Required
    # :name - name of profile/contact e.g "John Smith"
    # _options_ contains the key :company *Required
    # :company - current or past company of profile/contact e.g "Google"
    # _options_ contains the key :country *Optional (see list of supported countries in Readme)
    # :country - preferred country of profile to be found in
    #
    # Raises TypeError if _options_ is not a Hash
    # Raises error if profile is not found or does not exist in linkedin or is not public profile
    # Raises error if :name or :company or both are missing or nil
    # Returns _self_
    def get_profile_data options
      raise TypeError, "Invalid Arguments" unless options.is_a?(Hash)
      @options = options
      # A key that is present but nil is as unusable as a missing key.
      argument_error("name") if options[:name].nil?
      argument_error("company") if options[:company].nil?

      results = LinkedinScraper::DuckDuckGo.new(:query => build_query).search
      raise NOT_FOUND_MESSAGE if results.empty?

      page = get_li_page(results[0])
      raise NOT_FOUND_MESSAGE unless page
      raise NOT_FOUND_MESSAGE unless verify_profile(@options[:name], @options[:company], page)

      data page
      self
    end

    # Copies every scraped field of _page_ into the accessors of this object.
    def data page
      current_companies = li_current_companies(page)
      past_companies = li_past_companies(page)

      self.full_name = li_full_name(page)
      self.linkedin_url = page.uri.to_s
      self.current_title = li_current_title(page)
      self.past_title = li_past_title(page)
      self.current_company = current_companies
      self.current_job_description = li_current_job_description(page)
      self.summary = li_summary(page)
      self.connection = li_connection(page)
      self.location = li_location(page)
      self.recommendation = li_recommendation(page)
      # Keep a separator between the two lists so the last current company
      # and the first past company do not run together.
      self.industry = [current_companies, past_companies].reject { |list| list.empty? }.join(",")
      self.experience = li_experience(page)
      self.groups = li_groups(page)
      self.past_company = past_companies
      self.education = li_education(page)
      self.contact_for = li_contact_for(page)
      self.websites = li_websites(page)
    end

    private

    # Builds the search string from @options; returns nil unless both
    # :name and :company are set. :country selects the linkedin subdomain
    # and defaults to "www".
    def build_query
      name = @options[:name]
      company = @options[:company]
      subdomain = @options[:country] || "www"
      return nil unless name && company
      "site:#{subdomain}.linkedin.com \"#{name}\" + \"at #{company}\""
    end

    # Fetches _url_. When the page has no "member-1" title text it is treated
    # as a listing: the first result link is followed if there is one,
    # otherwise nil is returned. Fetch errors are swallowed on purpose
    # (best effort); whatever page was obtained so far is returned.
    def get_li_page(url)
      page = nil
      begin
        page = @agent.get(url)
        if page && page.parser.xpath(id_xpath("member-1") + class_xpath("title")).text.empty?
          link_text = page.parser.xpath(FIRST_RESULT_LINK_XPATH).text
          return nil if link_text.empty?
          return @agent.click(page.link_with(:text => link_text))
        end
      rescue StandardError
        # StandardError only: interrupts and exits must not be swallowed.
      end
      page
    end

    # True when _name_ occurs in the page's full name and _company_ occurs in
    # its current or past companies (case-insensitive, literal match).
    def verify_profile(name, company, page)
      full_name = li_full_name(page)
      companies = li_current_companies(page) + ' ' + li_past_companies(page)
      # Escape the inputs: characters such as "(" or "+" are text, not regex syntax.
      name_found = full_name =~ /#{Regexp.escape(name.to_s)}/i
      company_found = companies =~ /#{Regexp.escape(company.to_s)}/i
      !!(name_found && company_found)
    end

    # XPath selecting every element whose class list contains _css_class_.
    def class_xpath(css_class)
      '//*[contains(concat( " ", @class, " " ), concat( " ", "' + css_class + '", " " ))]'
    end

    # XPath selecting the element(s) whose id equals _id_.
    def id_xpath(id)
      '//*[(@id = "' + id + '")]'
    end

    # Text matched by _xpath_, whitespace runs collapsed to one space, downcased.
    def li_text(page, xpath)
      page.parser.xpath(xpath).text.gsub(/\s+/, " ").downcase
    end

    # Normalized text of every <li> below elements with class _css_class_.
    def li_items(page, css_class)
      page.parser.xpath(class_xpath(css_class) + "//li").map do |item|
        item.text.gsub(/\s+/, " ").downcase
      end
    end

    # Part of each "<title> at <company>" item before the first " at ", comma joined.
    def li_titles(page, css_class)
      li_items(page, css_class).map { |item| item.split(' at ').first.to_s.strip }.join(",")
    end

    # Part of each "<title> at <company>" item after the last " at ", comma joined.
    def li_companies(page, css_class)
      li_items(page, css_class).map { |item| item.split(' at ').last.to_s.strip }.join(",")
    end

    def li_past_title(page)
      li_titles(page, "past")
    end

    def li_full_profile(page)
      li_text(page, id_xpath("content"))
    end

    def li_current_job_description(page)
      li_text(page, id_xpath("profile-experience") + class_xpath("first"))
    end

    # Downcased only; whitespace is left as found in the page.
    def li_full_name(page)
      page.parser.xpath(class_xpath("full-name")).text.downcase
    end

    def li_current_companies(page)
      li_companies(page, "current")
    end

    def li_current_title(page)
      li_titles(page, "current")
    end

    def li_past_companies(page)
      li_companies(page, "past")
    end

    def li_colleges(page)
      li_items(page, "summary-education").map { |item| item.strip }.join(",")
    end

    # Stripped but not downcased.
    def li_recommendation(page)
      page.parser.xpath(RECOMMENDATION_XPATH).text.strip
    end

    def li_summary(page)
      li_text(page, id_xpath("profile-summary"))
    end

    def li_experience(page)
      li_text(page, id_xpath("profile-experience"))
    end

    def li_education(page)
      li_text(page, id_xpath("profile-education"))
    end

    def li_additional_info(page)
      li_text(page, id_xpath("profile-additional"))
    end

    def li_contact_for(page)
      li_text(page, id_xpath("profile-contact"))
    end

    def li_connection(page)
      li_text(page, class_xpath("overview-connections") + "//p")
    end

    def li_location(page)
      li_text(page, class_xpath("locality"))
    end

    def li_groups(page)
      li_text(page, id_xpath("pubgroups") + class_xpath("org"))
    end

    # Returns the values of the "url" query parameter of the link inside each
    # ".website" element. Always an Array (empty when the page lists no
    # websites); entries without a link or without a query string are skipped.
    def li_websites(page)
      page.search(".website").flat_map do |site|
        anchor = site.at("a")
        href = anchor && anchor["href"]
        query = href && URI.parse(WEBSITE_LINK_BASE + href).query
        next [] unless query
        URI.decode_www_form(query).select { |key, _| key == "url" }.map { |_, value| value }
      end
    end

    # Raises RuntimeError naming the missing _argument_.
    def argument_error(argument)
      raise "Argument Error, missing argument :#{argument}"
    end

  end

end
|
@@ -0,0 +1,8 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'mechanize'
|
3
|
+
require 'json'
|
4
|
+
require 'cgi'
|
5
|
+
|
6
|
+
require "#{File.dirname(File.expand_path(__FILE__))}/linkedin/duck.rb"
|
7
|
+
require "#{File.dirname(File.expand_path(__FILE__))}/linkedin/linkedin.rb"
|
8
|
+
require "#{File.dirname(File.expand_path(__FILE__))}/linkedin/version.rb"
|
metadata
ADDED
@@ -0,0 +1,82 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: linkedin_scraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Bhushan Lodha
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2012-08-31 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: mechanize
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
- !ruby/object:Gem::Dependency
|
31
|
+
name: json
|
32
|
+
requirement: !ruby/object:Gem::Requirement
|
33
|
+
none: false
|
34
|
+
requirements:
|
35
|
+
- - ! '>='
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
type: :runtime
|
39
|
+
prerelease: false
|
40
|
+
version_requirements: !ruby/object:Gem::Requirement
|
41
|
+
none: false
|
42
|
+
requirements:
|
43
|
+
- - ! '>='
|
44
|
+
- !ruby/object:Gem::Version
|
45
|
+
version: '0'
|
46
|
+
description: Find linkedin profile based on name and company of profile and scrapes
|
47
|
+
data if profile is found
|
48
|
+
email: bhushanlodha@gmail.com
|
49
|
+
executables: []
|
50
|
+
extensions: []
|
51
|
+
extra_rdoc_files: []
|
52
|
+
files:
|
53
|
+
- README.rdoc
|
54
|
+
- lib/linkedin/duck.rb
|
55
|
+
- lib/linkedin/linkedin.rb
|
56
|
+
- lib/linkedin/version.rb
|
57
|
+
- lib/linkedin_scraper.rb
|
58
|
+
homepage: https://github.com/bhushanlodha/linkedin_scraper
|
59
|
+
licenses: []
|
60
|
+
post_install_message:
|
61
|
+
rdoc_options: []
|
62
|
+
require_paths:
|
63
|
+
- lib
|
64
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
65
|
+
none: false
|
66
|
+
requirements:
|
67
|
+
- - ! '>='
|
68
|
+
- !ruby/object:Gem::Version
|
69
|
+
version: '0'
|
70
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
71
|
+
none: false
|
72
|
+
requirements:
|
73
|
+
- - ! '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
requirements: []
|
77
|
+
rubyforge_project:
|
78
|
+
rubygems_version: 1.8.24
|
79
|
+
signing_key:
|
80
|
+
specification_version: 3
|
81
|
+
summary: Find linkedin profiles and scrapes data
|
82
|
+
test_files: []
|