linkedin-scraper 0.0.11 → 0.0.12
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop.yml +11 -0
- data/README.md +3 -2
- data/bin/linkedin-scraper +0 -1
- data/lib/linkedin-scraper.rb +5 -8
- data/lib/linkedin-scraper/profile.rb +27 -27
- data/lib/linkedin-scraper/version.rb +1 -1
- data/linkedin-scraper.gemspec +7 -8
- data/spec/linkedin-scraper/profile_spec.rb +21 -16
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 98225f23a99fa755b3e29e92cfcff488c11ede5a
|
4
|
+
data.tar.gz: aaaae8060e81d59cc8f17848606acd382a158cea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f6ae1cd6a3eb3b9b66d7b32cc340cfba163627c0c958d2162f5eec43782f00631e5dc8e0802e4df7781350ba9c08afc397c2495ff07624540d002d871bcf62f2
|
7
|
+
data.tar.gz: 0aed555859a8ef26ce93a724da89ad34011d43986e6a0e6b80d43d7424472b5a1ba4fefac8f17c0789b5326e3cd4464ab7ce2198d106b7c497c62ed6a2e091bc
|
data/.rubocop.yml
ADDED
data/README.md
CHANGED
@@ -1,4 +1,5 @@
|
|
1
|
-
[![Build Status](https://secure.travis-ci.org/
|
1
|
+
[![Build Status](https://secure.travis-ci.org/yatish27/linkedin-scraper.png)](http://travis-ci.org/yatish27/linkedin-scraper)
|
2
|
+
[![Gem Version](https://badge.fury.io/rb/linkedin-scraper.png)](http://badge.fury.io/rb/linkedin-scraper)
|
2
3
|
|
3
4
|
Linkedin Scraper
|
4
5
|
================
|
@@ -251,7 +252,7 @@ For current and past comapnies it also provides the details of the companies lik
|
|
251
252
|
]
|
252
253
|
|
253
254
|
|
254
|
-
The gem also comes with a binary and can be used from
|
255
|
+
The gem also comes with a binary and can be used from the command line to get a json response of the scraped data. It takes the url as the first argument.
|
255
256
|
|
256
257
|
linkedin-scraper http://www.linkedin.com/in/jeffweiner08
|
257
258
|
|
data/bin/linkedin-scraper
CHANGED
data/lib/linkedin-scraper.rb
CHANGED
@@ -1,8 +1,5 @@
|
|
1
|
-
require
|
2
|
-
require
|
3
|
-
require
|
4
|
-
require
|
5
|
-
Dir["#{File.expand_path(File.dirname(__FILE__))}/linkedin-scraper/*.rb"].each {|file| require file }
|
6
|
-
|
7
|
-
|
8
|
-
|
1
|
+
require 'rubygems'
|
2
|
+
require 'mechanize'
|
3
|
+
require 'cgi'
|
4
|
+
require 'net/http'
|
5
|
+
Dir["#{File.expand_path(File.dirname(__FILE__))}/linkedin-scraper/*.rb"].each { |file| require file }
|
@@ -5,9 +5,9 @@ module Linkedin
|
|
5
5
|
USER_AGENTS = ['Windows IE 6', 'Windows IE 7', 'Windows Mozilla', 'Mac Safari', 'Mac FireFox', 'Mac Mozilla', 'Linux Mozilla', 'Linux Firefox', 'Linux Konqueror']
|
6
6
|
|
7
7
|
ATTRIBUTES = %w(name first_name last_name title location country industry summary picture linkedin_url education groups websites languages skills certifications organizations past_companies current_companies recommended_visitors)
|
8
|
-
|
8
|
+
|
9
9
|
attr_reader :page, :linkedin_url
|
10
|
-
|
10
|
+
|
11
11
|
def self.get_profile(url)
|
12
12
|
begin
|
13
13
|
Linkedin::Profile.new(url)
|
@@ -20,12 +20,12 @@ module Linkedin
|
|
20
20
|
@linkedin_url = url
|
21
21
|
@page = http_client.get(url)
|
22
22
|
end
|
23
|
-
|
23
|
+
|
24
24
|
def name
|
25
25
|
"#{first_name} #{last_name}"
|
26
26
|
end
|
27
|
-
|
28
|
-
def first_name
|
27
|
+
|
28
|
+
def first_name
|
29
29
|
@first_name ||= (@page.at('.given-name').text.strip if @page.at('.given-name'))
|
30
30
|
end
|
31
31
|
|
@@ -33,7 +33,7 @@ module Linkedin
|
|
33
33
|
@last_name ||= (@page.at('.family-name').text.strip if @page.at('.family-name'))
|
34
34
|
end
|
35
35
|
|
36
|
-
def title
|
36
|
+
def title
|
37
37
|
@title ||= (@page.at('.headline-title').text.gsub(/\s+/, ' ').strip if @page.at('.headline-title'))
|
38
38
|
end
|
39
39
|
|
@@ -77,12 +77,12 @@ module Linkedin
|
|
77
77
|
name = item.at('h3').text.gsub(/\s+|\n/, ' ').strip if item.at('h3')
|
78
78
|
desc = item.at('h4').text.gsub(/\s+|\n/, ' ').strip if item.at('h4')
|
79
79
|
period = item.at('.period').text.gsub(/\s+|\n/, ' ').strip if item.at('.period')
|
80
|
-
|
80
|
+
|
81
81
|
{:name => name, :description => desc, :period => period}
|
82
82
|
end
|
83
83
|
end
|
84
84
|
end
|
85
|
-
|
85
|
+
@education
|
86
86
|
end
|
87
87
|
|
88
88
|
def websites
|
@@ -118,7 +118,7 @@ module Linkedin
|
|
118
118
|
@organizations = []
|
119
119
|
if @page.search('ul.organizations/li.organization').first
|
120
120
|
@organizations = @page.search('ul.organizations/li.organization').map do |item|
|
121
|
-
|
121
|
+
|
122
122
|
name = item.search('h3').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
123
123
|
start_date, end_date = item.search('ul.specifics li').text.gsub(/\s+|\n/, ' ').strip.split(' to ')
|
124
124
|
start_date = Date.parse(start_date) rescue nil
|
@@ -153,21 +153,21 @@ module Linkedin
|
|
153
153
|
authority = item.at('.specifics/.org').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
154
154
|
license = item.at('.specifics/.licence-number').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
155
155
|
start_date = item.at('.specifics/.dtstart').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
156
|
-
|
156
|
+
|
157
157
|
{:name => name, :authority => authority, :license => license, :start_date => start_date}
|
158
158
|
end
|
159
159
|
end
|
160
160
|
end
|
161
161
|
@certifications
|
162
162
|
end
|
163
|
-
|
163
|
+
|
164
164
|
|
165
165
|
def recommended_visitors
|
166
166
|
unless @recommended_visitors
|
167
167
|
@recommended_visitors = []
|
168
168
|
if @page.at('.browsemap/.content/ul/li')
|
169
169
|
@recommended_visitors = @page.search('.browsemap/.content/ul/li').map do |visitor|
|
170
|
-
v = {}
|
170
|
+
v = {}
|
171
171
|
v[:link] = visitor.at('a')['href']
|
172
172
|
v[:name] = visitor.at('strong/a').text
|
173
173
|
v[:title] = visitor.at('.headline').text.gsub('...',' ').split(' at ').first
|
@@ -181,46 +181,46 @@ module Linkedin
|
|
181
181
|
|
182
182
|
def to_json
|
183
183
|
require 'json'
|
184
|
-
hash =
|
185
|
-
ATTRIBUTES.each do |attribute|
|
186
|
-
hash[attribute.to_sym] = self.send(attribute.to_sym)
|
187
|
-
end
|
188
|
-
hash.to_json
|
184
|
+
ATTRIBUTES.reduce({}){ |hash,attr| hash[attr.to_sym] = self.send(attr.to_sym);hash }.to_json
|
189
185
|
end
|
190
186
|
|
191
187
|
|
192
188
|
private
|
193
|
-
|
189
|
+
|
194
190
|
def get_companies(type)
|
195
191
|
companies = []
|
196
192
|
if @page.search(".position.experience.vevent.vcard.summary-#{type}").first
|
197
193
|
@page.search(".position.experience.vevent.vcard.summary-#{type}").each do |node|
|
198
|
-
|
194
|
+
|
199
195
|
company = {}
|
200
196
|
company[:title] = node.at('h3').text.gsub(/\s+|\n/, ' ').strip if node.at('h3')
|
201
197
|
company[:company] = node.at('h4').text.gsub(/\s+|\n/, ' ').strip if node.at('h4')
|
202
198
|
company[:description] = node.at(".description.#{type}-position").text.gsub(/\s+|\n/, ' ').strip if node.at(".description.#{type}-position")
|
203
199
|
start_date = node.at('.dtstart').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
204
|
-
company[:start_date] =
|
200
|
+
company[:start_date] = parse_date(start_date) rescue nil
|
205
201
|
|
206
|
-
end_date
|
207
|
-
|
208
|
-
|
202
|
+
end_date = node.at('.dtend').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
203
|
+
end_date ||= node.at('.dtstamp').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
204
|
+
company[:end_date] = parse_date(end_date) rescue nil
|
209
205
|
|
210
206
|
company_link = node.at('h4/strong/a')['href'] if node.at('h4/strong/a')
|
211
207
|
|
212
|
-
result = get_company_details(company_link)
|
208
|
+
result = get_company_details(company_link)
|
213
209
|
companies << company.merge!(result)
|
214
210
|
end
|
215
211
|
end
|
216
212
|
companies
|
217
213
|
end
|
218
214
|
|
219
|
-
|
215
|
+
def parse_date(date)
|
216
|
+
date = "#{date}-01-01" if date =~ /^(19|20)\d{2}$/
|
217
|
+
Date.parse(date)
|
218
|
+
end
|
219
|
+
|
220
220
|
def get_company_details(link)
|
221
221
|
result = {:linkedin_company_url => "http://www.linkedin.com#{link}"}
|
222
222
|
page = http_client.get(result[:linkedin_company_url])
|
223
|
-
|
223
|
+
|
224
224
|
result[:url] = page.at('.basic-info/div/dl/dd/a').text if page.at('.basic-info/div/dl/dd/a')
|
225
225
|
node_2 = page.at('.basic-info/.content.inner-mod')
|
226
226
|
if node_2
|
@@ -231,7 +231,7 @@ module Linkedin
|
|
231
231
|
result[:address] = page.at('.vcard.hq').at('.adr').text.gsub("\n",' ').strip if page.at('.vcard.hq')
|
232
232
|
result
|
233
233
|
end
|
234
|
-
|
234
|
+
|
235
235
|
def http_client
|
236
236
|
Mechanize.new do |agent|
|
237
237
|
agent.user_agent_alias = USER_AGENTS.sample
|
data/linkedin-scraper.gemspec
CHANGED
@@ -2,20 +2,19 @@
|
|
2
2
|
require File.expand_path('../lib/linkedin-scraper/version', __FILE__)
|
3
3
|
|
4
4
|
Gem::Specification.new do |gem|
|
5
|
-
gem.authors = [
|
5
|
+
gem.authors = ['Yatish Mehta']
|
6
6
|
gem.description = %q{Scrapes the linkedin profile when a url is given }
|
7
7
|
gem.summary = %q{when a url of public linkedin profile page is given it scrapes the entire page and converts into a accessible object}
|
8
|
-
gem.homepage =
|
8
|
+
gem.homepage = 'https://github.com/yatishmehta27/linkedin-scraper'
|
9
9
|
gem.files = `git ls-files`.split($\)
|
10
|
-
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
10
|
+
gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
|
11
11
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
12
|
-
gem.name =
|
13
|
-
gem.require_paths = [
|
12
|
+
gem.name = 'linkedin-scraper'
|
13
|
+
gem.require_paths = ['lib']
|
14
14
|
gem.version = Linkedin::Scraper::VERSION
|
15
15
|
|
16
|
-
|
17
|
-
gem.
|
18
|
-
gem.add_development_dependency 'rspec','>=0'
|
16
|
+
gem.add_dependency(%q<mechanize>, ['>= 0'])
|
17
|
+
gem.add_development_dependency 'rspec', '>=0'
|
19
18
|
gem.add_development_dependency 'rake'
|
20
19
|
|
21
20
|
end
|
@@ -5,25 +5,25 @@ describe Linkedin::Profile do
|
|
5
5
|
|
6
6
|
|
7
7
|
before(:all) do
|
8
|
-
@page = Nokogiri::HTML(File.open(
|
9
|
-
@profile = Linkedin::Profile.new(
|
8
|
+
@page = Nokogiri::HTML(File.open('spec/fixtures/jgrevich.html', 'r') { |f| f.read })
|
9
|
+
@profile = Linkedin::Profile.new('http://www.linkedin.com/in/jgrevich')
|
10
10
|
end
|
11
11
|
|
12
|
-
describe
|
13
|
-
it
|
12
|
+
describe '.get_profile' do
|
13
|
+
it 'Create an instance of Linkedin::Profile class' do
|
14
14
|
expect(@profile).to be_instance_of Linkedin::Profile
|
15
15
|
end
|
16
16
|
end
|
17
17
|
|
18
|
-
describe
|
18
|
+
describe '#first_name' do
|
19
19
|
it 'returns the first name of the profile' do
|
20
|
-
expect(@profile.first_name).to eq
|
20
|
+
expect(@profile.first_name).to eq 'Justin'
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
|
-
describe
|
24
|
+
describe '#last_name' do
|
25
25
|
it 'returns the last name of the profile' do
|
26
|
-
expect(@profile.last_name).to eq
|
26
|
+
expect(@profile.last_name).to eq 'Grevich'
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
@@ -65,7 +65,7 @@ describe Linkedin::Profile do
|
|
65
65
|
|
66
66
|
describe '#skills' do
|
67
67
|
it 'returns the array of skills of the profile' do
|
68
|
-
skills = [
|
68
|
+
skills = ['Ruby', 'Ruby on Rails', 'Web Development', 'Web Applications', 'CSS3', 'HTML 5', 'Shell Scripting', 'Python', 'Chef', 'Git', 'Subversion', 'JavaScript', 'Rspec', 'jQuery', 'Capistrano', 'Sinatra', 'CoffeeScript', 'Haml', 'Standards Compliance', 'MySQL', 'PostgreSQL', 'Solr', 'Sphinx', 'Heroku', 'Amazon Web Services (AWS)', 'Information Security', 'Vulnerability Assessment', 'SAN', 'ZFS', 'Backup Solutions', 'SaaS', 'System Administration', 'Project Management', 'Linux', 'Troubleshooting', 'Network Security', 'OS X', 'Bash', 'Cloud Computing', 'Web Design', 'MongoDB', 'Z-Wave', 'Home Automation']
|
69
69
|
expect(@profile.skills).to include(*skills)
|
70
70
|
end
|
71
71
|
end
|
@@ -100,20 +100,20 @@ describe Linkedin::Profile do
|
|
100
100
|
end
|
101
101
|
end
|
102
102
|
|
103
|
-
describe
|
103
|
+
describe '#name' do
|
104
104
|
it 'returns the first and last name of the profile' do
|
105
|
-
expect(@profile.name).to eq
|
105
|
+
expect(@profile.name).to eq 'Justin Grevich'
|
106
106
|
end
|
107
|
-
end
|
107
|
+
end
|
108
108
|
|
109
|
-
describe
|
109
|
+
describe '#organizations' do
|
110
110
|
it 'returns an array of organization hashes for the profile' do
|
111
111
|
expect(@profile.organizations.class).to eq Array
|
112
112
|
expect(@profile.organizations.first[:name]).to eq 'San Diego Ruby'
|
113
113
|
end
|
114
114
|
end
|
115
115
|
|
116
|
-
describe
|
116
|
+
describe '#languages' do
|
117
117
|
it 'returns an array of languages hashes' do
|
118
118
|
expect(@profile.languages.class).to eq Array
|
119
119
|
end
|
@@ -133,8 +133,8 @@ describe Linkedin::Profile do
|
|
133
133
|
end
|
134
134
|
end
|
135
135
|
end # context 'with language data' do
|
136
|
-
|
137
|
-
end # describe
|
136
|
+
|
137
|
+
end # describe '.languages' do
|
138
138
|
|
139
139
|
describe '#recommended_visitors' do
|
140
140
|
it 'returns the array of hashes of recommended visitors' do
|
@@ -148,5 +148,10 @@ describe Linkedin::Profile do
|
|
148
148
|
end
|
149
149
|
end
|
150
150
|
|
151
|
+
describe '#to_json' do
|
152
|
+
it 'returns the json format of the profile' do
|
153
|
+
@profile.to_json
|
154
|
+
end
|
155
|
+
end
|
151
156
|
|
152
157
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedin-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yatish Mehta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-11-11 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -60,6 +60,7 @@ extensions: []
|
|
60
60
|
extra_rdoc_files: []
|
61
61
|
files:
|
62
62
|
- .gitignore
|
63
|
+
- .rubocop.yml
|
63
64
|
- .travis.yml
|
64
65
|
- Gemfile
|
65
66
|
- LICENSE
|