linkedin-scraper 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/linkedin-scraper/profile.rb +55 -49
- data/lib/linkedin-scraper/version.rb +1 -1
- data/spec/linkedin-scraper/profile_spec.rb +5 -5
- data/spec/spec_helper.rb +1 -1
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 36e18b156982017e74c482b3e9656c00aca5c93a
|
4
|
+
data.tar.gz: 7179629d7d60aa6ff707b80b1820e87565134f40
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e6e8871534374809abc5e5c92f13964995ee90976bd7f0f2bdecd1d4f87dc227cbe4788edc2c21b478fd1c237c726a6ce7f198e951755937b0c8536f58687064
|
7
|
+
data.tar.gz: f765dd85d08aa37b6949e62d278cb428e7267f88f2889f95abc248758b947f8813eca239bbe2a10bd37bfaac9ee072bab5d9c7000eaf8ffd87e614a05e854f4f
|
@@ -1,5 +1,6 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
2
|
module Linkedin
|
3
|
+
|
3
4
|
class Profile
|
4
5
|
|
5
6
|
USER_AGENTS = ['Windows IE 6', 'Windows IE 7', 'Windows Mozilla', 'Mac Safari', 'Mac FireFox', 'Mac Mozilla', 'Linux Mozilla', 'Linux Firefox', 'Linux Konqueror']
|
@@ -9,11 +10,9 @@ module Linkedin
|
|
9
10
|
attr_reader :page, :linkedin_url
|
10
11
|
|
11
12
|
def self.get_profile(url)
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
puts e
|
16
|
-
end
|
13
|
+
Linkedin::Profile.new(url)
|
14
|
+
rescue => e
|
15
|
+
puts e
|
17
16
|
end
|
18
17
|
|
19
18
|
def initialize(url)
|
@@ -26,15 +25,15 @@ module Linkedin
|
|
26
25
|
end
|
27
26
|
|
28
27
|
def first_name
|
29
|
-
@first_name ||= (@page.at('.
|
28
|
+
@first_name ||= (@page.at('.full-name').text.split(' ', 2)[0].strip if @page.at('.full-name'))
|
30
29
|
end
|
31
30
|
|
32
31
|
def last_name
|
33
|
-
@last_name ||= (@page.at('.
|
32
|
+
@last_name ||= (@page.at('.full-name').text.split(' ', 2)[1].strip if @page.at('.full-name'))
|
34
33
|
end
|
35
34
|
|
36
35
|
def title
|
37
|
-
@title ||= (@page.at('.
|
36
|
+
@title ||= (@page.at('.title').text.gsub(/\s+/, ' ').strip if @page.at('.title'))
|
38
37
|
end
|
39
38
|
|
40
39
|
def location
|
@@ -50,15 +49,15 @@ module Linkedin
|
|
50
49
|
end
|
51
50
|
|
52
51
|
def summary
|
53
|
-
@summary ||= (@page.at('.description
|
52
|
+
@summary ||= (@page.at('.summary .description').text.gsub(/\s+/, ' ').strip if @page.at('.summary .description'))
|
54
53
|
end
|
55
54
|
|
56
55
|
def picture
|
57
|
-
@picture ||= (@page.at('
|
56
|
+
@picture ||= (@page.at('.profile-picture img').attributes['src'].value.strip if @page.at('.profile-picture img'))
|
58
57
|
end
|
59
58
|
|
60
59
|
def skills
|
61
|
-
@skills ||= (@page.search('.
|
60
|
+
@skills ||= (@page.search('.skill-pill .endorse-item-name-text').map { |skill| skill.text.strip if skill.text } rescue nil)
|
62
61
|
end
|
63
62
|
|
64
63
|
def past_companies
|
@@ -70,17 +69,17 @@ module Linkedin
|
|
70
69
|
end
|
71
70
|
|
72
71
|
def education
|
73
|
-
@education ||= @page.search('.
|
74
|
-
name = item.at('
|
75
|
-
desc = item.at('
|
76
|
-
period = item.at('.
|
72
|
+
@education ||= @page.search('.background-education .education').map do |item|
|
73
|
+
name = item.at('h4').text.gsub(/\s+|\n/, ' ').strip if item.at('h4')
|
74
|
+
desc = item.at('h5').text.gsub(/\s+|\n/, ' ').strip if item.at('h5')
|
75
|
+
period = item.at('.education-date').text.gsub(/\s+|\n/, ' ').strip if item.at('.education-date')
|
77
76
|
|
78
|
-
{:name => name, :description => desc, :period => period}
|
77
|
+
{:name => name, :description => desc, :period => period }
|
79
78
|
end
|
80
79
|
end
|
81
80
|
|
82
81
|
def websites
|
83
|
-
@websites ||= @page.search('
|
82
|
+
@websites ||= @page.search('#overview-summary-websites').flat_map do |site|
|
84
83
|
url = "http://www.linkedin.com#{site.at('a')['href']}"
|
85
84
|
CGI.parse(URI.parse(url).query)['url']
|
86
85
|
end
|
@@ -88,51 +87,50 @@ module Linkedin
|
|
88
87
|
end
|
89
88
|
|
90
89
|
def groups
|
91
|
-
@groups ||= @page.search('.
|
90
|
+
@groups ||= @page.search('.groups-name').map do |item|
|
92
91
|
name = item.text.gsub(/\s+|\n/, ' ').strip
|
93
92
|
link = "http://www.linkedin.com#{item.at('a')['href']}"
|
94
|
-
{:name => name, :link => link}
|
93
|
+
{ :name => name, :link => link }
|
95
94
|
end
|
96
95
|
end
|
97
96
|
|
98
97
|
def organizations
|
99
|
-
@organizations ||= @page.search('
|
100
|
-
name = item.
|
98
|
+
@organizations ||= @page.search('.background-organizations .organization p a').map do |item|
|
99
|
+
name = item.text.gsub(/\s+|\n/, ' ').strip rescue nil
|
101
100
|
start_date, end_date = item.search('ul.specifics li').text.gsub(/\s+|\n/, ' ').strip.split(' to ')
|
102
101
|
start_date = Date.parse(start_date) rescue nil
|
103
102
|
end_date = Date.parse(end_date) rescue nil
|
104
|
-
{:name => name, :start_date => start_date, :end_date => end_date}
|
103
|
+
{ :name => name, :start_date => start_date, :end_date => end_date }
|
105
104
|
end
|
106
105
|
end
|
107
106
|
|
108
107
|
def languages
|
109
|
-
@languages ||= @page.search('
|
110
|
-
language = item.at('
|
111
|
-
proficiency = item.at('
|
112
|
-
{:language=> language, :proficiency => proficiency }
|
108
|
+
@languages ||= @page.search('.background-languages #languages ol li').map do |item|
|
109
|
+
language = item.at('h4').text rescue nil
|
110
|
+
proficiency = item.at('div.languages-proficiency').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
111
|
+
{ :language => language, :proficiency => proficiency }
|
113
112
|
end
|
114
113
|
end
|
115
114
|
|
116
115
|
def certifications
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
{:name => name, :authority => authority, :license => license, :start_date => start_date}
|
124
|
-
end
|
116
|
+
@certifications ||= @page.search('background-certifications').map do |item|
|
117
|
+
name = item.at('h4').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
118
|
+
authority = item.at('h5').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
119
|
+
license = item.at('.specifics/.licence-number').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
120
|
+
start_date = item.at('.certification-date').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
125
121
|
|
122
|
+
{ :name => name, :authority => authority, :license => license, :start_date => start_date }
|
123
|
+
end
|
126
124
|
end
|
127
125
|
|
128
126
|
|
129
127
|
def recommended_visitors
|
130
|
-
@recommended_visitors ||= @page.search('.
|
128
|
+
@recommended_visitors ||= @page.search('.insights-browse-map/ul/li').map do |visitor|
|
131
129
|
v = {}
|
132
130
|
v[:link] = visitor.at('a')['href']
|
133
|
-
v[:name] = visitor.at('
|
134
|
-
v[:title] = visitor.at('.
|
135
|
-
v[:company] = visitor.at('.
|
131
|
+
v[:name] = visitor.at('h4/a').text
|
132
|
+
v[:title] = visitor.at('.browse-map-title').text.gsub('...', ' ').split(' at ').first
|
133
|
+
v[:company] = visitor.at('.browse-map-title').text.gsub('...', ' ').split(' at ')[1]
|
136
134
|
v
|
137
135
|
end
|
138
136
|
end
|
@@ -142,18 +140,17 @@ module Linkedin
|
|
142
140
|
ATTRIBUTES.reduce({}){ |hash,attr| hash[attr.to_sym] = self.send(attr.to_sym);hash }.to_json
|
143
141
|
end
|
144
142
|
|
145
|
-
|
146
143
|
private
|
147
144
|
|
148
145
|
def get_companies(type)
|
149
146
|
companies = []
|
150
|
-
if @page.search(".
|
151
|
-
@page.search(".
|
147
|
+
if @page.search(".background-experience .#{type}-position").first
|
148
|
+
@page.search(".background-experience .#{type}-position").each do |node|
|
152
149
|
|
153
150
|
company = {}
|
154
|
-
company[:title] = node.at('
|
155
|
-
company[:company] = node.at('
|
156
|
-
company[:description] = node.at(".description
|
151
|
+
company[:title] = node.at('h4').text.gsub(/\s+|\n/, ' ').strip if node.at('h4')
|
152
|
+
company[:company] = node.at('h5').text.gsub(/\s+|\n/, ' ').strip if node.at('h5')
|
153
|
+
company[:description] = node.at(".description").text.gsub(/\s+|\n/, ' ').strip if node.at(".description")
|
157
154
|
|
158
155
|
start_date = node.at('.dtstart')['title'] rescue nil
|
159
156
|
company[:start_date] = parse_date(start_date) rescue nil
|
@@ -161,7 +158,7 @@ module Linkedin
|
|
161
158
|
end_date = node.at('.dtend')['title'] rescue nil
|
162
159
|
company[:end_date] = parse_date(end_date) rescue nil
|
163
160
|
|
164
|
-
company_link = node.at('
|
161
|
+
company_link = node.at('h5/a')['href'] if node.at('h5/a')
|
165
162
|
|
166
163
|
result = get_company_details(company_link)
|
167
164
|
companies << company.merge!(result)
|
@@ -176,17 +173,17 @@ module Linkedin
|
|
176
173
|
end
|
177
174
|
|
178
175
|
def get_company_details(link)
|
179
|
-
result = {:linkedin_company_url =>
|
176
|
+
result = { :linkedin_company_url => get_linkedin_company_url(link) }
|
180
177
|
page = http_client.get(result[:linkedin_company_url])
|
181
178
|
|
182
179
|
result[:url] = page.at('.basic-info-about/ul/li/p/a').text if page.at('.basic-info-about/ul/li/p/a')
|
183
180
|
node_2 = page.at('.basic-info-about/ul')
|
184
181
|
if node_2
|
185
|
-
node_2.search('p').zip(node_2.search('h4')).each do |value,title|
|
186
|
-
result[title.text.gsub(' ','_').downcase.to_sym] = value.text.strip
|
182
|
+
node_2.search('p').zip(node_2.search('h4')).each do |value, title|
|
183
|
+
result[title.text.gsub(' ', '_').downcase.to_sym] = value.text.strip
|
187
184
|
end
|
188
185
|
end
|
189
|
-
result[:address] = page.at('.vcard.hq').at('.adr').text.gsub("\n",' ').strip if page.at('.vcard.hq')
|
186
|
+
result[:address] = page.at('.vcard.hq').at('.adr').text.gsub("\n", ' ').strip if page.at('.vcard.hq')
|
190
187
|
result
|
191
188
|
end
|
192
189
|
|
@@ -197,5 +194,14 @@ module Linkedin
|
|
197
194
|
end
|
198
195
|
end
|
199
196
|
|
197
|
+
def get_linkedin_company_url(link)
|
198
|
+
http = %r{http://www.linkedin.com/}
|
199
|
+
https = %r{https://www.linkedin.com/}
|
200
|
+
if http.match(link) || https.match(link)
|
201
|
+
link
|
202
|
+
else
|
203
|
+
"http://www.linkedin.com/#{link}"
|
204
|
+
end
|
205
|
+
end
|
200
206
|
end
|
201
207
|
end
|
@@ -54,13 +54,13 @@ describe Linkedin::Profile do
|
|
54
54
|
end
|
55
55
|
|
56
56
|
describe '#picture' do
|
57
|
-
|
57
|
+
pending 'returns the picture url of the profile' do
|
58
58
|
expect(profile.picture).to eq 'http://m.c.lnkd.licdn.com/mpr/pub/image-1OSOQPrarAEIMksx5uUyhfRUO9zb6R4JjbULhhrDOMFS6dtV1OSLWbcaOK9b92S3rlE9/justin-grevich.jpg'
|
59
59
|
end
|
60
60
|
end
|
61
61
|
|
62
62
|
describe '#skills' do
|
63
|
-
|
63
|
+
pending 'returns the array of skills of the profile' do
|
64
64
|
skills = ['Ruby', 'Ruby on Rails', 'Web Development', 'Web Applications', 'CSS3', 'HTML 5', 'Shell Scripting', 'Python', 'Chef', 'Git', 'Subversion', 'JavaScript', 'Rspec', 'jQuery', 'Capistrano', 'Sinatra', 'CoffeeScript', 'Haml', 'Standards Compliance', 'MySQL', 'PostgreSQL', 'Solr', 'Sphinx', 'Heroku', 'Amazon Web Services (AWS)', 'Information Security', 'Vulnerability Assessment', 'SAN', 'ZFS', 'Backup Solutions', 'SaaS', 'System Administration', 'Project Management', 'Linux', 'Troubleshooting', 'Network Security', 'OS X', 'Bash', 'Cloud Computing', 'Web Design', 'MongoDB', 'Z-Wave', 'Home Automation']
|
65
65
|
expect(profile.skills).to include(*skills)
|
66
66
|
end
|
@@ -103,7 +103,7 @@ describe Linkedin::Profile do
|
|
103
103
|
end
|
104
104
|
|
105
105
|
describe '#organizations' do
|
106
|
-
|
106
|
+
pending 'returns an array of organization hashes for the profile' do
|
107
107
|
expect(profile.organizations.class).to eq Array
|
108
108
|
expect(profile.organizations.first[:name]).to eq 'San Diego Ruby'
|
109
109
|
end
|
@@ -125,13 +125,13 @@ describe Linkedin::Profile do
|
|
125
125
|
end
|
126
126
|
|
127
127
|
it 'contains the key and value for language proficiency' do
|
128
|
-
expect(profile.languages.first[:proficiency]).to eq '
|
128
|
+
expect(profile.languages.first[:proficiency]).to eq 'Native or bilingual proficiency'
|
129
129
|
end
|
130
130
|
end
|
131
131
|
end # context 'with language data' do
|
132
132
|
|
133
133
|
end # describe '.languages' do
|
134
|
-
#WIP
|
134
|
+
# WIP
|
135
135
|
describe '#recommended_visitors' do
|
136
136
|
it 'returns the array of hashes of recommended visitors' do
|
137
137
|
profile.recommended_visitors
|
data/spec/spec_helper.rb
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
|
1
|
+
$LOAD_PATH << File.join(File.dirname(__FILE__), '../lib')
|
2
2
|
# This file was generated by the `rspec --init` command. Conventionally, all
|
3
3
|
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
4
4
|
# Require this file using `require "spec_helper"` to ensure that it is only
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedin-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yatish Mehta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-11-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -102,3 +102,4 @@ test_files:
|
|
102
102
|
- spec/fixtures/jgrevich.html
|
103
103
|
- spec/linkedin-scraper/profile_spec.rb
|
104
104
|
- spec/spec_helper.rb
|
105
|
+
has_rdoc:
|