linkedin-scraper 0.0.10 → 0.0.11
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +2 -0
- data/.travis.yml +7 -0
- data/README.md +37 -50
- data/Rakefile +3 -2
- data/bin/linkedin-scraper +5 -0
- data/lib/linkedin-scraper/profile.rb +177 -248
- data/lib/linkedin-scraper/version.rb +1 -1
- data/linkedin-scraper.gemspec +6 -1
- data/spec/linkedin-scraper/profile_spec.rb +90 -21
- metadata +35 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c7cfeee1f051d529594d6d827d6ad373b6aca496
|
4
|
+
data.tar.gz: 681cfad543c0d7daa2863e6c6c2525560cf640df
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9c52c63f97a7855b088bb467bab0b72ac4a1616424348318b1072a0768dac512d4279fc04d1c1e8c213d3e57dc3eb368c43d757a8b937f979217d529f2a29510
|
7
|
+
data.tar.gz: 7d6e965dd00cb7ffc23f244eaa70d342e63d58b4a8ce760726f7907661dfbdae76c6a0bb740b3cdbdc20bd32b2f18230a40c4be25b1da1e289373e5da098397c
|
data/.gitignore
CHANGED
data/.travis.yml
ADDED
data/README.md
CHANGED
@@ -1,52 +1,64 @@
|
|
1
|
+
[![Build Status](https://secure.travis-ci.org/yatishmehta27/linkedin-scraper.png)](http://travis-ci.org/yatishmehta27/linkedin-scraper)
|
2
|
+
|
1
3
|
Linkedin Scraper
|
2
4
|
================
|
3
5
|
|
4
6
|
Linkedin-scraper is a gem for scraping linkedin public profiles.
|
5
|
-
|
7
|
+
Given the URL of the profile, it gets the name, country, title, area, current companies, past companies, organizations, skills, groups, etc.
|
8
|
+
|
9
|
+
|
10
|
+
##Installation
|
6
11
|
|
7
|
-
Installation
|
8
|
-
------------
|
9
12
|
|
10
13
|
Install the gem from RubyGems:
|
11
14
|
|
12
15
|
gem install linkedin-scraper
|
13
16
|
|
14
|
-
This gem is tested on
|
17
|
+
This gem is tested on 1.9.2, 1.9.3, 2.0.0, JRuby1.9, and rbx1.9.
|
15
18
|
|
16
|
-
Usage
|
17
|
-
-----
|
19
|
+
##Usage
|
18
20
|
|
19
|
-
|
21
|
+
|
22
|
+
Initialize a scraper instance
|
20
23
|
|
21
24
|
profile = Linkedin::Profile.get_profile("http://www.linkedin.com/in/jeffweiner08")
|
22
25
|
|
23
|
-
|
26
|
+
The returning object responds to the following methods
|
27
|
+
|
24
28
|
|
29
|
+
profile.first_name # The first name of the contact
|
25
30
|
|
26
|
-
profile.
|
31
|
+
profile.last_name # The last name of the contact
|
27
32
|
|
28
|
-
profile.
|
33
|
+
profile.name # The full name of the profile
|
29
34
|
|
30
|
-
profile.
|
35
|
+
profile.title # The job title
|
31
36
|
|
32
|
-
|
37
|
+
profile.summary # The summary of the profile
|
33
38
|
|
34
|
-
profile.location #
|
39
|
+
profile.location # The location of the contact
|
35
40
|
|
36
|
-
profile.country #
|
41
|
+
profile.country # The country of the contact
|
37
42
|
|
38
|
-
profile.industry #
|
43
|
+
profile.industry # The industry to which the contact belongs
|
39
44
|
|
40
|
-
profile.picture #
|
45
|
+
profile.picture # The profile picture link of profile
|
41
46
|
|
42
|
-
profile.skills #
|
47
|
+
profile.skills # Array of skills of the profile
|
43
48
|
|
44
|
-
profile.organizations #
|
49
|
+
profile.organizations # Array of organizations of the profile
|
45
50
|
|
46
|
-
profile.education #Array of hashes for
|
51
|
+
profile.education # Array of hashes for education
|
47
52
|
|
48
|
-
profile.
|
53
|
+
profile.websites # Array of websites
|
49
54
|
|
55
|
+
profile.groups # Array of groups
|
56
|
+
|
57
|
+
profile.languages # Array of languages
|
58
|
+
|
59
|
+
profile.certifications # Array of certifications
|
60
|
+
|
61
|
+
For current and past companies, it also provides details of each company, such as company size, industry, address, etc.
|
50
62
|
|
51
63
|
profile.current_companies
|
52
64
|
|
@@ -116,8 +128,6 @@ Then you can see the scraped data like this:
|
|
116
128
|
|
117
129
|
|
118
130
|
profile.past_companies
|
119
|
-
#Array of hash containing its past job companies and job profile
|
120
|
-
#Example
|
121
131
|
[
|
122
132
|
[0] {
|
123
133
|
:past_company => "Accel Partners",
|
@@ -181,30 +191,8 @@ Then you can see the scraped data like this:
|
|
181
191
|
]
|
182
192
|
|
183
193
|
|
184
|
-
profile.linkedin_url #url of the profile
|
185
|
-
|
186
|
-
profile.websites
|
187
|
-
#Array of websites
|
188
|
-
[
|
189
|
-
[0] "http://www.linkedin.com/"
|
190
|
-
]
|
191
|
-
|
192
|
-
profile.groups
|
193
|
-
#Array of hashes containing group name and link
|
194
|
-
|
195
|
-
|
196
|
-
profile.education
|
197
|
-
#Array of hashes for eduction
|
198
|
-
|
199
|
-
profile.skills
|
200
|
-
#Array of skills
|
201
|
-
|
202
|
-
profile.picture
|
203
|
-
#url of the profile picture
|
204
|
-
|
205
|
-
|
206
194
|
profile.recommended_visitors
|
207
|
-
#
|
195
|
+
#It is the list of visitors "Viewers of this profile also viewed..."
|
208
196
|
[
|
209
197
|
[0] {
|
210
198
|
:link => "http://www.linkedin.com/in/barackobama?trk=pub-pbmap",
|
@@ -262,10 +250,9 @@ Then you can see the scraped data like this:
|
|
262
250
|
}
|
263
251
|
]
|
264
252
|
|
265
|
-
## Credits
|
266
253
|
|
267
|
-
|
268
|
-
|
269
|
-
-
|
254
|
+
The gem also comes with a binary and can be used from the command line to get a JSON response of the scraped data. It takes the URL as the first argument.
|
255
|
+
|
256
|
+
linkedin-scraper http://www.linkedin.com/in/jeffweiner08
|
270
257
|
|
271
|
-
You're welcome to fork this project and send pull requests
|
258
|
+
You're welcome to fork this project and send pull requests
|
data/Rakefile
CHANGED
@@ -1,2 +1,3 @@
|
|
1
|
-
|
2
|
-
|
1
|
+
require 'rspec/core/rake_task'
|
2
|
+
task :default => :spec
|
3
|
+
RSpec::Core::RakeTask.new
|
@@ -2,311 +2,240 @@
|
|
2
2
|
module Linkedin
|
3
3
|
class Profile
|
4
4
|
|
5
|
-
USER_AGENTS = [
|
6
|
-
|
7
|
-
|
8
|
-
attr_accessor :country, :current_companies, :education, :first_name, :groups, :industry, :last_name, :linkedin_url, :location, :page, :past_companies, :picture, :recommended_visitors, :skills, :title, :websites, :organizations, :summary, :certifications, :languages
|
9
|
-
|
10
|
-
|
11
|
-
def initialize(page,url)
|
12
|
-
@first_name = get_first_name(page)
|
13
|
-
@last_name = get_last_name(page)
|
14
|
-
@title = get_title(page)
|
15
|
-
@location = get_location(page)
|
16
|
-
@country = get_country(page)
|
17
|
-
@industry = get_industry(page)
|
18
|
-
@picture = get_picture(page)
|
19
|
-
@summary = get_summary(page)
|
20
|
-
@current_companies = get_current_companies(page)
|
21
|
-
@past_companies = get_past_companies(page)
|
22
|
-
@recommended_visitors = get_recommended_visitors(page)
|
23
|
-
@education = get_education(page)
|
24
|
-
@linkedin_url = url
|
25
|
-
@websites = get_websites(page)
|
26
|
-
@groups = get_groups(page)
|
27
|
-
@organizations = get_organizations(page)
|
28
|
-
@certifications = get_certifications(page)
|
29
|
-
@organizations = get_organizations(page)
|
30
|
-
@skills = get_skills(page)
|
31
|
-
@languages = get_languages(page)
|
32
|
-
@page = page
|
33
|
-
end
|
34
|
-
#returns:nil if it gives a 404 request
|
35
|
-
|
36
|
-
def name
|
37
|
-
name = ''
|
38
|
-
name += "#{self.first_name} " if self.first_name
|
39
|
-
name += self.last_name if self.last_name
|
40
|
-
name
|
41
|
-
end
|
5
|
+
USER_AGENTS = ['Windows IE 6', 'Windows IE 7', 'Windows Mozilla', 'Mac Safari', 'Mac FireFox', 'Mac Mozilla', 'Linux Mozilla', 'Linux Firefox', 'Linux Konqueror']
|
42
6
|
|
7
|
+
ATTRIBUTES = %w(name first_name last_name title location country industry summary picture linkedin_url education groups websites languages skills certifications organizations past_companies current_companies recommended_visitors)
|
8
|
+
|
9
|
+
attr_reader :page, :linkedin_url
|
10
|
+
|
43
11
|
def self.get_profile(url)
|
44
12
|
begin
|
45
|
-
|
46
|
-
@agent.user_agent_alias = USER_AGENTS.sample
|
47
|
-
@agent.max_history = 0
|
48
|
-
page = @agent.get(url)
|
49
|
-
return Linkedin::Profile.new(page, url)
|
13
|
+
Linkedin::Profile.new(url)
|
50
14
|
rescue => e
|
51
15
|
puts e
|
52
16
|
end
|
53
17
|
end
|
54
18
|
|
55
|
-
def
|
56
|
-
|
19
|
+
def initialize(url)
|
20
|
+
@linkedin_url = url
|
21
|
+
@page = http_client.get(url)
|
57
22
|
end
|
58
|
-
|
59
|
-
def
|
60
|
-
|
61
|
-
if node.at("h4/strong/a")
|
62
|
-
link = node.at("h4/strong/a")["href"]
|
63
|
-
@agent = Mechanize.new
|
64
|
-
@agent.user_agent_alias = USER_AGENTS.sample
|
65
|
-
@agent.max_history = 0
|
66
|
-
page = @agent.get("http://www.linkedin.com"+link)
|
67
|
-
result[:linkedin_company_url] = "http://www.linkedin.com"+link
|
68
|
-
result[:url] = page.at(".basic-info/div/dl/dd/a").text if page.at(".basic-info/div/dl/dd/a")
|
69
|
-
node_2 = page.at(".basic-info").at(".content.inner-mod")
|
70
|
-
node_2.search("dd").zip(node_2.search("dt")).each do |value,title|
|
71
|
-
result[title.text.gsub(" ","_").downcase.to_sym] = value.text.strip
|
72
|
-
end
|
73
|
-
result[:address] = page.at(".vcard.hq").at(".adr").text.gsub("\n"," ").strip if page.at(".vcard.hq")
|
74
|
-
end
|
75
|
-
result
|
23
|
+
|
24
|
+
def name
|
25
|
+
"#{first_name} #{last_name}"
|
76
26
|
end
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
def get_first_name page
|
81
|
-
return page.at(".given-name").text.strip if page.search(".given-name").first
|
27
|
+
|
28
|
+
def first_name
|
29
|
+
@first_name ||= (@page.at('.given-name').text.strip if @page.at('.given-name'))
|
82
30
|
end
|
83
31
|
|
84
|
-
def
|
85
|
-
|
32
|
+
def last_name
|
33
|
+
@last_name ||= (@page.at('.family-name').text.strip if @page.at('.family-name'))
|
86
34
|
end
|
87
35
|
|
88
|
-
def
|
89
|
-
|
36
|
+
def title
|
37
|
+
@title ||= (@page.at('.headline-title').text.gsub(/\s+/, ' ').strip if @page.at('.headline-title'))
|
90
38
|
end
|
91
39
|
|
92
|
-
def
|
93
|
-
|
40
|
+
def location
|
41
|
+
@location ||= (@page.at('.locality').text.split(',').first.strip if @page.at('.locality'))
|
94
42
|
end
|
95
43
|
|
96
|
-
def
|
97
|
-
|
44
|
+
def country
|
45
|
+
@country ||= (@page.at('.locality').text.split(',').last.strip if @page.at('.locality'))
|
98
46
|
end
|
99
47
|
|
100
|
-
def
|
101
|
-
|
48
|
+
def industry
|
49
|
+
@industry ||= (@page.at('.industry').text.gsub(/\s+/, ' ').strip if @page.at('.industry'))
|
102
50
|
end
|
103
51
|
|
104
|
-
def
|
105
|
-
page.at(
|
52
|
+
def summary
|
53
|
+
@summary ||= (@page.at('.description.summary').text.gsub(/\s+/, ' ').strip if @page.at('.description.summary'))
|
106
54
|
end
|
107
55
|
|
56
|
+
def picture
|
57
|
+
@picture ||= (@page.at('#profile-picture/img.photo').attributes['src'].value.strip if @page.at('#profile-picture/img.photo'))
|
58
|
+
end
|
108
59
|
|
109
|
-
def
|
110
|
-
|
60
|
+
def skills
|
61
|
+
@skills ||= (@page.search('.competency.show-bean').map{|skill| skill.text.strip if skill.text} rescue nil)
|
111
62
|
end
|
112
63
|
|
113
|
-
def
|
114
|
-
|
115
|
-
if page.search(".position.experience.vevent.vcard.summary-past").first
|
116
|
-
page.search(".position.experience.vevent.vcard.summary-past").each do |past_company|
|
117
|
-
result = get_company_url past_company
|
118
|
-
url = result[:url]
|
119
|
-
title = past_company.at("h3").text.gsub(/\s+|\n/, " ").strip if past_company.at("h3")
|
120
|
-
company = past_company.at("h4").text.gsub(/\s+|\n/, " ").strip if past_company.at("h4")
|
121
|
-
description = past_company.at(".description.past-position").text.gsub(/\s+|\n/, " ").strip if past_company.at(".description.past-position")
|
122
|
-
p_company = {:past_company=>company,:past_title=> title,:past_company_website=>url,:description=>description}
|
123
|
-
p_company = p_company.merge(result)
|
124
|
-
past_cs << p_company
|
125
|
-
end
|
126
|
-
return past_cs
|
127
|
-
end
|
64
|
+
def past_companies
|
65
|
+
@past_companies ||= get_companies('past')
|
128
66
|
end
|
129
67
|
|
130
|
-
def
|
131
|
-
|
132
|
-
if page.search(".position.experience.vevent.vcard.summary-current").first
|
133
|
-
page.search(".position.experience.vevent.vcard.summary-current").each do |current_company|
|
134
|
-
result = get_company_url current_company
|
135
|
-
url = result[:url]
|
136
|
-
title = current_company.at("h3").text.gsub(/\s+|\n/, " ").strip if current_company.at("h3")
|
137
|
-
company = current_company.at("h4").text.gsub(/\s+|\n/, " ").strip if current_company.at("h4")
|
138
|
-
description = current_company.at(".description.current-position").text.gsub(/\s+|\n/, " ").strip if current_company.at(".description.current-position")
|
139
|
-
current_company = {:current_company=>company,:current_title=> title,:current_company_url=>url,:description=>description}
|
140
|
-
current_cs << current_company.merge(result)
|
141
|
-
end
|
142
|
-
return current_cs
|
143
|
-
end
|
68
|
+
def current_companies
|
69
|
+
@current_companies ||= get_companies('current')
|
144
70
|
end
|
145
71
|
|
146
|
-
def
|
147
|
-
education
|
148
|
-
|
149
|
-
page.search(
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
72
|
+
def education
|
73
|
+
unless @education
|
74
|
+
@education = []
|
75
|
+
if @page.search('.position.education.vevent.vcard').first
|
76
|
+
@education = @page.search('.position.education.vevent.vcard').map do |item|
|
77
|
+
name = item.at('h3').text.gsub(/\s+|\n/, ' ').strip if item.at('h3')
|
78
|
+
desc = item.at('h4').text.gsub(/\s+|\n/, ' ').strip if item.at('h4')
|
79
|
+
period = item.at('.period').text.gsub(/\s+|\n/, ' ').strip if item.at('.period')
|
80
|
+
|
81
|
+
{:name => name, :description => desc, :period => period}
|
82
|
+
end
|
155
83
|
end
|
156
|
-
return education
|
157
84
|
end
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
85
|
+
@education
|
86
|
+
end
|
87
|
+
|
88
|
+
def websites
|
89
|
+
unless @websites
|
90
|
+
@websites = []
|
91
|
+
if @page.search('.website').first
|
92
|
+
@websites = @page.search('.website').map do |site|
|
93
|
+
url = site.at('a')['href']
|
94
|
+
url = "http://www.linkedin.com#{url}"
|
95
|
+
CGI.parse(URI.parse(url).query)['url']
|
96
|
+
end.flatten!
|
168
97
|
end
|
169
|
-
return websites.flatten!
|
170
98
|
end
|
99
|
+
@websites
|
171
100
|
end
|
172
101
|
|
173
|
-
def
|
174
|
-
groups
|
175
|
-
|
176
|
-
page.search(
|
177
|
-
|
178
|
-
|
179
|
-
|
102
|
+
def groups
|
103
|
+
unless @groups
|
104
|
+
@groups = []
|
105
|
+
if page.search('.group-data').first
|
106
|
+
@groups = page.search('.group-data').each do |item|
|
107
|
+
name = item.text.gsub(/\s+|\n/, ' ').strip
|
108
|
+
link = "http://www.linkedin.com#{item.at('a')['href']}"
|
109
|
+
{:name => name, :link => link}
|
110
|
+
end
|
180
111
|
end
|
181
|
-
return groups
|
182
112
|
end
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
name
|
197
|
-
position = nil # add this later
|
198
|
-
occupation = nil # add this latetr too, this relates to the experience/work
|
199
|
-
start_date = Date.parse(item.search('ul.specifics li').text.gsub(/\s+|\n/, " ").strip.split(' to ').first)
|
200
|
-
if item.search('ul.specifics li').text.gsub(/\s+|\n/, " ").strip.split(' to ').last == 'Present'
|
201
|
-
end_date = nil
|
202
|
-
else
|
203
|
-
Date.parse(item.search('ul.specifics li').text.gsub(/\s+|\n/, " ").strip.split(' to ').last)
|
204
|
-
end
|
205
|
-
|
206
|
-
organizations << { name: name, start_date: start_date, end_date: end_date }
|
207
|
-
rescue => e
|
208
|
-
|
113
|
+
@groups
|
114
|
+
end
|
115
|
+
|
116
|
+
def organizations
|
117
|
+
unless @organizations
|
118
|
+
@organizations = []
|
119
|
+
if @page.search('ul.organizations/li.organization').first
|
120
|
+
@organizations = @page.search('ul.organizations/li.organization').map do |item|
|
121
|
+
|
122
|
+
name = item.search('h3').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
123
|
+
start_date, end_date = item.search('ul.specifics li').text.gsub(/\s+|\n/, ' ').strip.split(' to ')
|
124
|
+
start_date = Date.parse(start_date) rescue nil
|
125
|
+
end_date = Date.parse(end_date) rescue nil
|
126
|
+
{:name => name, :start_date => start_date, :end_date => end_date}
|
209
127
|
end
|
210
128
|
end
|
211
|
-
return organizations
|
212
129
|
end
|
130
|
+
@organizations
|
213
131
|
end
|
214
132
|
|
215
|
-
def
|
216
|
-
languages
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
# find the h3 element within the above section and get the text with excess white space stripped
|
224
|
-
language = item.at('h3').text
|
225
|
-
proficiency = item.at('span.proficiency').text.gsub(/\s+|\n/, " ").strip
|
226
|
-
languages << { language:language, proficiency:proficiency }
|
227
|
-
rescue => e
|
133
|
+
def languages
|
134
|
+
unless @languages
|
135
|
+
@languages = []
|
136
|
+
if @page.at('ul.languages/li.language')
|
137
|
+
@languages = @page.search('ul.languages/li.language').map do |item|
|
138
|
+
language = item.at('h3').text rescue nil
|
139
|
+
proficiency = item.at('span.proficiency').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
140
|
+
{:language=> language, :proficiency => proficiency }
|
228
141
|
end
|
229
142
|
end
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
# loop over each element with cert data
|
247
|
-
page.search(query).each do |item|
|
248
|
-
begin
|
249
|
-
item_text = item.text.gsub(/\s+|\n/, " ").strip
|
250
|
-
name = item_text.split(" #{item_text.scan(/#{months} \d{4}/)[0]}")[0]
|
251
|
-
authority = nil # we need a profile with an example of this and probably will need to use the API to accuratetly get this data
|
252
|
-
license = nil # we need a profile with an example of this and probably will need to use the API to accuratetly get this data
|
253
|
-
start_date = Date.parse(item_text.scan(regex)[0].join(' '))
|
254
|
-
|
255
|
-
includes_end_date = item_text.scan(regex).count > 1
|
256
|
-
end_date = includes_end_date ? Date.parse(item_text.scan(regex)[0].join(' ')) : nil # we need a profile with an example of this and probably will need to use the API to accuratetly get this data
|
257
|
-
|
258
|
-
certifications << { name:name, authority:authority, license:license, start_date:start_date, end_date:end_date }
|
259
|
-
rescue => e
|
143
|
+
end
|
144
|
+
@languages
|
145
|
+
end
|
146
|
+
|
147
|
+
def certifications
|
148
|
+
unless @certtifications
|
149
|
+
@certifications = []
|
150
|
+
if @page.at('ul.certifications/li.certification')
|
151
|
+
@certifications = @page.search('ul.certifications/li.certification').map do |item|
|
152
|
+
name = item.at('h3').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
153
|
+
authority = item.at('.specifics/.org').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
154
|
+
license = item.at('.specifics/.licence-number').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
155
|
+
start_date = item.at('.specifics/.dtstart').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
156
|
+
|
157
|
+
{:name => name, :authority => authority, :license => license, :start_date => start_date}
|
260
158
|
end
|
261
159
|
end
|
262
|
-
return certifications
|
263
160
|
end
|
264
|
-
|
265
|
-
end
|
266
|
-
|
267
|
-
|
268
|
-
def
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
occupation = nil # add this latetr too, this relates to the experience/work
|
280
|
-
start_date = Date.parse(item.search('ul.specifics li').text.gsub(/\s+|\n/, " ").strip.split(' to ').first)
|
281
|
-
if item.search('ul.specifics li').text.gsub(/\s+|\n/, " ").strip.split(' to ').last == 'Present'
|
282
|
-
end_date = nil
|
283
|
-
else
|
284
|
-
Date.parse(item.search('ul.specifics li').text.gsub(/\s+|\n/, " ").strip.split(' to ').last)
|
285
|
-
end
|
286
|
-
|
287
|
-
organizations << { name: name, start_date: start_date, end_date: end_date }
|
288
|
-
rescue => e
|
161
|
+
@certifications
|
162
|
+
end
|
163
|
+
|
164
|
+
|
165
|
+
def recommended_visitors
|
166
|
+
unless @recommended_visitors
|
167
|
+
@recommended_visitors = []
|
168
|
+
if @page.at('.browsemap/.content/ul/li')
|
169
|
+
@recommended_visitors = @page.search('.browsemap/.content/ul/li').map do |visitor|
|
170
|
+
v = {}
|
171
|
+
v[:link] = visitor.at('a')['href']
|
172
|
+
v[:name] = visitor.at('strong/a').text
|
173
|
+
v[:title] = visitor.at('.headline').text.gsub('...',' ').split(' at ').first
|
174
|
+
v[:company] = visitor.at('.headline').text.gsub('...',' ').split(' at ')[1]
|
175
|
+
v
|
289
176
|
end
|
290
177
|
end
|
291
178
|
end
|
292
|
-
|
179
|
+
@recommended_visitors
|
293
180
|
end
|
294
181
|
|
182
|
+
def to_json
|
183
|
+
require 'json'
|
184
|
+
hash = {}
|
185
|
+
ATTRIBUTES.each do |attribute|
|
186
|
+
hash[attribute.to_sym] = self.send(attribute.to_sym)
|
187
|
+
end
|
188
|
+
hash.to_json
|
189
|
+
end
|
295
190
|
|
296
191
|
|
297
|
-
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
192
|
+
private
|
193
|
+
|
194
|
+
def get_companies(type)
|
195
|
+
companies = []
|
196
|
+
if @page.search(".position.experience.vevent.vcard.summary-#{type}").first
|
197
|
+
@page.search(".position.experience.vevent.vcard.summary-#{type}").each do |node|
|
198
|
+
|
199
|
+
company = {}
|
200
|
+
company[:title] = node.at('h3').text.gsub(/\s+|\n/, ' ').strip if node.at('h3')
|
201
|
+
company[:company] = node.at('h4').text.gsub(/\s+|\n/, ' ').strip if node.at('h4')
|
202
|
+
company[:description] = node.at(".description.#{type}-position").text.gsub(/\s+|\n/, ' ').strip if node.at(".description.#{type}-position")
|
203
|
+
start_date = node.at('.dtstart').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
204
|
+
company[:start_date] = Date.parse(start_date) rescue nil
|
205
|
+
|
206
|
+
end_date = node.at('.dtend').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
207
|
+
company[:end_date] = Date.parse(end_date) rescue nil
|
208
|
+
|
209
|
+
|
210
|
+
company_link = node.at('h4/strong/a')['href'] if node.at('h4/strong/a')
|
211
|
+
|
212
|
+
result = get_company_details(company_link)
|
213
|
+
companies << company.merge!(result)
|
214
|
+
end
|
215
|
+
end
|
216
|
+
companies
|
217
|
+
end
|
218
|
+
|
219
|
+
|
220
|
+
def get_company_details(link)
|
221
|
+
result = {:linkedin_company_url => "http://www.linkedin.com#{link}"}
|
222
|
+
page = http_client.get(result[:linkedin_company_url])
|
223
|
+
|
224
|
+
result[:url] = page.at('.basic-info/div/dl/dd/a').text if page.at('.basic-info/div/dl/dd/a')
|
225
|
+
node_2 = page.at('.basic-info/.content.inner-mod')
|
226
|
+
if node_2
|
227
|
+
node_2.search('dd').zip(node_2.search('dt')).each do |value,title|
|
228
|
+
result[title.text.gsub(' ','_').downcase.to_sym] = value.text.strip
|
308
229
|
end
|
309
|
-
|
230
|
+
end
|
231
|
+
result[:address] = page.at('.vcard.hq').at('.adr').text.gsub("\n",' ').strip if page.at('.vcard.hq')
|
232
|
+
result
|
233
|
+
end
|
234
|
+
|
235
|
+
def http_client
|
236
|
+
Mechanize.new do |agent|
|
237
|
+
agent.user_agent_alias = USER_AGENTS.sample
|
238
|
+
agent.max_history = 0
|
310
239
|
end
|
311
240
|
end
|
312
241
|
|
data/linkedin-scraper.gemspec
CHANGED
@@ -6,11 +6,16 @@ Gem::Specification.new do |gem|
|
|
6
6
|
gem.description = %q{Scrapes the linkedin profile when a url is given }
|
7
7
|
gem.summary = %q{when a url of public linkedin profile page is given it scrapes the entire page and converts into a accessible object}
|
8
8
|
gem.homepage = "https://github.com/yatishmehta27/linkedin-scraper"
|
9
|
-
gem.add_dependency(%q<mechanize>, [">= 0"])
|
10
9
|
gem.files = `git ls-files`.split($\)
|
11
10
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
12
11
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
13
12
|
gem.name = "linkedin-scraper"
|
14
13
|
gem.require_paths = ["lib"]
|
15
14
|
gem.version = Linkedin::Scraper::VERSION
|
15
|
+
|
16
|
+
|
17
|
+
gem.add_dependency(%q<mechanize>, [">= 0"])
|
18
|
+
gem.add_development_dependency 'rspec','>=0'
|
19
|
+
gem.add_development_dependency 'rake'
|
20
|
+
|
16
21
|
end
|
@@ -5,63 +5,120 @@ describe Linkedin::Profile do
|
|
5
5
|
|
6
6
|
|
7
7
|
before(:all) do
|
8
|
-
page = Nokogiri::HTML(File.open("spec/fixtures/jgrevich.html", 'r') { |f| f.read })
|
9
|
-
@profile = Linkedin::Profile.new(
|
8
|
+
@page = Nokogiri::HTML(File.open("spec/fixtures/jgrevich.html", 'r') { |f| f.read })
|
9
|
+
@profile = Linkedin::Profile.new("http://www.linkedin.com/in/jgrevich")
|
10
10
|
end
|
11
11
|
|
12
|
-
describe "
|
13
|
-
it "Create an instance of
|
12
|
+
describe ".get_profile" do
|
13
|
+
it "Create an instance of Linkedin::Profile class" do
|
14
14
|
expect(@profile).to be_instance_of Linkedin::Profile
|
15
15
|
end
|
16
16
|
end
|
17
17
|
|
18
|
-
describe "
|
19
|
-
it 'returns the first
|
18
|
+
describe "#first_name" do
|
19
|
+
it 'returns the first name of the profile' do
|
20
20
|
expect(@profile.first_name).to eq "Justin"
|
21
21
|
end
|
22
22
|
end
|
23
23
|
|
24
|
-
describe "
|
24
|
+
describe "#last_name" do
|
25
25
|
it 'returns the last name of the profile' do
|
26
26
|
expect(@profile.last_name).to eq "Grevich"
|
27
27
|
end
|
28
28
|
end
|
29
29
|
|
30
|
-
describe
|
31
|
-
it 'returns the
|
32
|
-
expect(@profile.
|
30
|
+
describe '#title' do
|
31
|
+
it 'returns the title of the profile' do
|
32
|
+
expect(@profile.title).to eq 'Presidential Innovation Fellow'
|
33
|
+
end
|
34
|
+
end
|
35
|
+
|
36
|
+
describe '#location' do
|
37
|
+
it 'returns the location of the profile' do
|
38
|
+
expect(@profile.location).to eq 'Washington'
|
39
|
+
end
|
40
|
+
end
|
41
|
+
|
42
|
+
describe '#country' do
|
43
|
+
it 'returns the country of the profile' do
|
44
|
+
expect(@profile.country).to eq 'District Of Columbia'
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
describe '#industry' do
|
49
|
+
it 'returns the industry of the profile' do
|
50
|
+
expect(@profile.industry).to eq 'Information Technology and Services'
|
51
|
+
end
|
52
|
+
end
|
53
|
+
|
54
|
+
describe '#summary' do
|
55
|
+
it 'returns the summary of the profile' do
|
56
|
+
expect(@profile.summary).to match(/Justin Grevich is a Presidential Innovation Fellow working/)
|
33
57
|
end
|
34
58
|
end
|
35
59
|
|
36
|
-
describe
|
37
|
-
it 'returns
|
38
|
-
expect(@profile.
|
39
|
-
expect(@profile.certifications.count).to eq 2
|
60
|
+
describe '#picture' do
|
61
|
+
it 'returns the picture url of the profile' do
|
62
|
+
expect(@profile.picture).to eq 'http://m.c.lnkd.licdn.com/mpr/pub/image-1OSOQPrarAEIMksx5uUyhfRUO9zb6R4JjbULhhrDOMFS6dtV1OSLWbcaOK9b92S3rlE9/justin-grevich.jpg'
|
40
63
|
end
|
64
|
+
end
|
41
65
|
|
42
|
-
|
43
|
-
|
66
|
+
describe '#skills' do
|
67
|
+
it 'returns the array of skills of the profile' do
|
68
|
+
skills = ["Ruby", "Ruby on Rails", "Web Development", "Web Applications", "CSS3", "HTML 5", "Shell Scripting", "Python", "Chef", "Git", "Subversion", "JavaScript", "Rspec", "jQuery", "Capistrano", "Sinatra", "CoffeeScript", "Haml", "Standards Compliance", "MySQL", "PostgreSQL", "Solr", "Sphinx", "Heroku", "Amazon Web Services (AWS)", "Information Security", "Vulnerability Assessment", "SAN", "ZFS", "Backup Solutions", "SaaS", "System Administration", "Project Management", "Linux", "Troubleshooting", "Network Security", "OS X", "Bash", "Cloud Computing", "Web Design", "MongoDB", "Z-Wave", "Home Automation"]
|
69
|
+
expect(@profile.skills).to include(*skills)
|
44
70
|
end
|
71
|
+
end
|
72
|
+
|
73
|
+
describe '#past_companies' do
|
74
|
+
it 'returns an array of hashes of past companies with its details' do
|
75
|
+
@profile.past_companies
|
76
|
+
end
|
77
|
+
end
|
45
78
|
|
46
|
-
|
47
|
-
|
79
|
+
describe '#current_companies' do
|
80
|
+
it 'returns an array of hashes of current companies with its details' do
|
81
|
+
@profile.current_companies
|
48
82
|
end
|
49
83
|
end
|
50
84
|
|
51
|
-
describe
|
85
|
+
describe '#education' do
|
86
|
+
it 'returns the array of hashes of education with details' do
|
87
|
+
@profile.education
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
describe '#websites' do
|
92
|
+
it 'returns the array of websites' do
|
93
|
+
@profile.websites
|
94
|
+
end
|
95
|
+
end
|
96
|
+
|
97
|
+
describe '#groups' do
|
98
|
+
it 'returns the array of hashes of groups with details' do
|
99
|
+
@profile.groups
|
100
|
+
end
|
101
|
+
end
|
102
|
+
|
103
|
+
describe "#name" do
|
104
|
+
it 'returns the first and last name of the profile' do
|
105
|
+
expect(@profile.name).to eq "Justin Grevich"
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
describe "#organizations" do
|
52
110
|
it 'returns an array of organization hashes for the profile' do
|
53
111
|
expect(@profile.organizations.class).to eq Array
|
54
112
|
expect(@profile.organizations.first[:name]).to eq 'San Diego Ruby'
|
55
113
|
end
|
56
114
|
end
|
57
115
|
|
58
|
-
describe "
|
116
|
+
describe "#languages" do
|
59
117
|
it 'returns an array of languages hashes' do
|
60
118
|
expect(@profile.languages.class).to eq Array
|
61
119
|
end
|
62
120
|
|
63
121
|
context 'with language data' do
|
64
|
-
|
65
122
|
it 'returns an array with one language hash' do
|
66
123
|
expect(@profile.languages.class).to eq Array
|
67
124
|
end
|
@@ -76,8 +133,20 @@ describe Linkedin::Profile do
|
|
76
133
|
end
|
77
134
|
end
|
78
135
|
end # context 'with language data' do
|
136
|
+
|
79
137
|
end # describe ".languages" do
|
80
138
|
|
139
|
+
describe '#recommended_visitors' do
|
140
|
+
it 'returns the array of hashes of recommended visitors' do
|
141
|
+
@profile.recommended_visitors
|
142
|
+
end
|
143
|
+
end
|
144
|
+
|
145
|
+
describe '#certifications' do
|
146
|
+
it 'returns the array of hashes of certifications' do
|
147
|
+
@profile.certifications
|
148
|
+
end
|
149
|
+
end
|
81
150
|
|
82
151
|
|
83
152
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedin-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.11
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yatish Mehta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-09-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -24,17 +24,48 @@ dependencies:
|
|
24
24
|
- - '>='
|
25
25
|
- !ruby/object:Gem::Version
|
26
26
|
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rspec
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
27
55
|
description: 'Scrapes the linkedin profile when a url is given '
|
28
56
|
email:
|
29
|
-
executables:
|
57
|
+
executables:
|
58
|
+
- linkedin-scraper
|
30
59
|
extensions: []
|
31
60
|
extra_rdoc_files: []
|
32
61
|
files:
|
33
62
|
- .gitignore
|
63
|
+
- .travis.yml
|
34
64
|
- Gemfile
|
35
65
|
- LICENSE
|
36
66
|
- README.md
|
37
67
|
- Rakefile
|
68
|
+
- bin/linkedin-scraper
|
38
69
|
- lib/linkedin-scraper.rb
|
39
70
|
- lib/linkedin-scraper/profile.rb
|
40
71
|
- lib/linkedin-scraper/version.rb
|
@@ -61,7 +92,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
61
92
|
version: '0'
|
62
93
|
requirements: []
|
63
94
|
rubyforge_project:
|
64
|
-
rubygems_version: 2.
|
95
|
+
rubygems_version: 2.1.2
|
65
96
|
signing_key:
|
66
97
|
specification_version: 4
|
67
98
|
summary: when a url of public linkedin profile page is given it scrapes the entire
|