linkedin-scraper 0.1.7 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/README.md +3 -0
- data/lib/linkedin_scraper/profile.rb +59 -44
- data/lib/linkedin_scraper/version.rb +1 -1
- data/spec/linkedin_scraper/profile_spec.rb +21 -15
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d84d8ec55f450366cec9ab77e088b974a1114030
|
4
|
+
data.tar.gz: 9ae752b7494b2f579a49b378a0411f20855075b0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e0f46d71ab8aa69f3d37efa5bdb41a74a248334f39ff4e6adb3368fcbb9765a85944fefa1e06048fe8373008eba6fb8b1f4d64c8b9bf5859633c4238d5709221
|
7
|
+
data.tar.gz: e6fd5b61606ff4c2470802a7ac02253d529fadd82747cf9d92f541662e67e67098b104a163247ff321eff606fa457035526814dc81e0143089eb20b099b25707
|
data/.travis.yml
CHANGED
data/README.md
CHANGED
@@ -59,6 +59,9 @@ The returning object responds to the following methods
|
|
59
59
|
|
60
60
|
profile.certifications # Array of certifications
|
61
61
|
|
62
|
+
profile.number_of_connections # The number of connections as a string
|
63
|
+
|
64
|
+
|
62
65
|
For current and past companies, it also provides company details such as company size, industry, address, etc.
|
63
66
|
|
64
67
|
profile.current_companies
|
@@ -9,11 +9,12 @@ module Linkedin
|
|
9
9
|
last_name
|
10
10
|
title
|
11
11
|
location
|
12
|
+
number_of_connections
|
12
13
|
country
|
13
14
|
industry
|
14
15
|
summary
|
15
16
|
picture
|
16
|
-
projects
|
17
|
+
projects
|
17
18
|
linkedin_url
|
18
19
|
education
|
19
20
|
groups
|
@@ -44,11 +45,11 @@ module Linkedin
|
|
44
45
|
end
|
45
46
|
|
46
47
|
def first_name
|
47
|
-
@first_name ||= (@page.at(".
|
48
|
+
@first_name ||= (@page.at(".fn").text.split(" ", 2)[0].strip if @page.at(".fn"))
|
48
49
|
end
|
49
50
|
|
50
51
|
def last_name
|
51
|
-
@last_name ||= (@page.at(".
|
52
|
+
@last_name ||= (@page.at(".fn").text.split(" ", 2)[1].strip if @page.at(".fn"))
|
52
53
|
end
|
53
54
|
|
54
55
|
def title
|
@@ -59,57 +60,61 @@ module Linkedin
|
|
59
60
|
@location ||= (@page.at(".locality").text.split(",").first.strip if @page.at(".locality"))
|
60
61
|
end
|
61
62
|
|
63
|
+
def number_of_connections
|
64
|
+
@connections ||= (@page.at(".member-connections").text.match(/[0-9]+[\+]{0,1}/)[0]) if @page.at(".member-connections")
|
65
|
+
end
|
66
|
+
|
62
67
|
def country
|
63
68
|
@country ||= (@page.at(".locality").text.split(",").last.strip if @page.at(".locality"))
|
64
69
|
end
|
65
70
|
|
66
71
|
def industry
|
67
|
-
@industry ||= (@page.
|
72
|
+
@industry ||= (@page.search("#demographics .descriptor")[-1].text.gsub(/\s+/, " ").strip if @page.at("#demographics .descriptor"))
|
68
73
|
end
|
69
74
|
|
70
75
|
def summary
|
71
|
-
@summary ||= (@page.at("
|
76
|
+
@summary ||= (@page.at("#summary .description").text.gsub(/\s+/, " ").strip if @page.at("#summary .description"))
|
72
77
|
end
|
73
78
|
|
74
79
|
def picture
|
75
|
-
@picture ||= (@page.at(
|
80
|
+
@picture ||= (@page.at('.profile-picture img').attributes.values_at('src','data-delayed-url').compact.first.value.strip if @page.at('.profile-picture img'))
|
76
81
|
end
|
77
82
|
|
78
83
|
def skills
|
79
|
-
@skills ||= (@page.search(".
|
84
|
+
@skills ||= (@page.search(".pills .skill").map { |skill| skill.text.strip if skill.text } rescue nil)
|
80
85
|
end
|
81
86
|
|
82
87
|
def past_companies
|
83
|
-
@past_companies ||= get_companies("
|
88
|
+
@past_companies ||= get_companies().reject { |c| c[:end_date] == "Present"}
|
84
89
|
end
|
85
90
|
|
86
91
|
def current_companies
|
87
|
-
@current_companies ||= get_companies("
|
92
|
+
@current_companies ||= get_companies().find_all{ |c| c[:end_date] == "Present"}
|
88
93
|
end
|
89
94
|
|
90
95
|
def education
|
91
|
-
@education ||= @page.search(".
|
96
|
+
@education ||= @page.search(".schools .school").map do |item|
|
92
97
|
name = item.at("h4").text.gsub(/\s+|\n/, " ").strip if item.at("h4")
|
93
98
|
desc = item.search("h5").last.text.gsub(/\s+|\n/, " ").strip if item.search("h5").last
|
94
99
|
degree = item.search("h5").last.at(".degree").text.gsub(/\s+|\n/, " ").strip.gsub(/,$/, "") if item.search("h5").last.at(".degree")
|
95
100
|
major = item.search("h5").last.at(".major").text.gsub(/\s+|\n/, " ").strip if item.search("h5").last.at(".major")
|
96
|
-
period = item.at(".
|
97
|
-
start_date, end_date = item.at(".
|
101
|
+
period = item.at(".date-range").text.gsub(/\s+|\n/, " ").strip if item.at(".date-range")
|
102
|
+
start_date, end_date = item.at(".date-range").text.gsub(/\s+|\n/, " ").strip.split(" – ") rescue nil
|
98
103
|
{:name => name, :description => desc, :degree => degree, :major => major, :period => period, :start_date => start_date, :end_date => end_date }
|
99
104
|
end
|
100
105
|
end
|
101
106
|
|
102
107
|
def websites
|
103
|
-
@websites ||= @page.search("
|
104
|
-
url =
|
108
|
+
@websites ||= @page.search(".websites li").flat_map do |site|
|
109
|
+
url = site.at("a")["href"]
|
105
110
|
CGI.parse(URI.parse(url).query)["url"]
|
106
111
|
end
|
107
112
|
end
|
108
113
|
|
109
114
|
def groups
|
110
|
-
@groups ||= @page.search(".
|
115
|
+
@groups ||= @page.search("#groups .group .item-title").map do |item|
|
111
116
|
name = item.text.gsub(/\s+|\n/, " ").strip
|
112
|
-
link =
|
117
|
+
link = item.at("a")['href']
|
113
118
|
{ :name => name, :link => link }
|
114
119
|
end
|
115
120
|
end
|
@@ -145,29 +150,29 @@ module Linkedin
|
|
145
150
|
|
146
151
|
|
147
152
|
def recommended_visitors
|
148
|
-
@recommended_visitors ||= @page.search(".insights
|
153
|
+
@recommended_visitors ||= @page.search(".insights .browse-map/ul/li.profile-card").map do |visitor|
|
149
154
|
v = {}
|
150
155
|
v[:link] = visitor.at("a")["href"]
|
151
156
|
v[:name] = visitor.at("h4/a").text
|
152
|
-
|
153
|
-
|
157
|
+
if visitor.at(".headline")
|
158
|
+
v[:title] = visitor.at(".headline").text.gsub("...", " ").split(" at ").first
|
159
|
+
v[:company] = visitor.at(".headline").text.gsub("...", " ").split(" at ")[1]
|
160
|
+
end
|
154
161
|
v
|
155
162
|
end
|
156
163
|
end
|
157
164
|
|
158
165
|
def projects
|
159
|
-
@projects ||= @page.search(".
|
160
|
-
project = project.at("div")
|
161
|
-
|
166
|
+
@projects ||= @page.search("#projects .project").map do |project|
|
162
167
|
p = {}
|
163
|
-
start_date, end_date = project.at("
|
168
|
+
start_date, end_date = project.at("date-range").text.gsub(/\s+|\n/, " ").strip.split(" – ") rescue nil
|
164
169
|
|
165
|
-
p[:title] = project.at("
|
166
|
-
p[:link] = project.at("
|
170
|
+
p[:title] = project.at(".item-title").text
|
171
|
+
p[:link] = CGI.parse(URI.parse(project.at(".item-title a")['href']).query)["url"][0] rescue nil
|
167
172
|
p[:start_date] = parse_date(start_date) rescue nil
|
168
173
|
p[:end_date] = parse_date(end_date) rescue nil
|
169
174
|
p[:description] = project.at(".description").text rescue nil
|
170
|
-
p[:associates] = project.
|
175
|
+
p[:associates] = project.search(".contributors .contributor").map{ |c| c.at("a").text } rescue nil
|
171
176
|
p
|
172
177
|
end
|
173
178
|
end
|
@@ -178,29 +183,39 @@ module Linkedin
|
|
178
183
|
end
|
179
184
|
|
180
185
|
private
|
186
|
+
# TODO: Bad code (hot fix); needs a proper refactor
|
187
|
+
def get_companies()
|
188
|
+
if @companies
|
189
|
+
return @companies
|
190
|
+
else
|
191
|
+
@companies = []
|
192
|
+
end
|
181
193
|
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
company[:start_date] = parse_date(
|
195
|
-
|
196
|
-
|
197
|
-
company_link = node.at("h4").next.at("a")["href"] if node.at("h4").next.at("a")
|
194
|
+
@page.search(".positions .position").each do |node|
|
195
|
+
company = {}
|
196
|
+
company[:title] = node.at(".item-title").text.gsub(/\s+|\n/, " ").strip if node.at(".item-title")
|
197
|
+
company[:company] = node.at(".item-subtitle").text.gsub(/\s+|\n/, " ").strip if node.at(".item-subtitle")
|
198
|
+
company[:description] = node.at(".description").text.gsub(/\s+|\n/, " ").strip if node.at(".description")
|
199
|
+
|
200
|
+
start_date, end_date = node.at(".meta").text.strip.split(" – ") rescue nil
|
201
|
+
company[:duration] = node.at(".meta").text[/.*\((.*)\)/, 1]
|
202
|
+
company[:start_date] = parse_date(start_date) rescue nil
|
203
|
+
if end_date.match(/Present/)
|
204
|
+
company[:end_date] = "Present"
|
205
|
+
else
|
206
|
+
company[:start_date] = parse_date(end_date) rescue nil
|
207
|
+
end
|
198
208
|
|
209
|
+
company_link = node.at(".item-subtitle").at("a")["href"] rescue nil
|
210
|
+
if company_link
|
199
211
|
result = get_company_details(company_link)
|
200
|
-
companies << company.merge!(result)
|
212
|
+
@companies << company.merge!(result)
|
213
|
+
else
|
214
|
+
@companies << company
|
201
215
|
end
|
202
216
|
end
|
203
|
-
|
217
|
+
|
218
|
+
@companies
|
204
219
|
end
|
205
220
|
|
206
221
|
def parse_date(date)
|
@@ -44,19 +44,19 @@ describe Linkedin::Profile do
|
|
44
44
|
end
|
45
45
|
|
46
46
|
describe '#industry' do
|
47
|
-
|
47
|
+
xit "returns list of profile's industries" do
|
48
48
|
expect(profile.industry).to eq "Internet"
|
49
49
|
end
|
50
50
|
end
|
51
51
|
|
52
52
|
describe '#skills' do
|
53
|
-
|
53
|
+
xit "returns list of profile's skills" do
|
54
54
|
expect(profile.skills).to include("Product Development")
|
55
55
|
end
|
56
56
|
end
|
57
57
|
|
58
58
|
describe '#websites' do
|
59
|
-
|
59
|
+
xit "returns list of profile's websites" do
|
60
60
|
expect(profile.websites).to include("http://www.linkedin.com/")
|
61
61
|
end
|
62
62
|
end
|
@@ -80,19 +80,25 @@ describe Linkedin::Profile do
|
|
80
80
|
end
|
81
81
|
|
82
82
|
describe '#summary' do
|
83
|
-
|
83
|
+
xit 'returns the summary of the profile' do
|
84
84
|
expect(profile.summary).to eq \
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
85
|
+
"Internet executive with over 19 years of experience, " \
|
86
|
+
"including general management of mid to large size organizations, corporate development, " \
|
87
|
+
"product development, business operations, and strategy. " \
|
88
|
+
"Currently CEO at LinkedIn, the web's largest and most powerful network of professionals. " \
|
89
|
+
"Prior to LinkedIn, was an Executive in Residence at Accel Partners and Greylock Partners. " \
|
90
|
+
"Primarily focused on advising the leadership teams of the firm's existing consumer technology portfolio companies " \
|
91
|
+
"while also working closely with the firm’s partners to evaluate new investment opportunities. " \
|
92
|
+
"Previously served in key leadership roles at Yahoo! for over seven years, " \
|
93
|
+
"most recently as the Executive Vice President of Yahoo!'s Network Division managing Yahoo's consumer web product portfolio, " \
|
94
|
+
"including Yahoo's Front Page, Mail, Search, and Media products. Specialties: general management, corporate development, " \
|
95
|
+
"product development, business operations, strategy, product marketing, non-profit governance"
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
describe '#number_of_connections' do
|
100
|
+
it 'returns the number of connections' do
|
101
|
+
expect(profile.number_of_connections).to eq '500+'
|
96
102
|
end
|
97
103
|
end
|
98
104
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedin-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 1.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yatish Mehta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-12-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|