linkedin-scraper 0.1.5 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/README.md +17 -7
- data/bin/linkedin-scraper +1 -1
- data/lib/linkedin_scraper/profile.rb +243 -0
- data/lib/{linkedin-scraper → linkedin_scraper}/version.rb +1 -1
- data/lib/linkedin_scraper.rb +5 -0
- data/linkedin-scraper.gemspec +7 -6
- data/spec/fixtures/jeffweiner08.html +308 -0
- data/spec/linkedin_scraper/.DS_Store +0 -0
- data/spec/linkedin_scraper/profile_spec.rb +104 -0
- metadata +27 -24
- data/lib/linkedin-scraper/profile.rb +0 -225
- data/lib/linkedin-scraper.rb +0 -5
- data/spec/fixtures/jgrevich.html +0 -9300
- data/spec/linkedin-scraper/profile_spec.rb +0 -154
metadata
CHANGED
@@ -1,58 +1,58 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedin-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.5
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yatish Mehta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-09-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '2'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '2'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rspec
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '3'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '3'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rake
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
45
|
+
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '10'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - "
|
52
|
+
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
55
|
-
description: 'Scrapes the
|
54
|
+
version: '10'
|
55
|
+
description: 'Scrapes the LinkedIn profile using the public url '
|
56
56
|
email:
|
57
57
|
executables:
|
58
58
|
- linkedin-scraper
|
@@ -67,15 +67,17 @@ files:
|
|
67
67
|
- README.md
|
68
68
|
- Rakefile
|
69
69
|
- bin/linkedin-scraper
|
70
|
-
- lib/
|
71
|
-
- lib/
|
72
|
-
- lib/
|
70
|
+
- lib/linkedin_scraper.rb
|
71
|
+
- lib/linkedin_scraper/profile.rb
|
72
|
+
- lib/linkedin_scraper/version.rb
|
73
73
|
- linkedin-scraper.gemspec
|
74
|
-
- spec/fixtures/
|
75
|
-
- spec/
|
74
|
+
- spec/fixtures/jeffweiner08.html
|
75
|
+
- spec/linkedin_scraper/.DS_Store
|
76
|
+
- spec/linkedin_scraper/profile_spec.rb
|
76
77
|
- spec/spec_helper.rb
|
77
78
|
homepage: https://github.com/yatishmehta27/linkedin-scraper
|
78
|
-
licenses:
|
79
|
+
licenses:
|
80
|
+
- MIT
|
79
81
|
metadata: {}
|
80
82
|
post_install_message:
|
81
83
|
rdoc_options: []
|
@@ -93,12 +95,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
93
95
|
version: '0'
|
94
96
|
requirements: []
|
95
97
|
rubyforge_project:
|
96
|
-
rubygems_version: 2.4.
|
98
|
+
rubygems_version: 2.4.6
|
97
99
|
signing_key:
|
98
100
|
specification_version: 4
|
99
101
|
summary: when a url of public linkedin profile page is given it scrapes the entire
|
100
102
|
page and converts into a accessible object
|
101
103
|
test_files:
|
102
|
-
- spec/fixtures/
|
103
|
-
- spec/
|
104
|
+
- spec/fixtures/jeffweiner08.html
|
105
|
+
- spec/linkedin_scraper/.DS_Store
|
106
|
+
- spec/linkedin_scraper/profile_spec.rb
|
104
107
|
- spec/spec_helper.rb
|
@@ -1,225 +0,0 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
module Linkedin
  # Wraps one fetched LinkedIn profile page and exposes the scraped fields as
  # lazily evaluated readers.
  #
  # The profile page is downloaded once, in #initialize. Every reader extracts
  # its value from that page on first use and caches it with `||=` (a nil
  # result is therefore recomputed on the next call). #past_companies and
  # #current_companies additionally fetch one company page per position that
  # carries a company link.
  #
  # NOTE(review): all CSS selectors below are carried over unchanged; they
  # presumably mirror LinkedIn's public-profile markup at the time of writing.
  class Profile
    # Names handed to Mechanize#user_agent_alias; one is picked at random for
    # every HTTP client that is built.
    USER_AGENTS = ['Windows IE 6', 'Windows IE 7', 'Windows Mozilla', 'Mac Safari', 'Mac FireFox', 'Mac Mozilla', 'Linux Mozilla', 'Linux Firefox', 'Linux Konqueror'].freeze

    # Reader names serialized by #to_json, in output order.
    ATTRIBUTES = %w(
      name
      first_name
      last_name
      title
      location
      country
      industry
      summary
      picture
      linkedin_url
      education
      groups
      websites
      languages
      skills
      certifications
      organizations
      past_companies
      current_companies
      recommended_visitors).freeze

    # Matches links that are already absolute www.linkedin.com URLs. Anchored
    # at the start so a LinkedIn URL embedded in a query string does not count.
    ABSOLUTE_LINKEDIN_URL = %r{\Ahttps?://www\.linkedin\.com/}

    # Separator between start and end date in date-range texts (spaced en dash).
    DATE_RANGE_SEPARATOR = ' – '.freeze

    attr_reader :page, :linkedin_url

    # Builds a profile for +url+, printing the error and returning nil instead
    # of raising when construction (i.e. the initial page fetch) fails.
    #
    # @param url [String] public profile URL
    # @return [Linkedin::Profile, nil]
    def self.get_profile(url)
      new(url)
    rescue => e
      puts e
      nil
    end

    # Fetches the profile page immediately.
    #
    # @param url [String] public profile URL
    # @raise whatever the HTTP client raises when the page cannot be fetched
    def initialize(url)
      @linkedin_url = url
      @page = http_client.get(url)
    end

    # @return [String] first and last name joined by a space; parts that are
    #   missing are left out (empty string when neither is available)
    def name
      [first_name, last_name].compact.join(' ')
    end

    # @return [String, nil] text of `.full-name` up to the first whitespace
    def first_name
      @first_name ||= name_parts[0]
    end

    # @return [String, nil] remainder of `.full-name` after the first
    #   whitespace; nil for a single-word name
    def last_name
      @last_name ||= name_parts[1]
    end

    # @return [String, nil] whitespace-normalized text of `.title`
    def title
      @title ||= text_at(@page, '.title')
    end

    # @return [String, nil] first comma-separated part of `.locality`
    def location
      @location ||= locality_parts.first
    end

    # @return [String, nil] last comma-separated part of `.locality`
    def country
      @country ||= locality_parts.last
    end

    # @return [String, nil] whitespace-normalized text of `.industry`
    def industry
      @industry ||= text_at(@page, '.industry')
    end

    # @return [String, nil] whitespace-normalized text of `.summary .description`
    def summary
      @summary ||= text_at(@page, '.summary .description')
    end

    # @return [String, nil] stripped `src` of the profile picture image
    def picture
      @picture ||= begin
        img = @page.at('.profile-picture img')
        src = img['src'] if img
        src.strip if src
      end
    end

    # @return [Array<String>] stripped skill names
    def skills
      @skills ||= @page.search('.skill-pill .endorse-item-name-text').map { |skill| skill.text.strip }
    end

    # @return [Array<Hash>] see #get_companies
    def past_companies
      @past_companies ||= get_companies('past')
    end

    # @return [Array<Hash>] see #get_companies
    def current_companies
      @current_companies ||= get_companies('current')
    end

    # @return [Array<Hash>] one hash per education entry with :name (h4),
    #   :description (h5) and :period (`.education-date`); missing parts are nil
    def education
      @education ||= @page.search('.background-education .education').map do |item|
        {
          name: text_at(item, 'h4'),
          description: text_at(item, 'h5'),
          period: text_at(item, '.education-date')
        }
      end
    end

    # Collects the `url` query parameters of the website links. Entries with
    # no anchor, no href or no query string contribute nothing.
    #
    # @return [Array<String>]
    def websites
      @websites ||= @page.search('#overview-summary-websites').flat_map do |site|
        anchor = site.at('a')
        next [] unless anchor && anchor['href']

        query = URI.parse("http://www.linkedin.com#{anchor['href']}").query
        query ? CGI.parse(query)['url'] : []
      end
    end

    # @return [Array<Hash>] one hash per group with :name and :link (nil when
    #   the entry has no anchor)
    def groups
      @groups ||= @page.search('.groups-name').map do |item|
        anchor = item.at('a')
        link = "http://www.linkedin.com#{anchor['href']}" if anchor
        { name: clean(item.text), link: link }
      end
    end

    # @return [Array<Hash>] one hash per organization with :name, :start_date
    #   and :end_date (Date or nil when absent/unparsable)
    def organizations
      @organizations ||= @page.search('#background-organizations .section-item').map do |item|
        start_date, end_date = date_range(item, '.organizations-date')
        { name: text_at(item, '.summary'), start_date: start_date, end_date: end_date }
      end
    end

    # @return [Array<Hash>] one hash per language with :language (raw h4 text)
    #   and :proficiency (whitespace-normalized)
    def languages
      @languages ||= @page.search('.background-languages #languages ol li').map do |item|
        heading = item.at('h4')
        {
          language: (heading.text if heading),
          proficiency: text_at(item, 'div.languages-proficiency')
        }
      end
    end

    # @return [Array<Hash>] one hash per certification with :name, :authority,
    #   :license and :start_date (all strings or nil)
    def certifications
      # NOTE(review): 'background-certifications' has no '#' or '.' prefix, so
      # it selects elements with that *tag name*; the sibling readers use
      # '#background-organizations' / '.background-education'. Looks like a
      # missing prefix, but the intended selector cannot be confirmed from
      # this file, so it is left unchanged.
      @certifications ||= @page.search('background-certifications').map do |item|
        {
          name: text_at(item, 'h4'),
          authority: text_at(item, 'h5'),
          license: text_at(item, '.specifics/.licence-number'),
          start_date: text_at(item, '.certification-date')
        }
      end
    end

    # @return [Array<Hash>] one hash per suggested profile with :link, :name,
    #   :title and :company; title/company are the parts of `.browse-map-title`
    #   before and after the first ' at '
    def recommended_visitors
      @recommended_visitors ||= @page.search('.insights-browse-map/ul/li').map do |visitor|
        anchor = visitor.at('a')
        heading = visitor.at('h4/a')
        blurb = visitor.at('.browse-map-title')
        title, company = blurb.text.gsub('...', ' ').split(' at ') if blurb

        {
          link: (anchor['href'] if anchor),
          name: (heading.text if heading),
          title: title,
          company: company
        }
      end
    end

    # Serializes every reader listed in ATTRIBUTES as one JSON object.
    #
    # Accepts (and forwards) the generator arguments so the profile can be
    # nested inside other structures, e.g. `[profile].to_json`.
    #
    # @return [String]
    def to_json(*args)
      require 'json' # loaded lazily, as before, so JSON is only needed on use
      ATTRIBUTES.each_with_object({}) { |attr, hash| hash[attr.to_sym] = public_send(attr) }.to_json(*args)
    end

    private

    # Collapses every whitespace run to a single space and trims the ends.
    def clean(text)
      text.gsub(/\s+/, ' ').strip
    end

    # Cleaned text of the first node under +node+ matching +selector+, or nil.
    def text_at(node, selector)
      found = node.at(selector)
      clean(found.text) if found
    end

    # `.full-name` split at the first whitespace into at most two stripped parts.
    def name_parts
      full_name = @page.at('.full-name')
      return [] unless full_name

      full_name.text.split(' ', 2).map(&:strip)
    end

    # Stripped comma-separated parts of `.locality`.
    def locality_parts
      locality = @page.at('.locality')
      return [] unless locality

      locality.text.split(',').map(&:strip)
    end

    # Parses the "<start> – <end>" text found at +selector+ into two dates.
    #
    # @return [Array(Date, Date)] each element is nil when absent or unparsable
    def date_range(node, selector)
      range = text_at(node, selector)
      return [nil, nil] unless range

      first, last = range.split(DATE_RANGE_SEPARATOR)
      [parse_date(first), parse_date(last)]
    end

    # Builds one hash per "<type>-position" entry of the experience section:
    # :title, :company, :description (only when present), :start_date,
    # :end_date, plus whatever #get_company_details returns for the company
    # link. Positions without a company link trigger no extra request.
    #
    # @param type [String] 'past' or 'current'
    # @return [Array<Hash>]
    def get_companies(type)
      @page.search(".background-experience .#{type}-position").map do |node|
        heading = node.at('h4')
        employer = heading.next if heading

        company = {}
        company[:title] = clean(heading.text) if heading
        company[:company] = clean(employer.text) if employer
        description = text_at(node, '.description')
        company[:description] = description if description
        company[:start_date], company[:end_date] = date_range(node, '.experience-date-locale')

        anchor = employer.at('a') if employer
        company_link = anchor['href'] if anchor
        company.merge!(company_link ? get_company_details(company_link) : {})
      end
    end

    # Parses +date+, treating a bare four-digit year (19xx/20xx) as January 1st
    # of that year.
    #
    # @param date [String, nil]
    # @return [Date, nil] nil when +date+ is nil or not parseable (e.g. "Present")
    def parse_date(date)
      return nil if date.nil?

      date = "#{date}-01-01" if date =~ /\A(19|20)\d{2}\z/
      Date.parse(date)
    rescue ArgumentError, TypeError
      nil
    end

    # Fetches the company page behind +link+ and scrapes its basic info.
    #
    # @param link [String] absolute or site-relative company link
    # @return [Hash] :linkedin_company_url, optionally :url and :address, plus
    #   one key per heading of the `.basic-info-about` list
    def get_company_details(link)
      result = { linkedin_company_url: get_linkedin_company_url(link) }
      page = http_client.get(result[:linkedin_company_url])

      site = page.at('.basic-info-about/ul/li/p/a')
      result[:url] = site.text if site

      info = page.at('.basic-info-about/ul')
      if info
        info.search('p').zip(info.search('h4')).each do |value, heading|
          next unless heading # more values than headings: nothing to key on

          # NOTE(review): keys are symbols derived from scraped heading text.
          result[heading.text.gsub(' ', '_').downcase.to_sym] = value.text.strip
        end
      end

      headquarters = page.at('.vcard.hq')
      address = headquarters.at('.adr') if headquarters
      result[:address] = address.text.gsub("\n", ' ').strip if address
      result
    end

    # Builds a fresh Mechanize agent with a random user agent alias and page
    # history disabled.
    def http_client
      Mechanize.new do |agent|
        agent.user_agent_alias = USER_AGENTS.sample
        agent.max_history = 0
      end
    end

    # Turns +link+ into an absolute LinkedIn URL. Links that already start
    # with http(s)://www.linkedin.com/ are returned untouched; anything else
    # is appended to the site root without doubling the slash.
    #
    # @param link [String, nil]
    # @return [String]
    def get_linkedin_company_url(link)
      return link if link =~ ABSOLUTE_LINKEDIN_URL

      "http://www.linkedin.com/#{link.to_s.sub(%r{\A/+}, '')}"
    end
  end
end
|