linkedin-scraper 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/README.md +17 -7
- data/bin/linkedin-scraper +1 -1
- data/lib/linkedin_scraper/profile.rb +243 -0
- data/lib/{linkedin-scraper → linkedin_scraper}/version.rb +1 -1
- data/lib/linkedin_scraper.rb +5 -0
- data/linkedin-scraper.gemspec +7 -6
- data/spec/fixtures/jeffweiner08.html +308 -0
- data/spec/linkedin_scraper/.DS_Store +0 -0
- data/spec/linkedin_scraper/profile_spec.rb +104 -0
- metadata +27 -24
- data/lib/linkedin-scraper/profile.rb +0 -225
- data/lib/linkedin-scraper.rb +0 -5
- data/spec/fixtures/jgrevich.html +0 -9300
- data/spec/linkedin-scraper/profile_spec.rb +0 -154
metadata
CHANGED
@@ -1,58 +1,58 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedin-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.5
|
4
|
+
version: 0.1.7
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yatish Mehta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-
|
11
|
+
date: 2015-09-14 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - "~>"
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
19
|
+
version: '2'
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- - "
|
24
|
+
- - "~>"
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: '
|
26
|
+
version: '2'
|
27
27
|
- !ruby/object:Gem::Dependency
|
28
28
|
name: rspec
|
29
29
|
requirement: !ruby/object:Gem::Requirement
|
30
30
|
requirements:
|
31
|
-
- - "
|
31
|
+
- - "~>"
|
32
32
|
- !ruby/object:Gem::Version
|
33
|
-
version: '
|
33
|
+
version: '3'
|
34
34
|
type: :development
|
35
35
|
prerelease: false
|
36
36
|
version_requirements: !ruby/object:Gem::Requirement
|
37
37
|
requirements:
|
38
|
-
- - "
|
38
|
+
- - "~>"
|
39
39
|
- !ruby/object:Gem::Version
|
40
|
-
version: '
|
40
|
+
version: '3'
|
41
41
|
- !ruby/object:Gem::Dependency
|
42
42
|
name: rake
|
43
43
|
requirement: !ruby/object:Gem::Requirement
|
44
44
|
requirements:
|
45
|
-
- - "
|
45
|
+
- - "~>"
|
46
46
|
- !ruby/object:Gem::Version
|
47
|
-
version: '
|
47
|
+
version: '10'
|
48
48
|
type: :development
|
49
49
|
prerelease: false
|
50
50
|
version_requirements: !ruby/object:Gem::Requirement
|
51
51
|
requirements:
|
52
|
-
- - "
|
52
|
+
- - "~>"
|
53
53
|
- !ruby/object:Gem::Version
|
54
|
-
version: '
|
55
|
-
description: 'Scrapes the
|
54
|
+
version: '10'
|
55
|
+
description: 'Scrapes the LinkedIn profile using the public url '
|
56
56
|
email:
|
57
57
|
executables:
|
58
58
|
- linkedin-scraper
|
@@ -67,15 +67,17 @@ files:
|
|
67
67
|
- README.md
|
68
68
|
- Rakefile
|
69
69
|
- bin/linkedin-scraper
|
70
|
-
- lib/linkedin-scraper.rb
|
71
|
-
- lib/linkedin-scraper/profile.rb
|
72
|
-
- lib/linkedin-scraper/version.rb
|
70
|
+
- lib/linkedin_scraper.rb
|
71
|
+
- lib/linkedin_scraper/profile.rb
|
72
|
+
- lib/linkedin_scraper/version.rb
|
73
73
|
- linkedin-scraper.gemspec
|
74
|
-
- spec/fixtures/jgrevich.html
|
75
|
-
- spec/linkedin-scraper/profile_spec.rb
|
74
|
+
- spec/fixtures/jeffweiner08.html
|
75
|
+
- spec/linkedin_scraper/.DS_Store
|
76
|
+
- spec/linkedin_scraper/profile_spec.rb
|
76
77
|
- spec/spec_helper.rb
|
77
78
|
homepage: https://github.com/yatishmehta27/linkedin-scraper
|
78
|
-
licenses:
|
79
|
+
licenses:
|
80
|
+
- MIT
|
79
81
|
metadata: {}
|
80
82
|
post_install_message:
|
81
83
|
rdoc_options: []
|
@@ -93,12 +95,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
93
95
|
version: '0'
|
94
96
|
requirements: []
|
95
97
|
rubyforge_project:
|
96
|
-
rubygems_version: 2.4.
|
98
|
+
rubygems_version: 2.4.6
|
97
99
|
signing_key:
|
98
100
|
specification_version: 4
|
99
101
|
summary: when a url of public linkedin profile page is given it scrapes the entire
|
100
102
|
page and converts into a accessible object
|
101
103
|
test_files:
|
102
|
-
- spec/fixtures/jgrevich.html
|
103
|
-
- spec/linkedin-scraper/profile_spec.rb
|
104
|
+
- spec/fixtures/jeffweiner08.html
|
105
|
+
- spec/linkedin_scraper/.DS_Store
|
106
|
+
- spec/linkedin_scraper/profile_spec.rb
|
104
107
|
- spec/spec_helper.rb
|
@@ -1,225 +0,0 @@
|
|
1
|
-
# -*- coding: utf-8 -*-
|
2
|
-
module Linkedin
  # Read-only view of a single LinkedIn profile page.
  #
  # The page is downloaded once by #initialize; each reader then extracts one
  # field from it with a CSS selector and memoizes the result with `||=`
  # (so a reader whose result is nil is evaluated again on the next call).
  class Profile
    # Mechanize user-agent aliases; #http_client picks one at random per agent.
    USER_AGENTS = ['Windows IE 6', 'Windows IE 7', 'Windows Mozilla', 'Mac Safari', 'Mac FireFox', 'Mac Mozilla', 'Linux Mozilla', 'Linux Firefox', 'Linux Konqueror'].freeze

    # Names of the readers that #to_json serializes.
    ATTRIBUTES = %w(
      name
      first_name
      last_name
      title
      location
      country
      industry
      summary
      picture
      linkedin_url
      education
      groups
      websites
      languages
      skills
      certifications
      organizations
      past_companies
      current_companies
      recommended_visitors).freeze

    attr_reader :page, :linkedin_url

    # Builds a profile for +url+ without letting a failure escape.
    #
    # @param url [String] address of the profile page
    # @return [Linkedin::Profile, nil] nil when building the profile raised a StandardError
    def self.get_profile(url)
      Linkedin::Profile.new(url)
    rescue => e
      # Best-effort entry point: the caller gets nil and the reason goes to
      # stderr, keeping stdout free for the caller's own output.
      $stderr.puts e
      nil
    end

    # Fetches +url+ and keeps the resulting page for the readers below.
    #
    # @param url [String] address of the profile page
    def initialize(url)
      @linkedin_url = url
      @page = http_client.get(url)
    end

    # @return [String] first and last name joined by one space; a missing part
    #   is left out rather than leaving a dangling space
    def name
      [first_name, last_name].compact.join(' ')
    end

    # @return [String, nil] text of `.full-name` up to the first whitespace
    def first_name
      @first_name ||= name_parts[0]
    end

    # @return [String, nil] rest of `.full-name` after the first word; nil for a
    #   one-word name
    def last_name
      @last_name ||= name_parts[1]
    end

    # @return [String, nil] whitespace-collapsed text of `.title`
    def title
      @title ||= squish(@page.at('.title'))
    end

    # @return [String, nil] part of `.locality` before the first comma
    def location
      @location ||= locality_parts.first
    end

    # @return [String, nil] part of `.locality` after the last comma (equal to
    #   #location when the text contains no comma)
    def country
      @country ||= locality_parts.last
    end

    # @return [String, nil] whitespace-collapsed text of `.industry`
    def industry
      @industry ||= squish(@page.at('.industry'))
    end

    # @return [String, nil] whitespace-collapsed text of `.summary .description`
    def summary
      @summary ||= squish(@page.at('.summary .description'))
    end

    # @return [String, nil] `src` of the profile image, nil when the image or
    #   the attribute is absent
    def picture
      @picture ||= begin
        image = @page.at('.profile-picture img')
        source = image && image['src']
        source.strip if source
      end
    end

    # @return [Array<String>] stripped skill names
    def skills
      @skills ||= @page.search('.skill-pill .endorse-item-name-text').map { |skill| skill.text.strip }
    end

    # @return [Array<Hash>] see #get_companies
    def past_companies
      @past_companies ||= get_companies('past')
    end

    # @return [Array<Hash>] see #get_companies
    def current_companies
      @current_companies ||= get_companies('current')
    end

    # @return [Array<Hash>] one `{:name, :description, :period}` hash per
    #   education entry; a value is nil when its element is missing
    def education
      @education ||= @page.search('.background-education .education').map do |item|
        {
          :name => squish(item.at('h4')),
          :description => squish(item.at('h5')),
          :period => squish(item.at('.education-date'))
        }
      end
    end

    # Reads the `url` query parameter of the first link inside each
    # `#overview-summary-websites` element.
    #
    # @return [Array<String>] decoded target URLs; an element without a link or
    #   without a query string contributes nothing
    def websites
      @websites ||= @page.search('#overview-summary-websites').flat_map do |site|
        anchor = site.at('a')
        anchor ? redirect_targets(anchor['href']) : []
      end
    end

    # @return [Array<Hash>] one `{:name, :link}` hash per `.groups-name`
    #   element; :link is nil when the element holds no anchor
    def groups
      @groups ||= @page.search('.groups-name').map do |item|
        anchor = item.at('a')
        link = "http://www.linkedin.com#{anchor['href']}" if anchor
        { :name => squish(item), :link => link }
      end
    end

    # @return [Array<Hash>] one `{:name, :start_date, :end_date}` hash per
    #   organization; dates are Date objects, or nil when absent or unparseable
    def organizations
      @organizations ||= @page.search('#background-organizations .section-item').map do |item|
        period = squish(item.at('.organizations-date'))
        from, to = period ? period.split(' – ') : []
        {
          :name => squish(item.at('.summary')),
          :start_date => try_date(from) { |text| Date.parse(text) },
          :end_date => try_date(to) { |text| Date.parse(text) }
        }
      end
    end

    # @return [Array<Hash>] one `{:language, :proficiency}` hash per entry;
    #   :language is the raw `h4` text, :proficiency is whitespace-collapsed
    def languages
      @languages ||= @page.search('.background-languages #languages ol li').map do |item|
        heading = item.at('h4')
        {
          :language => (heading.text if heading),
          :proficiency => squish(item.at('div.languages-proficiency'))
        }
      end
    end

    # @return [Array<Hash>] one `{:name, :authority, :license, :start_date}`
    #   hash per matched element
    def certifications
      # NOTE(review): unlike the sibling readers this selector has no '#' or '.'
      # prefix, so it matches elements *named* background-certifications.
      # Probably a typo; confirm against the real page markup before changing.
      @certifications ||= @page.search('background-certifications').map do |item|
        {
          :name => squish(item.at('h4')),
          :authority => squish(item.at('h5')),
          :license => squish(item.at('.specifics/.licence-number')),
          :start_date => squish(item.at('.certification-date'))
        }
      end
    end

    # @return [Array<Hash>] one `{:link, :name, :title, :company}` hash per
    #   entry of the browse map; a value is nil when its element is missing
    def recommended_visitors
      @recommended_visitors ||= @page.search('.insights-browse-map/ul/li').map do |visitor|
        anchor = visitor.at('a')
        heading = visitor.at('h4/a')
        blurb = visitor.at('.browse-map-title')
        # "<title> at <company>": the first two segments are kept.
        role, employer = blurb ? blurb.text.gsub('...', ' ').split(' at ') : []
        {
          :link => (anchor['href'] if anchor),
          :name => (heading.text if heading),
          :title => role,
          :company => employer
        }
      end
    end

    # Serializes every reader listed in ATTRIBUTES.
    #
    # Accepts (and forwards) the generator arguments the json library passes
    # when a profile is nested in another structure, e.g. `[profile].to_json`.
    #
    # @return [String] JSON object keyed by attribute name
    def to_json(*args)
      require 'json' # kept local, as before, so the class loads without json
      ATTRIBUTES.each_with_object({}) { |attr, hash| hash[attr.to_sym] = send(attr) }.to_json(*args)
    end

    private

    # Collects the positions of one kind from the experience section.
    #
    # @param type [String] 'past' or 'current' (interpolated into the selector)
    # @return [Array<Hash>] per position: :title, :company and :description when
    #   their elements exist, :start_date and :end_date (Date or nil), plus the
    #   keys from #get_company_details when the position links to a company page
    def get_companies(type)
      @page.search(".background-experience .#{type}-position").map do |node|
        heading = node.at('h4')
        employer = heading && heading.next
        blurb = node.at('.description')
        dates = node.at('.experience-date-locale')
        from, to = dates ? dates.text.strip.split(' – ') : []

        company = {}
        company[:title] = squish(heading) if heading
        company[:company] = squish(employer) if employer
        company[:description] = squish(blurb) if blurb
        company[:start_date] = try_date(from) { |text| parse_date(text) }
        company[:end_date] = try_date(to) { |text| parse_date(text) }

        anchor = employer && employer.at('a')
        href = anchor && anchor['href']
        # Without a link there is no company page, so no request is made.
        company.merge!(get_company_details(href)) if href
        company
      end
    end

    # @param date [String] date text; a bare year is read as January 1st of it
    # @return [Date]
    # @raise [ArgumentError] when the text is not a recognizable date
    def parse_date(date)
      date = "#{date}-01-01" if date =~ /^(19|20)\d{2}$/
      Date.parse(date)
    end

    # Downloads a company page and extracts its details.
    #
    # @param link [String] absolute or site-relative address of the company page
    # @return [Hash] :linkedin_company_url always; :url, :address and one key
    #   per heading of the `.basic-info-about` list when present on the page
    def get_company_details(link)
      result = { :linkedin_company_url => get_linkedin_company_url(link) }
      page = http_client.get(result[:linkedin_company_url])

      website = page.at('.basic-info-about/ul/li/p/a')
      result[:url] = website.text if website

      facts = page.at('.basic-info-about/ul')
      if facts
        facts.search('p').zip(facts.search('h4')).each do |value, label|
          next unless label # more values than headings: nothing to key it by

          result[label.text.gsub(' ', '_').downcase.to_sym] = value.text.strip
        end
      end

      headquarters = page.at('.vcard.hq')
      address = headquarters && headquarters.at('.adr')
      result[:address] = address.text.gsub("\n", ' ').strip if address
      result
    end

    # @return [Mechanize] a fresh agent with a random user agent and no history
    def http_client
      Mechanize.new do |agent|
        agent.user_agent_alias = USER_AGENTS.sample
        agent.max_history = 0
      end
    end

    # @param link [String, nil] absolute URL or site-relative path
    # @return [String] +link+ itself when it is already an absolute http(s)
    #   URL, otherwise the path appended to http://www.linkedin.com/
    def get_linkedin_company_url(link)
      return link if link =~ %r{\Ahttps?://}i

      # Drop leading slashes so the result never contains "//" after the host.
      "http://www.linkedin.com/#{link.to_s.sub(%r{\A/+}, '')}"
    end

    # @return [String, nil] text of +node+ with whitespace runs collapsed to one
    #   space and the ends trimmed; nil when +node+ is nil
    def squish(node)
      node.text.gsub(/\s+/, ' ').strip if node
    end

    # @return [Array<String>] non-empty [first word, remainder] of `.full-name`
    def name_parts
      node = @page.at('.full-name')
      return [] unless node

      node.text.split(' ', 2).map(&:strip).reject(&:empty?)
    end

    # @return [Array<String>] stripped comma-separated parts of `.locality`
    def locality_parts
      node = @page.at('.locality')
      return [] unless node

      node.text.split(',').map(&:strip)
    end

    # @param href [String, nil] site-relative redirect link
    # @return [Array<String>] decoded values of its `url` query parameter
    def redirect_targets(href)
      query = URI.parse("http://www.linkedin.com#{href}").query
      return [] unless query

      URI.decode_www_form(query).select { |key, _| key == 'url' }.map(&:last)
    rescue ArgumentError
      [] # malformed percent-encoding in the query string
    end

    # Yields +text+ to a date parser, mapping "no text" and "not a date" to nil.
    #
    # @param text [String, nil]
    # @return [Date, nil]
    def try_date(text)
      return nil if text.nil?

      yield text
    rescue ArgumentError, TypeError, RangeError
      nil
    end
  end
end
|