linkedin-scraper 1.1.0 → 2.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -1
- data/CHANGE.md +0 -0
- data/README.md +48 -123
- data/bin/linkedin-scraper +4 -2
- data/lib/{linkedin_scraper.rb → linkedin-scraper.rb} +1 -1
- data/lib/linkedin-scraper/profile.rb +284 -0
- data/lib/{linkedin_scraper → linkedin-scraper}/version.rb +1 -1
- data/linkedin-scraper.gemspec +1 -1
- data/spec/linkedin_scraper/profile_spec.rb +1 -7
- data/spec/spec_helper.rb +1 -10
- metadata +7 -6
- data/lib/linkedin_scraper/profile.rb +0 -265
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 5ad336d03d93fa02d91f8c8452d594de789caba6
|
4
|
+
data.tar.gz: 157ac80ca5c1887181d83c69a40fdba7523298e0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: afda2323f3441e925e9d4dad00f7df991ecdfd918268b79cf258ba908f8eb1f0c76bc453488d15fc74302330163f4b7df87ff1f39d228086f80f2d3714bafa28
|
7
|
+
data.tar.gz: 3dcd605d5076a186779b9a37177797f23b47ecc13d0ca646abc176e48933710a09192480261c2220231017d4efb5c517ed79ecb5ad4d6d8bb9954cf15c866f44
|
data/.travis.yml
CHANGED
data/CHANGE.md
ADDED
File without changes
|
data/README.md
CHANGED
@@ -4,6 +4,8 @@
|
|
4
4
|
Linkedin Scraper
|
5
5
|
================
|
6
6
|
|
7
|
+
**2.0.0 is the new version. It does not support the `get_profile` method. It does not support Ruby 1.8**
|
8
|
+
|
7
9
|
Linkedin-scraper is a gem for scraping linkedin public profiles.
|
8
10
|
Given the URL of the profile, it gets the name, country, title, area, current companies, past companies,
|
9
11
|
organizations, skills, groups, etc
|
@@ -15,22 +17,29 @@ Install the gem from RubyGems:
|
|
15
17
|
|
16
18
|
gem install linkedin-scraper
|
17
19
|
|
18
|
-
This gem is tested on 1.9.2, 1.9.3, 2.0.0, 2.2, 2.3
|
20
|
+
This gem is tested on 1.9.2, 1.9.3, 2.0.0, 2.2, 2.3
|
19
21
|
|
20
22
|
## Usage
|
21
23
|
Include the gem
|
22
24
|
|
23
|
-
require 'linkedin_scraper'
|
25
|
+
require 'linkedin-scraper'
|
24
26
|
|
25
27
|
Initialize a scraper instance
|
26
28
|
|
27
|
-
profile = Linkedin::Profile.get_profile("http://www.linkedin.com/in/jeffweiner08")
|
28
|
-
|
29
|
+
profile = Linkedin::Profile.new("http://www.linkedin.com/in/jeffweiner08")
|
30
|
+
|
29
31
|
|
30
32
|
With a http web-proxy:
|
31
33
|
|
32
|
-
profile = Linkedin::Profile.get_profile("http://www.linkedin.com/in/jeffweiner08", { proxy_ip: '127.0.0.1', proxy_port: '3128', username: 'user', password: 'pass' })
|
34
|
+
profile = Linkedin::Profile.new("http://www.linkedin.com/in/jeffweiner08", { proxy_ip: '127.0.0.1', proxy_port: '3128', username: 'user', password: 'pass' })
|
35
|
+
|
36
|
+
The scraper can also get the details of each past and current companies. This will lead to multiple hits.
|
37
|
+
To enable this functionality, pass `company_details=true` in options. You can pass them along with proxy options
|
38
|
+
as well
|
33
39
|
|
40
|
+
profile = Linkedin::Profile.new("http://www.linkedin.com/in/jeffweiner08", { company_details: true })
|
41
|
+
|
42
|
+
profile = Linkedin::Profile.new("http://www.linkedin.com/in/jeffweiner08", { company_details: true, proxy_ip: '127.0.0.1', proxy_port: '3128', username: 'user', password: 'pass' })
|
34
43
|
|
35
44
|
The returning object responds to the following methods
|
36
45
|
|
@@ -71,24 +80,35 @@ The returning object responds to the following methods
|
|
71
80
|
|
72
81
|
|
73
82
|
For current and past companies it also provides the details of the companies like company size, industry, address, etc
|
83
|
+
The company details will only be scraped if you pass company_details=true. It is false by default.
|
84
|
+
|
74
85
|
|
75
86
|
profile.current_companies
|
76
87
|
|
77
88
|
[
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
:
|
88
|
-
:
|
89
|
-
|
90
|
-
|
91
|
-
|
89
|
+
[0] {
|
90
|
+
:title => "CEO",
|
91
|
+
:company => "LinkedIn",
|
92
|
+
:company_logo => "https://media.licdn.com/media/AAEAAQAAAAAAAAL0AAAAJGMwYWZhNTYxLWJkMTktNDAzMi05NzEzLTlhNzUxMGU0NDg0Mw.png",
|
93
|
+
:duration => "7 years 6 months",
|
94
|
+
:start_date => #<Date: 2008-12-01 ((2454802j,0s,0n),+0s,2299161j)>,
|
95
|
+
:end_date => "Present",
|
96
|
+
:linkedin_company_url => "https://www.linkedin.com/company/linkedin",
|
97
|
+
:website => "http://www.linkedin.com",
|
98
|
+
:description => "The future is all about what you do next and we’re excited to help you get there. Ready for your moonshot? You're closer than you think. \r\n\r\nFounded in 2003, LinkedIn connects the world's professionals to make them more productive and successful. With more than 430 million members worldwide, including executives from every Fortune 500 company, LinkedIn is the world's largest professional network on the Internet. The company has a diversified business model with revenue coming from Talent Solutions, Marketing Solutions and Premium Subscriptions products. Headquartered in Silicon Valley, LinkedIn has offices across the globe.",
|
99
|
+
:company_size => "5001-10,000 employees",
|
100
|
+
:type => "Public Company",
|
101
|
+
:industry => "Internet",
|
102
|
+
:founded => 2003,
|
103
|
+
:address => "2029 Stierlin Court Mountain View, CA 94043 United States",
|
104
|
+
:street1 => "2029 Stierlin Court",
|
105
|
+
:street2 => "",
|
106
|
+
:city => "Mountain View",
|
107
|
+
:zip => "94043",
|
108
|
+
:state => "CA",
|
109
|
+
:country => "United States"
|
110
|
+
}
|
111
|
+
]
|
92
112
|
[1] {
|
93
113
|
:current_company => "Intuit",
|
94
114
|
:current_title => "Member, Board of Directors",
|
@@ -97,114 +117,16 @@ For current and past companies it also provides the details of the companies lik
|
|
97
117
|
:linkedin_company_url => "http://www.linkedin.com/company/intuit?trk=ppro_cprof",
|
98
118
|
:url => "http://network.intuit.com/",
|
99
119
|
:type => "Public Company",
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
:founded => "1983",
|
104
|
-
:address => "2632 Marine Way Mountain View, CA 94043 United States"
|
105
|
-
},
|
106
|
-
[2] {
|
107
|
-
:current_company => "DonorsChoose",
|
108
|
-
:current_title => "Member, Board of Directors",
|
109
|
-
:current_company_url => "http://www.donorschoose.org",
|
110
|
-
:description => nil,
|
111
|
-
:linkedin_company_url => "http://www.linkedin.com/company/donorschoose.org?trk=ppro_cprof",
|
112
|
-
:url => "http://www.donorschoose.org",
|
113
|
-
:type => "Nonprofit",
|
114
|
-
:company_size => "51-200 employees",
|
115
|
-
:website => "http://www.donorschoose.org",
|
116
|
-
:industry => "Nonprofit Organization Management",
|
117
|
-
:founded => "2000",
|
118
|
-
:address => "213 West 35th Street 2nd Floor East New York, NY 10001 United States"
|
119
|
-
},
|
120
|
-
[3] {
|
121
|
-
:current_company => "Malaria No More",
|
122
|
-
:current_title => "Member, Board of Directors",
|
123
|
-
:current_company_url => nil,
|
124
|
-
:description => nil
|
125
|
-
},
|
126
|
-
[4] {
|
127
|
-
:current_company => "Venture For America",
|
128
|
-
:current_title => "Member, Advisory Board",
|
129
|
-
:current_company_url => "http://ventureforamerica.org/",
|
130
|
-
:description => nil,
|
131
|
-
:linkedin_company_url => "http://www.linkedin.com/company/venture-for-america?trk=ppro_cprof",
|
132
|
-
:url => "http://ventureforamerica.org/",
|
133
|
-
:type => "Nonprofit",
|
134
|
-
:company_size => "1-10 employees",
|
135
|
-
:website => "http://ventureforamerica.org/",
|
136
|
-
:industry => "Nonprofit Organization Management",
|
137
|
-
:founded => "2011"
|
138
|
-
}
|
139
|
-
]
|
140
|
-
|
120
|
+
.
|
121
|
+
.
|
122
|
+
.
|
141
123
|
|
142
124
|
profile.past_companies
|
143
|
-
|
144
|
-
[0] {
|
145
|
-
:past_company => "Accel Partners",
|
146
|
-
:past_title => "Executive in Residence",
|
147
|
-
:past_company_website => "http://www.facebook.com/accel",
|
148
|
-
:description => nil,
|
149
|
-
:linkedin_company_url => "http://www.linkedin.com/company/accel-partners?trk=ppro_cprof",
|
150
|
-
:url => "http://www.facebook.com/accel",
|
151
|
-
:type => "Partnership",
|
152
|
-
:company_size => "51-200 employees",
|
153
|
-
:website => "http://www.facebook.com/accel",
|
154
|
-
:industry => "Venture Capital & Private Equity",
|
155
|
-
:address => "428 University Palo Alto, CA 94301 United States"
|
156
|
-
},
|
157
|
-
[1] {
|
158
|
-
:past_company => "Greylock",
|
159
|
-
:past_title => "Executive in Residence",
|
160
|
-
:past_company_website => "http://www.greylock.com",
|
161
|
-
:description => nil,
|
162
|
-
:linkedin_company_url => "http://www.linkedin.com/company/greylock-partners?trk=ppro_cprof",
|
163
|
-
:url => "http://www.greylock.com",
|
164
|
-
:type => "Partnership",
|
165
|
-
:company_size => "51-200 employees",
|
166
|
-
:website => "http://www.greylock.com",
|
167
|
-
:industry => "Venture Capital & Private Equity",
|
168
|
-
:address => "2550 Sand Hill Road Menlo Park, CA 94025 United States"
|
169
|
-
},
|
170
|
-
[2] {
|
171
|
-
:past_company => "Yahoo!",
|
172
|
-
:past_title => "Executive Vice President Network Division",
|
173
|
-
:past_company_website => "http://www.yahoo.com",
|
174
|
-
:description => nil,
|
175
|
-
:linkedin_company_url => "http://www.linkedin.com/company/yahoo?trk=ppro_cprof",
|
176
|
-
:url => "http://www.yahoo.com",
|
177
|
-
:type => "Public Company",
|
178
|
-
:company_size => "10,001+ employees",
|
179
|
-
:website => "http://www.yahoo.com",
|
180
|
-
:industry => "Internet",
|
181
|
-
:founded => "1994",
|
182
|
-
:address => "701 First Avenue Sunnyvale, CA 94089 United States"
|
183
|
-
},
|
184
|
-
[3] {
|
185
|
-
:past_company => "Windsor Media",
|
186
|
-
:past_title => "Founding Partner",
|
187
|
-
:past_company_website => nil,
|
188
|
-
:description => nil
|
189
|
-
},
|
190
|
-
[4] {
|
191
|
-
:past_company => "Warner Bros.",
|
192
|
-
:past_title => "Vice President Online",
|
193
|
-
:past_company_website => "http://www.warnerbros.com/",
|
194
|
-
:description => nil,
|
195
|
-
:linkedin_company_url => "http://www.linkedin.com/company/warner-bros.-entertainment-group-of-companies?trk=ppro_cprof",
|
196
|
-
:url => "http://www.warnerbros.com/",
|
197
|
-
:type => "Public Company",
|
198
|
-
:company_size => "10,001+ employees",
|
199
|
-
:website => "http://www.warnerbros.com/",
|
200
|
-
:industry => "Entertainment",
|
201
|
-
:address => "4000 Warner Boulevard Burbank, CA 91522 United States"
|
202
|
-
}
|
203
|
-
]
|
125
|
+
# Same as current companies
|
204
126
|
|
205
127
|
|
206
128
|
profile.recommended_visitors
|
207
|
-
#It is the list of visitors "Viewers of this profile also viewed..."
|
129
|
+
# It is the list of visitors "Viewers of this profile also viewed..."
|
208
130
|
[
|
209
131
|
[0] {
|
210
132
|
:link => "http://www.linkedin.com/in/barackobama?trk=pub-pbmap",
|
@@ -264,10 +186,13 @@ For current and past companies it also provides the details of the companies lik
|
|
264
186
|
|
265
187
|
|
266
188
|
The gem also comes with a binary and can be used from the command line to get a json response of the scraped data.
|
267
|
-
It takes the url as the first argument.
|
189
|
+
It takes the url as the first argument. If the last argument is true it will fetch the company details for each company
|
268
190
|
|
269
191
|
linkedin-scraper http://www.linkedin.com/in/jeffweiner08 127.0.0.1 3128 username password
|
270
192
|
|
193
|
+
linkedin-scraper http://www.linkedin.com/in/jeffweiner08 127.0.0.1 3128 username password true
|
194
|
+
|
195
|
+
|
271
196
|
## Contributing
|
272
197
|
|
273
198
|
Bug reports and pull requests are welcome on GitHub at https://github.com/yatish27/linkedin-scraper.
|
data/bin/linkedin-scraper
CHANGED
@@ -1,10 +1,12 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
2
|
|
3
|
-
require_relative '../lib/linkedin_scraper'
|
3
|
+
require_relative '../lib/linkedin-scraper'
|
4
4
|
options = {}
|
5
|
-
options[:proxy_ip] = ARGV[1]
|
5
|
+
options[:proxy_ip] = ARGV[1]
|
6
6
|
options[:proxy_port] = ARGV[2]
|
7
7
|
options[:username] = ARGV[3]
|
8
8
|
options[:password] = ARGV[4]
|
9
|
+
options[:company_details] = ARGV[5]
|
10
|
+
|
9
11
|
profile = Linkedin::Profile.new(ARGV[0], options)
|
10
12
|
puts JSON.pretty_generate JSON.parse(profile.to_json)
|
@@ -3,4 +3,4 @@ require "mechanize"
|
|
3
3
|
require "cgi"
|
4
4
|
require "net/http"
|
5
5
|
require "random_user_agent"
|
6
|
-
Dir["#{File.expand_path(File.dirname(__FILE__))}/linkedin_scraper/*.rb"].each { |file| require file }
|
6
|
+
Dir["#{File.expand_path(File.dirname(__FILE__))}/linkedin-scraper/*.rb"].each { |file| require file }
|
@@ -0,0 +1,284 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
module Linkedin
|
3
|
+
class Profile
|
4
|
+
ATTRIBUTES = %w(
|
5
|
+
name
|
6
|
+
first_name
|
7
|
+
last_name
|
8
|
+
title
|
9
|
+
location
|
10
|
+
number_of_connections
|
11
|
+
country
|
12
|
+
industry
|
13
|
+
summary
|
14
|
+
picture
|
15
|
+
projects
|
16
|
+
linkedin_url
|
17
|
+
education
|
18
|
+
groups
|
19
|
+
websites
|
20
|
+
languages
|
21
|
+
skills
|
22
|
+
certifications
|
23
|
+
organizations
|
24
|
+
past_companies
|
25
|
+
current_companies
|
26
|
+
recommended_visitors )
|
27
|
+
|
28
|
+
attr_reader :page, :linkedin_url
|
29
|
+
|
30
|
+
def initialize(url, options = {})
|
31
|
+
@linkedin_url = url
|
32
|
+
@options = options
|
33
|
+
@page = http_client.get(url)
|
34
|
+
end
|
35
|
+
|
36
|
+
def name
|
37
|
+
"#{first_name} #{last_name}"
|
38
|
+
end
|
39
|
+
|
40
|
+
def first_name
|
41
|
+
@first_name ||= (@page.at('.fn').text.split(' ', 2)[0].strip if @page.at('.fn'))
|
42
|
+
end
|
43
|
+
|
44
|
+
def last_name
|
45
|
+
@last_name ||= (@page.at('.fn').text.split(' ', 2)[1].strip if @page.at('.fn'))
|
46
|
+
end
|
47
|
+
|
48
|
+
def title
|
49
|
+
@title ||= (@page.at('.title').text.gsub(/\s+/, ' ').strip if @page.at('.title'))
|
50
|
+
end
|
51
|
+
|
52
|
+
def location
|
53
|
+
@location ||= (@page.at('.locality').text.split(',').first.strip if @page.at('.locality'))
|
54
|
+
end
|
55
|
+
|
56
|
+
def country
|
57
|
+
@country ||= (@page.at('.locality').text.split(',').last.strip if @page.at('.locality'))
|
58
|
+
end
|
59
|
+
|
60
|
+
def number_of_connections
|
61
|
+
if @page.at('.member-connections')
|
62
|
+
@connections ||= (@page.at('.member-connections').text.match(/[0-9]+[\+]{0,1}/)[0])
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
def industry
|
67
|
+
if @page.at('#demographics .descriptor')
|
68
|
+
@industry ||= (@page.search('#demographics .descriptor')[-1].text.gsub(/\s+/, ' ').strip)
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
def summary
|
73
|
+
@summary ||= (@page.at('#summary .description').text.gsub(/\s+/, ' ').strip if @page.at('#summary .description'))
|
74
|
+
end
|
75
|
+
|
76
|
+
def picture
|
77
|
+
if @page.at('.profile-picture img')
|
78
|
+
@picture ||= @page.at('.profile-picture img').attributes.values_at('src', 'data-delayed-url').
|
79
|
+
compact.first.value.strip
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
def skills
|
84
|
+
@skills ||= (@page.search('.pills .skill:not(.see-less)').map { |skill| skill.text.strip if skill.text } rescue nil)
|
85
|
+
end
|
86
|
+
|
87
|
+
def past_companies
|
88
|
+
@past_companies ||= get_companies.reject { |c| c[:end_date] == 'Present' }
|
89
|
+
end
|
90
|
+
|
91
|
+
def current_companies
|
92
|
+
@current_companies ||= get_companies.find_all { |c| c[:end_date] == 'Present' }
|
93
|
+
end
|
94
|
+
|
95
|
+
def education
|
96
|
+
@education ||= @page.search('.schools .school').map do |item|
|
97
|
+
name = item.at('h4').text.gsub(/\s+|\n/, ' ').strip if item.at('h4')
|
98
|
+
desc = item.search('h5').last.text.gsub(/\s+|\n/, ' ').strip if item.search('h5').last
|
99
|
+
if item.search('h5').last.at('.degree')
|
100
|
+
degree = item.search('h5').last.at('.degree').text.gsub(/\s+|\n/, ' ').strip.gsub(/,$/, '')
|
101
|
+
end
|
102
|
+
major = item.search('h5').last.at('.major').text.gsub(/\s+|\n/, ' ').strip if item.search('h5').last.at('.major')
|
103
|
+
period = item.at('.date-range').text.gsub(/\s+|\n/, ' ').strip if item.at('.date-range')
|
104
|
+
start_date, end_date = item.at('.date-range').text.gsub(/\s+|\n/, ' ').strip.split(' – ') rescue nil
|
105
|
+
|
106
|
+
{
|
107
|
+
name: name,
|
108
|
+
description: desc,
|
109
|
+
degree: degree,
|
110
|
+
major: major,
|
111
|
+
period: period,
|
112
|
+
start_date: start_date,
|
113
|
+
end_date: end_date
|
114
|
+
}
|
115
|
+
end
|
116
|
+
end
|
117
|
+
|
118
|
+
def websites
|
119
|
+
@websites ||= @page.search('.websites li').flat_map do |site|
|
120
|
+
url = site.at('a')['href']
|
121
|
+
CGI.parse(URI.parse(url).query)['url']
|
122
|
+
end
|
123
|
+
end
|
124
|
+
|
125
|
+
def groups
|
126
|
+
@groups ||= @page.search('#groups .group .item-title').map do |item|
|
127
|
+
name = item.text.gsub(/\s+|\n/, ' ').strip
|
128
|
+
link = item.at('a')['href']
|
129
|
+
|
130
|
+
{ name: name, link: link }
|
131
|
+
end
|
132
|
+
end
|
133
|
+
|
134
|
+
def organizations
|
135
|
+
@organizations ||= @page.search('#background-organizations .section-item').map do |item|
|
136
|
+
name = item.at('.summary').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
137
|
+
start_date, end_date = item.at('.organizations-date').text.gsub(/\s+|\n/, ' ').strip.split(' – ') rescue nil
|
138
|
+
start_date = Date.parse(start_date) rescue nil
|
139
|
+
end_date = Date.parse(end_date) rescue nil
|
140
|
+
{name: name, start_date: start_date, end_date: end_date}
|
141
|
+
end
|
142
|
+
end
|
143
|
+
|
144
|
+
def languages
|
145
|
+
@languages ||= @page.search('.background-languages #languages ol li').map do |item|
|
146
|
+
language = item.at('h4').text rescue nil
|
147
|
+
proficiency = item.at('div.languages-proficiency').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
148
|
+
{ language: language, proficiency: proficiency }
|
149
|
+
end
|
150
|
+
end
|
151
|
+
|
152
|
+
def certifications
|
153
|
+
@certifications ||= @page.search('background-certifications').map do |item|
|
154
|
+
name = item.at('h4').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
155
|
+
authority = item.at('h5').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
156
|
+
license = item.at('.specifics/.licence-number').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
157
|
+
start_date = item.at('.certification-date').text.gsub(/\s+|\n/, ' ').strip rescue nil
|
158
|
+
|
159
|
+
{ name: name, authority: authority, license: license, start_date: start_date }
|
160
|
+
end
|
161
|
+
end
|
162
|
+
|
163
|
+
|
164
|
+
def recommended_visitors
|
165
|
+
@recommended_visitors ||= @page.search('.insights .browse-map/ul/li.profile-card').map do |node|
|
166
|
+
visitor = {}
|
167
|
+
visitor[:link] = node.at('a')['href']
|
168
|
+
visitor[:name] = node.at('h4/a').text
|
169
|
+
if node.at('.headline')
|
170
|
+
visitor[:title], visitor[:company], _ = node.at('.headline').text.gsub('...', ' ').split(' at ')
|
171
|
+
end
|
172
|
+
visitor
|
173
|
+
end
|
174
|
+
end
|
175
|
+
|
176
|
+
def projects
|
177
|
+
@projects ||= @page.search('#projects .project').map do |node|
|
178
|
+
project = {}
|
179
|
+
start_date, end_date = node.at('.date-range').text.gsub(/\s+|\n/, ' ').strip.split(' – ') rescue nil
|
180
|
+
|
181
|
+
project[:title] = node.at('.item-title').text
|
182
|
+
project[:link] = CGI.parse(URI.parse(node.at('.item-title a')['href']).query)['url'][0] rescue nil
|
183
|
+
project[:start_date] = parse_date(start_date) rescue nil
|
184
|
+
project[:end_date] = parse_date(end_date) rescue nil
|
185
|
+
project[:description] = node.at('.description').children().to_s rescue nil
|
186
|
+
project[:associates] = node.search('.contributors .contributor').map { |c| c.at('a').text } rescue nil
|
187
|
+
project
|
188
|
+
end
|
189
|
+
end
|
190
|
+
|
191
|
+
def to_json
|
192
|
+
require 'json'
|
193
|
+
ATTRIBUTES.reduce({}) { |hash, attr| hash[attr.to_sym] = self.send(attr.to_sym); hash }.to_json
|
194
|
+
end
|
195
|
+
|
196
|
+
private
|
197
|
+
def get_companies
|
198
|
+
if @companies
|
199
|
+
return @companies
|
200
|
+
else
|
201
|
+
@companies = []
|
202
|
+
end
|
203
|
+
|
204
|
+
@page.search('.positions .position').each do |node|
|
205
|
+
company = {}
|
206
|
+
company[:title] = node.at('.item-title').text.gsub(/\s+|\n/, ' ').strip if node.at('.item-title')
|
207
|
+
company[:company] = node.at('.item-subtitle').text.gsub(/\s+|\n/, ' ').strip if node.at('.item-subtitle')
|
208
|
+
company[:location] = node.at('.location').text if node.at('.location')
|
209
|
+
company[:description] = node.at('.description').text.gsub(/\s+|\n/, ' ').strip if node.at('.description')
|
210
|
+
company[:company_logo] = node.at('.logo a img').first[1] if node.at('.logo')
|
211
|
+
|
212
|
+
start_date, end_date = node.at('.date-range').text.strip.split(' – ') rescue nil
|
213
|
+
company[:duration] = node.at('.date-range').text[/.*\((.*)\)/, 1]
|
214
|
+
company[:start_date] = parse_date(start_date) rescue nil
|
215
|
+
|
216
|
+
if end_date && end_date.match(/Present/)
|
217
|
+
company[:end_date] = 'Present'
|
218
|
+
else
|
219
|
+
company[:end_date] = parse_date(end_date) rescue nil
|
220
|
+
end
|
221
|
+
|
222
|
+
company_link = node.at('.item-subtitle').at('a')['href'] rescue nil
|
223
|
+
if @options[:company_details] && company_link
|
224
|
+
company.merge!(get_company_details(company_link))
|
225
|
+
end
|
226
|
+
|
227
|
+
@companies << company
|
228
|
+
end
|
229
|
+
|
230
|
+
@companies
|
231
|
+
end
|
232
|
+
|
233
|
+
def parse_date(date)
|
234
|
+
date = '#{date}-01-01' if date =~ /^(19|20)\d{2}$/
|
235
|
+
Date.parse(date)
|
236
|
+
end
|
237
|
+
|
238
|
+
def get_company_details(link)
|
239
|
+
sleep(1.5)
|
240
|
+
parsed = URI::parse(get_linkedin_company_url(link))
|
241
|
+
parsed.fragment = parsed.query = nil
|
242
|
+
result = { linkedin_company_url: parsed.to_s }
|
243
|
+
|
244
|
+
page = http_client.get(parsed.to_s)
|
245
|
+
company_details = JSON.parse(page.at('#stream-footer-embed-id-content').children.first.text) rescue nil
|
246
|
+
if company_details
|
247
|
+
result[:website] = company_details['website']
|
248
|
+
result[:description] = company_details['description']
|
249
|
+
result[:company_size] = company_details['size']
|
250
|
+
result[:type] = company_details['companyType']
|
251
|
+
result[:industry] = company_details['industry']
|
252
|
+
result[:founded] = company_details['yearFounded']
|
253
|
+
headquarters = company_details['headquarters']
|
254
|
+
if headquarters
|
255
|
+
result[:address] = %{#{headquarters['street1']} #{headquarters['street2']} #{headquarters['city']}, #{headquarters['state']} #{headquarters['zip']} #{headquarters['country']}}
|
256
|
+
end
|
257
|
+
[:street1, :street2, :city, :zip, :state, :country].each do |section|
|
258
|
+
result[section] = headquarters[section.to_s]
|
259
|
+
end
|
260
|
+
end
|
261
|
+
result
|
262
|
+
end
|
263
|
+
|
264
|
+
def http_client
|
265
|
+
@http_client ||= Mechanize.new do |agent|
|
266
|
+
agent.user_agent = RandomUserAgent.randomize
|
267
|
+
if !@options.empty?
|
268
|
+
agent.set_proxy(@options[:proxy_ip], @options[:proxy_port], @options[:username], @options[:password])
|
269
|
+
end
|
270
|
+
agent.max_history = 0
|
271
|
+
end
|
272
|
+
end
|
273
|
+
|
274
|
+
def get_linkedin_company_url(link)
|
275
|
+
http = %r{http://www.linkedin.com/}
|
276
|
+
https = %r{https://www.linkedin.com/}
|
277
|
+
if http.match(link) || https.match(link)
|
278
|
+
link
|
279
|
+
else
|
280
|
+
"http://www.linkedin.com/#{link}"
|
281
|
+
end
|
282
|
+
end
|
283
|
+
end
|
284
|
+
end
|
data/linkedin-scraper.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
3
|
require 'spec_helper'
|
4
|
-
require 'linkedin_scraper'
|
4
|
+
require 'linkedin-scraper'
|
5
5
|
|
6
6
|
describe Linkedin::Profile do
|
7
7
|
# This is the HTML of https://www.linkedin.com/in/jeffweiner08
|
@@ -59,12 +59,6 @@ describe Linkedin::Profile do
|
|
59
59
|
end
|
60
60
|
end
|
61
61
|
|
62
|
-
describe '#groups' do
|
63
|
-
it "returns list of profile's groups" do
|
64
|
-
p profile.groups
|
65
|
-
end
|
66
|
-
end
|
67
|
-
|
68
62
|
describe '#name' do
|
69
63
|
it 'returns the first and last name of the profile' do
|
70
64
|
expect(profile.name).to eq "Jeff Weiner"
|
data/spec/spec_helper.rb
CHANGED
@@ -1,17 +1,8 @@
|
|
1
1
|
$LOAD_PATH << File.join(File.dirname(__FILE__), '../lib')
|
2
|
-
|
3
|
-
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
4
|
-
# Require this file using `require "spec_helper"` to ensure that it is only
|
5
|
-
# loaded once.
|
6
|
-
#
|
7
|
-
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
2
|
+
|
8
3
|
RSpec.configure do |config|
|
9
4
|
config.run_all_when_everything_filtered = true
|
10
5
|
config.filter_run :focus
|
11
6
|
|
12
|
-
# Run specs in random order to surface order dependencies. If you find an
|
13
|
-
# order dependency and want to debug it, you can fix the order by providing
|
14
|
-
# the seed, which is printed after each run.
|
15
|
-
# --seed 1234
|
16
7
|
config.order = 'random'
|
17
8
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedin-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.0
|
4
|
+
version: 2.1.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Yatish Mehta
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-05-
|
11
|
+
date: 2016-05-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: mechanize
|
@@ -76,14 +76,15 @@ files:
|
|
76
76
|
- ".gitignore"
|
77
77
|
- ".rubocop.yml"
|
78
78
|
- ".travis.yml"
|
79
|
+
- CHANGE.md
|
79
80
|
- Gemfile
|
80
81
|
- LICENSE
|
81
82
|
- README.md
|
82
83
|
- Rakefile
|
83
84
|
- bin/linkedin-scraper
|
84
|
-
- lib/linkedin_scraper.rb
|
85
|
-
- lib/linkedin_scraper/profile.rb
|
86
|
-
- lib/linkedin_scraper/version.rb
|
85
|
+
- lib/linkedin-scraper.rb
|
86
|
+
- lib/linkedin-scraper/profile.rb
|
87
|
+
- lib/linkedin-scraper/version.rb
|
87
88
|
- linkedin-scraper.gemspec
|
88
89
|
- spec/fixtures/jeffweiner08.html
|
89
90
|
- spec/linkedin_scraper/.DS_Store
|
@@ -109,7 +110,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
109
110
|
version: '0'
|
110
111
|
requirements: []
|
111
112
|
rubyforge_project:
|
112
|
-
rubygems_version: 2.4
|
113
|
+
rubygems_version: 2.6.4
|
113
114
|
signing_key:
|
114
115
|
specification_version: 4
|
115
116
|
summary: when a url of public linkedin profile page is given it scrapes the entire
|
@@ -1,265 +0,0 @@
|
|
1
|
-
# -*- encoding: utf-8 -*-
|
2
|
-
module Linkedin
|
3
|
-
class Profile
|
4
|
-
|
5
|
-
ATTRIBUTES = %w(
|
6
|
-
name
|
7
|
-
first_name
|
8
|
-
last_name
|
9
|
-
title
|
10
|
-
location
|
11
|
-
number_of_connections
|
12
|
-
country
|
13
|
-
industry
|
14
|
-
summary
|
15
|
-
picture
|
16
|
-
projects
|
17
|
-
linkedin_url
|
18
|
-
education
|
19
|
-
groups
|
20
|
-
websites
|
21
|
-
languages
|
22
|
-
skills
|
23
|
-
certifications
|
24
|
-
organizations
|
25
|
-
past_companies
|
26
|
-
current_companies
|
27
|
-
recommended_visitors)
|
28
|
-
|
29
|
-
attr_reader :page, :linkedin_url
|
30
|
-
|
31
|
-
# support old version
|
32
|
-
def self.get_profile(url, options = {})
|
33
|
-
Linkedin::Profile.new(url, options)
|
34
|
-
rescue => e
|
35
|
-
puts e
|
36
|
-
end
|
37
|
-
|
38
|
-
def initialize(url, options = {})
|
39
|
-
@linkedin_url = url
|
40
|
-
@options = options
|
41
|
-
@page = http_client.get(url)
|
42
|
-
end
|
43
|
-
|
44
|
-
def name
|
45
|
-
"#{first_name} #{last_name}"
|
46
|
-
end
|
47
|
-
|
48
|
-
def first_name
|
49
|
-
@first_name ||= (@page.at(".fn").text.split(" ", 2)[0].strip if @page.at(".fn"))
|
50
|
-
end
|
51
|
-
|
52
|
-
def last_name
|
53
|
-
@last_name ||= (@page.at(".fn").text.split(" ", 2)[1].strip if @page.at(".fn"))
|
54
|
-
end
|
55
|
-
|
56
|
-
def title
|
57
|
-
@title ||= (@page.at(".title").text.gsub(/\s+/, " ").strip if @page.at(".title"))
|
58
|
-
end
|
59
|
-
|
60
|
-
def location
|
61
|
-
@location ||= (@page.at(".locality").text.split(",").first.strip if @page.at(".locality"))
|
62
|
-
end
|
63
|
-
|
64
|
-
def number_of_connections
|
65
|
-
@connections ||= (@page.at(".member-connections").text.match(/[0-9]+[\+]{0,1}/)[0]) if @page.at(".member-connections")
|
66
|
-
end
|
67
|
-
|
68
|
-
def country
|
69
|
-
@country ||= (@page.at(".locality").text.split(",").last.strip if @page.at(".locality"))
|
70
|
-
end
|
71
|
-
|
72
|
-
def industry
|
73
|
-
@industry ||= (@page.search("#demographics .descriptor")[-1].text.gsub(/\s+/, " ").strip if @page.at("#demographics .descriptor"))
|
74
|
-
end
|
75
|
-
|
76
|
-
def summary
|
77
|
-
@summary ||= (@page.at("#summary .description").text.gsub(/\s+/, " ").strip if @page.at("#summary .description"))
|
78
|
-
end
|
79
|
-
|
80
|
-
def picture
|
81
|
-
@picture ||= (@page.at('.profile-picture img').attributes.values_at('src','data-delayed-url').compact.first.value.strip if @page.at('.profile-picture img'))
|
82
|
-
end
|
83
|
-
|
84
|
-
def skills
|
85
|
-
@skills ||= (@page.search(".pills .skill:not(.see-less)").map { |skill| skill.text.strip if skill.text } rescue nil)
|
86
|
-
end
|
87
|
-
|
88
|
-
def past_companies
|
89
|
-
@past_companies ||= get_companies().reject { |c| c[:end_date] == "Present"}
|
90
|
-
end
|
91
|
-
|
92
|
-
def current_companies
|
93
|
-
@current_companies ||= get_companies().find_all{ |c| c[:end_date] == "Present"}
|
94
|
-
end
|
95
|
-
|
96
|
-
def education
|
97
|
-
@education ||= @page.search(".schools .school").map do |item|
|
98
|
-
name = item.at("h4").text.gsub(/\s+|\n/, " ").strip if item.at("h4")
|
99
|
-
desc = item.search("h5").last.text.gsub(/\s+|\n/, " ").strip if item.search("h5").last
|
100
|
-
degree = item.search("h5").last.at(".degree").text.gsub(/\s+|\n/, " ").strip.gsub(/,$/, "") if item.search("h5").last.at(".degree")
|
101
|
-
major = item.search("h5").last.at(".major").text.gsub(/\s+|\n/, " ").strip if item.search("h5").last.at(".major")
|
102
|
-
period = item.at(".date-range").text.gsub(/\s+|\n/, " ").strip if item.at(".date-range")
|
103
|
-
start_date, end_date = item.at(".date-range").text.gsub(/\s+|\n/, " ").strip.split(" – ") rescue nil
|
104
|
-
{:name => name, :description => desc, :degree => degree, :major => major, :period => period, :start_date => start_date, :end_date => end_date }
|
105
|
-
end
|
106
|
-
end
|
107
|
-
|
108
|
-
def websites
|
109
|
-
@websites ||= @page.search(".websites li").flat_map do |site|
|
110
|
-
url = site.at("a")["href"]
|
111
|
-
CGI.parse(URI.parse(url).query)["url"]
|
112
|
-
end
|
113
|
-
end
|
114
|
-
|
115
|
-
def groups
|
116
|
-
@groups ||= @page.search("#groups .group .item-title").map do |item|
|
117
|
-
name = item.text.gsub(/\s+|\n/, " ").strip
|
118
|
-
link = item.at("a")['href']
|
119
|
-
{ :name => name, :link => link }
|
120
|
-
end
|
121
|
-
end
|
122
|
-
|
123
|
-
def organizations
|
124
|
-
@organizations ||= @page.search("#background-organizations .section-item").map do |item|
|
125
|
-
name = item.at(".summary").text.gsub(/\s+|\n/, " ").strip rescue nil
|
126
|
-
start_date, end_date = item.at(".organizations-date").text.gsub(/\s+|\n/, " ").strip.split(" – ") rescue nil
|
127
|
-
start_date = Date.parse(start_date) rescue nil
|
128
|
-
end_date = Date.parse(end_date) rescue nil
|
129
|
-
{ :name => name, :start_date => start_date, :end_date => end_date }
|
130
|
-
end
|
131
|
-
end
|
132
|
-
|
133
|
-
def languages
|
134
|
-
@languages ||= @page.search(".background-languages #languages ol li").map do |item|
|
135
|
-
language = item.at("h4").text rescue nil
|
136
|
-
proficiency = item.at("div.languages-proficiency").text.gsub(/\s+|\n/, " ").strip rescue nil
|
137
|
-
{ :language => language, :proficiency => proficiency }
|
138
|
-
end
|
139
|
-
end
|
140
|
-
|
141
|
-
def certifications
|
142
|
-
@certifications ||= @page.search("background-certifications").map do |item|
|
143
|
-
name = item.at("h4").text.gsub(/\s+|\n/, " ").strip rescue nil
|
144
|
-
authority = item.at("h5").text.gsub(/\s+|\n/, " ").strip rescue nil
|
145
|
-
license = item.at(".specifics/.licence-number").text.gsub(/\s+|\n/, " ").strip rescue nil
|
146
|
-
start_date = item.at(".certification-date").text.gsub(/\s+|\n/, " ").strip rescue nil
|
147
|
-
|
148
|
-
{ :name => name, :authority => authority, :license => license, :start_date => start_date }
|
149
|
-
end
|
150
|
-
end
|
151
|
-
|
152
|
-
|
153
|
-
# Scrapes the "people also viewed" sidebar of the profile page.
# Returns a memoized array of hashes with :link and :name, plus :title and
# :company when a headline of the form "<title> at <company>" is present.
def recommended_visitors
  @recommended_visitors ||= @page.search(".insights .browse-map/ul/li.profile-card").map do |visitor|
    card = {}
    card[:link] = visitor.at("a")["href"]
    card[:name] = visitor.at("h4/a").text
    headline = visitor.at(".headline")
    if headline
      # LinkedIn truncates long headlines with "..."; normalize before splitting.
      title_part, company_part = headline.text.gsub("...", " ").split(" at ")
      card[:title] = title_part
      card[:company] = company_part
    end
    card
  end
end
|
165
|
-
|
166
|
-
# Scrapes the projects section of the profile page.
# Returns a memoized array of hashes with :title, :link (extracted from the
# LinkedIn redirect URL's "url" query parameter), :start_date, :end_date,
# :description (raw inner HTML) and :associates. Fields that are missing or
# unparseable are nil (best-effort scraping).
def projects
  @projects ||= @page.search("#projects .project").map do |node|
    from, to = node.at(".date-range").text.gsub(/\s+|\n/, " ").strip.split(" – ") rescue nil

    entry = {}
    entry[:title] = node.at(".item-title").text
    entry[:link] = CGI.parse(URI.parse(node.at(".item-title a")["href"]).query)["url"][0] rescue nil
    entry[:start_date] = parse_date(from) rescue nil
    entry[:end_date] = parse_date(to) rescue nil
    entry[:description] = node.at(".description").children.to_s rescue nil
    entry[:associates] = node.search(".contributors .contributor").map { |c| c.at("a").text } rescue nil
    entry
  end
end
|
180
|
-
|
181
|
-
# Serializes the profile as a JSON object keyed by each ATTRIBUTES entry,
# with values produced by calling the matching reader method.
# JSON is required lazily so the gem has no hard load-time dependency on it.
def to_json
  require "json"
  ATTRIBUTES.each_with_object({}) do |attr, hash|
    hash[attr.to_sym] = send(attr.to_sym)
  end.to_json
end
|
185
|
-
|
186
|
-
private

# Scrapes every ".positions .position" entry on the profile page into a hash
# (title, company, location, description, logo, dates, duration) and, when a
# company link is present, merges in the company-page details from
# #get_company_details. Returns the memoized array in @companies.
def get_companies
  # Idiomatic memoization guard (replaces the old "Bad code Hot fix").
  return @companies if @companies

  @companies = []
  @page.search(".positions .position").each do |node|
    company = {}
    company[:title] = node.at(".item-title").text.gsub(/\s+|\n/, " ").strip if node.at(".item-title")
    company[:company] = node.at(".item-subtitle").text.gsub(/\s+|\n/, " ").strip if node.at(".item-subtitle")
    company[:location] = node.at(".location").text if node.at(".location")
    company[:description] = node.at(".description").text.gsub(/\s+|\n/, " ").strip if node.at(".description")
    # FIX: guard on the <img> node itself. The original checked only
    # node.at(".logo") and crashed with NoMethodError when the logo block
    # existed without an <a><img> inside it.
    company[:company_logo] = node.at(".logo a img").first[1] if node.at(".logo a img")

    start_date, end_date = node.at(".date-range").text.strip.split(" – ") rescue nil
    # FIX: added `rescue nil` for consistency with the surrounding best-effort
    # lines — the original crashed when .date-range was absent.
    company[:duration] = node.at(".date-range").text[/.*\((.*)\)/, 1] rescue nil
    company[:start_date] = parse_date(start_date) rescue nil

    # Current positions render as "<start> – Present"; keep the literal string.
    if end_date && end_date.match(/Present/)
      company[:end_date] = "Present"
    else
      company[:end_date] = parse_date(end_date) rescue nil
    end

    company_link = node.at(".item-subtitle").at("a")["href"] rescue nil
    if company_link
      @companies << company.merge!(get_company_details(company_link))
    else
      @companies << company
    end
  end

  @companies
end
|
224
|
-
|
225
|
-
# Parses a scraped date string into a Date. A bare four-digit year
# ("1900".."2099") is normalized to January 1st of that year before parsing;
# everything else is handed to Date.parse as-is (which raises on bad input —
# callers wrap this in `rescue nil`).
def parse_date(date)
  normalized = date =~ /^(19|20)\d{2}$/ ? "#{date}-01-01" : date
  Date.parse(normalized)
end
|
229
|
-
|
230
|
-
# Fetches a company's LinkedIn page and scrapes its details.
# Returns a hash containing :linkedin_company_url, :url (the company website,
# when present), :address (when an HQ vcard is present), and one entry per
# h4/p pair in the "about" list, keyed by the snake_cased heading text.
def get_company_details(link)
  details = { :linkedin_company_url => get_linkedin_company_url(link) }
  page = http_client.get(details[:linkedin_company_url])

  website = page.at(".basic-info-about/ul/li/p/a")
  details[:url] = website.text if website

  about_list = page.at(".basic-info-about/ul")
  if about_list
    # Headings (h4) and values (p) appear as parallel lists; pair them up.
    about_list.search("p").zip(about_list.search("h4")).each do |value, title|
      details[title.text.gsub(" ", "_").downcase.to_sym] = value.text.strip
    end
  end

  hq = page.at(".vcard.hq")
  details[:address] = hq.at(".adr").text.gsub("\n", " ").strip if hq

  details
end
|
244
|
-
|
245
|
-
# Builds a fresh Mechanize agent for each request: randomized user agent,
# no page history (keeps memory flat while crawling), and an optional proxy
# configured from @options when any options were supplied.
def http_client
  Mechanize.new do |client|
    client.user_agent = RandomUserAgent.randomize
    client.max_history = 0
    unless @options.empty?
      client.set_proxy(@options[:proxy_ip], @options[:proxy_port], @options[:username], @options[:password])
    end
  end
end
|
254
|
-
|
255
|
-
# Normalizes a scraped company link to an absolute LinkedIn URL.
# Links that already contain the linkedin.com host (http or https) are
# returned unchanged; anything else is treated as a path and prefixed.
def get_linkedin_company_url(link)
  absolute = %r{http://www.linkedin.com/}.match(link) || %r{https://www.linkedin.com/}.match(link)
  absolute ? link : "http://www.linkedin.com/#{link}"
end
|
264
|
-
end
|
265
|
-
end
|