linkedin-scraper-v2 0.1.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +23 -0
- data/.rubocop.yml +11 -0
- data/.travis.yml +8 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +272 -0
- data/Rakefile +3 -0
- data/bin/linkedin-scraper +5 -0
- data/lib/linkedin_scraper.rb +5 -0
- data/lib/linkedin_scraper/profile.rb +252 -0
- data/lib/linkedin_scraper/version.rb +5 -0
- data/linkedin-scraper.gemspec +22 -0
- data/spec/fixtures/jeffweiner08.html +308 -0
- data/spec/linkedin_scraper/profile_spec.rb +110 -0
- data/spec/spec_helper.rb +17 -0
- metadata +107 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ea72cf17a2f1766bdb4eb7b320d3251392d21824
|
4
|
+
data.tar.gz: ebccbf3dcd9f3511b452e103986916067a502e6d
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: cd1a28b3081b4d1ab7f86423cc169af184822cdb0ae96b5077a59314b6012a8e3ed837cc03e032c0878df7bc32c672bcfb1c32e8f3cbf56ea989b075c5468d71
|
7
|
+
data.tar.gz: 7d5c7acbde250b8976ddb64d0b0b496131baf3784993e5f6df17756762a9cdd37461771fcc12146e6174b64444ec64c460c2a7e9dd78b0edcd58ddbcd9327d15
|
data/.gitignore
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.rspec
|
6
|
+
.yardoc
|
7
|
+
Gemfile.lock
|
8
|
+
InstalledFiles
|
9
|
+
_yardoc
|
10
|
+
coverage
|
11
|
+
doc/
|
12
|
+
lib/bundler/man
|
13
|
+
pkg
|
14
|
+
rdoc
|
15
|
+
spec/reports
|
16
|
+
test/tmp
|
17
|
+
test/version_tmp
|
18
|
+
tmp
|
19
|
+
.ruby-version
|
20
|
+
.ruby-gemset
|
21
|
+
.projectile
|
22
|
+
*.DS_Store
|
23
|
+
.idea/*
|
data/.rubocop.yml
ADDED
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Yatish Mehta
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,272 @@
|
|
1
|
+
[![Build Status](https://secure.travis-ci.org/yatish27/linkedin-scraper.png)](http://travis-ci.org/yatish27/linkedin-scraper)
|
2
|
+
[![Gem Version](https://badge.fury.io/rb/linkedin-scraper.png)](http://badge.fury.io/rb/linkedin-scraper)
|
3
|
+
|
4
|
+
Linkedin Scraper
|
5
|
+
================
|
6
|
+
|
7
|
+
Linkedin-scraper is a gem for scraping public LinkedIn profiles.
|
8
|
+
Given the URL of the profile, it gets the name, country, title, area, current companies, past companies,
|
9
|
+
organizations, skills, groups, etc
|
10
|
+
|
11
|
+
|
12
|
+
## Installation
|
13
|
+
|
14
|
+
Install the gem from RubyGems:
|
15
|
+
|
16
|
+
gem install linkedin-scraper
|
17
|
+
|
18
|
+
This gem is tested on Ruby 1.9.2, 1.9.3, 2.0.0, JRuby 1.9, and rbx 1.9.
|
19
|
+
|
20
|
+
## Usage
|
21
|
+
|
22
|
+
|
23
|
+
Initialize a scraper instance
|
24
|
+
|
25
|
+
profile = Linkedin::Profile.get_profile("http://www.linkedin.com/in/jeffweiner08")
|
26
|
+
|
27
|
+
The returning object responds to the following methods
|
28
|
+
|
29
|
+
|
30
|
+
profile.first_name # The first name of the contact
|
31
|
+
|
32
|
+
profile.last_name # The last name of the contact
|
33
|
+
|
34
|
+
profile.name # The full name of the profile
|
35
|
+
|
36
|
+
profile.title # The job title
|
37
|
+
|
38
|
+
profile.summary # The summary of the profile
|
39
|
+
|
40
|
+
profile.location # The location of the contact
|
41
|
+
|
42
|
+
profile.country # The country of the contact
|
43
|
+
|
44
|
+
profile.industry # The industry to which the contact belongs
|
45
|
+
|
46
|
+
profile.picture # The profile picture link of profile
|
47
|
+
|
48
|
+
profile.skills # Array of skills of the profile
|
49
|
+
|
50
|
+
profile.organizations # Array of organizations of the profile
|
51
|
+
|
52
|
+
profile.education # Array of hashes for education
|
53
|
+
|
54
|
+
profile.websites # Array of websites
|
55
|
+
|
56
|
+
profile.groups # Array of groups
|
57
|
+
|
58
|
+
profile.languages # Array of languages
|
59
|
+
|
60
|
+
profile.certifications # Array of certifications
|
61
|
+
|
62
|
+
profile.number_of_connections # The number of connections as a string
|
63
|
+
|
64
|
+
|
65
|
+
For current and past companies it also provides the details of the companies like company size, industry, address, etc
|
66
|
+
|
67
|
+
profile.current_companies
|
68
|
+
|
69
|
+
[
|
70
|
+
[0] {
|
71
|
+
:current_company => "LinkedIn",
|
72
|
+
:current_title => "CEO",
|
73
|
+
:current_company_url => "http://www.linkedin.com",
|
74
|
+
:description => nil,
|
75
|
+
:linkedin_company_url => "http://www.linkedin.com/company/linkedin?trk=ppro_cprof",
|
76
|
+
:url => "http://www.linkedin.com",
|
77
|
+
:type => "Public Company",
|
78
|
+
:company_size => "1001-5000 employees",
|
79
|
+
:website => "http://www.linkedin.com",
|
80
|
+
:industry => "Internet",
|
81
|
+
:founded => "2003",
|
82
|
+
:address => "2029 Stierlin Court Mountain View, CA 94043 United States"
|
83
|
+
},
|
84
|
+
[1] {
|
85
|
+
:current_company => "Intuit",
|
86
|
+
:current_title => "Member, Board of Directors",
|
87
|
+
:current_company_url => "http://network.intuit.com/",
|
88
|
+
:description => nil,
|
89
|
+
:linkedin_company_url => "http://www.linkedin.com/company/intuit?trk=ppro_cprof",
|
90
|
+
:url => "http://network.intuit.com/",
|
91
|
+
:type => "Public Company",
|
92
|
+
:company_size => "5001-10,000 employees",
|
93
|
+
:website => "http://network.intuit.com/",
|
94
|
+
:industry => "Computer Software",
|
95
|
+
:founded => "1983",
|
96
|
+
:address => "2632 Marine Way Mountain View, CA 94043 United States"
|
97
|
+
},
|
98
|
+
[2] {
|
99
|
+
:current_company => "DonorsChoose",
|
100
|
+
:current_title => "Member, Board of Directors",
|
101
|
+
:current_company_url => "http://www.donorschoose.org",
|
102
|
+
:description => nil,
|
103
|
+
:linkedin_company_url => "http://www.linkedin.com/company/donorschoose.org?trk=ppro_cprof",
|
104
|
+
:url => "http://www.donorschoose.org",
|
105
|
+
:type => "Nonprofit",
|
106
|
+
:company_size => "51-200 employees",
|
107
|
+
:website => "http://www.donorschoose.org",
|
108
|
+
:industry => "Nonprofit Organization Management",
|
109
|
+
:founded => "2000",
|
110
|
+
:address => "213 West 35th Street 2nd Floor East New York, NY 10001 United States"
|
111
|
+
},
|
112
|
+
[3] {
|
113
|
+
:current_company => "Malaria No More",
|
114
|
+
:current_title => "Member, Board of Directors",
|
115
|
+
:current_company_url => nil,
|
116
|
+
:description => nil
|
117
|
+
},
|
118
|
+
[4] {
|
119
|
+
:current_company => "Venture For America",
|
120
|
+
:current_title => "Member, Advisory Board",
|
121
|
+
:current_company_url => "http://ventureforamerica.org/",
|
122
|
+
:description => nil,
|
123
|
+
:linkedin_company_url => "http://www.linkedin.com/company/venture-for-america?trk=ppro_cprof",
|
124
|
+
:url => "http://ventureforamerica.org/",
|
125
|
+
:type => "Nonprofit",
|
126
|
+
:company_size => "1-10 employees",
|
127
|
+
:website => "http://ventureforamerica.org/",
|
128
|
+
:industry => "Nonprofit Organization Management",
|
129
|
+
:founded => "2011"
|
130
|
+
}
|
131
|
+
]
|
132
|
+
|
133
|
+
|
134
|
+
profile.past_companies
|
135
|
+
[
|
136
|
+
[0] {
|
137
|
+
:past_company => "Accel Partners",
|
138
|
+
:past_title => "Executive in Residence",
|
139
|
+
:past_company_website => "http://www.facebook.com/accel",
|
140
|
+
:description => nil,
|
141
|
+
:linkedin_company_url => "http://www.linkedin.com/company/accel-partners?trk=ppro_cprof",
|
142
|
+
:url => "http://www.facebook.com/accel",
|
143
|
+
:type => "Partnership",
|
144
|
+
:company_size => "51-200 employees",
|
145
|
+
:website => "http://www.facebook.com/accel",
|
146
|
+
:industry => "Venture Capital & Private Equity",
|
147
|
+
:address => "428 University Palo Alto, CA 94301 United States"
|
148
|
+
},
|
149
|
+
[1] {
|
150
|
+
:past_company => "Greylock",
|
151
|
+
:past_title => "Executive in Residence",
|
152
|
+
:past_company_website => "http://www.greylock.com",
|
153
|
+
:description => nil,
|
154
|
+
:linkedin_company_url => "http://www.linkedin.com/company/greylock-partners?trk=ppro_cprof",
|
155
|
+
:url => "http://www.greylock.com",
|
156
|
+
:type => "Partnership",
|
157
|
+
:company_size => "51-200 employees",
|
158
|
+
:website => "http://www.greylock.com",
|
159
|
+
:industry => "Venture Capital & Private Equity",
|
160
|
+
:address => "2550 Sand Hill Road Menlo Park, CA 94025 United States"
|
161
|
+
},
|
162
|
+
[2] {
|
163
|
+
:past_company => "Yahoo!",
|
164
|
+
:past_title => "Executive Vice President Network Division",
|
165
|
+
:past_company_website => "http://www.yahoo.com",
|
166
|
+
:description => nil,
|
167
|
+
:linkedin_company_url => "http://www.linkedin.com/company/yahoo?trk=ppro_cprof",
|
168
|
+
:url => "http://www.yahoo.com",
|
169
|
+
:type => "Public Company",
|
170
|
+
:company_size => "10,001+ employees",
|
171
|
+
:website => "http://www.yahoo.com",
|
172
|
+
:industry => "Internet",
|
173
|
+
:founded => "1994",
|
174
|
+
:address => "701 First Avenue Sunnyvale, CA 94089 United States"
|
175
|
+
},
|
176
|
+
[3] {
|
177
|
+
:past_company => "Windsor Media",
|
178
|
+
:past_title => "Founding Partner",
|
179
|
+
:past_company_website => nil,
|
180
|
+
:description => nil
|
181
|
+
},
|
182
|
+
[4] {
|
183
|
+
:past_company => "Warner Bros.",
|
184
|
+
:past_title => "Vice President Online",
|
185
|
+
:past_company_website => "http://www.warnerbros.com/",
|
186
|
+
:description => nil,
|
187
|
+
:linkedin_company_url => "http://www.linkedin.com/company/warner-bros.-entertainment-group-of-companies?trk=ppro_cprof",
|
188
|
+
:url => "http://www.warnerbros.com/",
|
189
|
+
:type => "Public Company",
|
190
|
+
:company_size => "10,001+ employees",
|
191
|
+
:website => "http://www.warnerbros.com/",
|
192
|
+
:industry => "Entertainment",
|
193
|
+
:address => "4000 Warner Boulevard Burbank, CA 91522 United States"
|
194
|
+
}
|
195
|
+
]
|
196
|
+
|
197
|
+
|
198
|
+
profile.recommended_visitors
|
199
|
+
#It is the list of visitors "Viewers of this profile also viewed..."
|
200
|
+
[
|
201
|
+
[0] {
|
202
|
+
:link => "http://www.linkedin.com/in/barackobama?trk=pub-pbmap",
|
203
|
+
:name => "Barack Obama",
|
204
|
+
:title => "President of the United States of ",
|
205
|
+
:company => nil
|
206
|
+
},
|
207
|
+
[1] {
|
208
|
+
:link => "http://www.linkedin.com/in/marissamayer?trk=pub-pbmap",
|
209
|
+
:name => "Marissa Mayer",
|
210
|
+
:title => "Yahoo!, President & CEO",
|
211
|
+
:company => nil
|
212
|
+
},
|
213
|
+
[2] {
|
214
|
+
:link => "http://www.linkedin.com/pub/sean-parker/0/1/826?trk=pub-pbmap",
|
215
|
+
:name => "Sean Parker",
|
216
|
+
:title => nil,
|
217
|
+
:company => nil
|
218
|
+
},
|
219
|
+
[3] {
|
220
|
+
:link => "http://www.linkedin.com/pub/eduardo-saverin/0/70a/31b?trk=pub-pbmap",
|
221
|
+
:name => "Eduardo Saverin",
|
222
|
+
:title => nil,
|
223
|
+
:company => nil
|
224
|
+
},
|
225
|
+
[4] {
|
226
|
+
:link => "http://www.linkedin.com/in/rbranson?trk=pub-pbmap",
|
227
|
+
:name => "Richard Branson",
|
228
|
+
:title => "Founder",
|
229
|
+
:company => "Virgin Group"
|
230
|
+
},
|
231
|
+
[5] {
|
232
|
+
:link => "http://www.linkedin.com/in/reidhoffman?trk=pub-pbmap",
|
233
|
+
:name => "Reid Hoffman",
|
234
|
+
:title => "Entrepreneur. Product Strategist. ",
|
235
|
+
:company => nil
|
236
|
+
},
|
237
|
+
[6] {
|
238
|
+
:link => "http://www.linkedin.com/in/mdell?trk=pub-pbmap",
|
239
|
+
:name => "Michael Dell",
|
240
|
+
:title => "Chairman and CEO",
|
241
|
+
:company => "Dell"
|
242
|
+
},
|
243
|
+
[7] {
|
244
|
+
:link => "http://www.linkedin.com/in/mittromney?trk=pub-pbmap",
|
245
|
+
:name => "Mitt Romney",
|
246
|
+
:title => "Believe in America",
|
247
|
+
:company => nil
|
248
|
+
},
|
249
|
+
[8] {
|
250
|
+
:link => "http://www.linkedin.com/pub/sheryl-sandberg/2/665/512?trk=pub-pbmap",
|
251
|
+
:name => "Sheryl Sandberg",
|
252
|
+
:title => nil,
|
253
|
+
:company => nil
|
254
|
+
}
|
255
|
+
]
|
256
|
+
|
257
|
+
|
258
|
+
The gem also comes with a binary and can be used from the command line to get a json response of the scraped data.
|
259
|
+
It takes the url as the first argument.
|
260
|
+
|
261
|
+
linkedin-scraper http://www.linkedin.com/in/jeffweiner08
|
262
|
+
|
263
|
+
## Contributing
|
264
|
+
|
265
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/yatish27/linkedin-scraper.
|
266
|
+
This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the
|
267
|
+
[Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
268
|
+
|
269
|
+
|
270
|
+
## License
|
271
|
+
|
272
|
+
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
data/Rakefile
ADDED
@@ -0,0 +1,252 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
module Linkedin
  # Scrapes a public LinkedIn profile page and exposes the extracted fields
  # through lazily evaluated, memoized reader methods.
  #
  # The page is fetched once in #initialize; company detail pages are fetched
  # on demand by #current_companies / #past_companies.
  class Profile
    # Mechanize user-agent aliases; one is picked at random for every HTTP client.
    USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"].freeze

    # Names of the public readers that #to_json serializes.
    ATTRIBUTES = %w(
      name
      first_name
      last_name
      title
      location
      number_of_connections
      country
      industry
      summary
      picture
      projects
      linkedin_url
      education
      groups
      websites
      languages
      skills
      certifications
      organizations
      past_companies
      current_companies
      recommended_visitors).freeze

    # Prefix used to turn site-relative links into absolute URLs.
    BASE_URL = "http://www.linkedin.com"

    # Separator between the start and end of a date range in the scraped text.
    DATE_RANGE_SEPARATOR = " – "

    attr_reader :page, :linkedin_url

    # Builds a profile for +url+, returning nil instead of raising on failure.
    #
    # @param url [String] profile URL to fetch
    # @param options [Hash] optional :proxy_ip / :proxy_port
    # @return [Linkedin::Profile, nil] nil when the profile could not be loaded
    def self.get_profile(url, options = {})
      new(url, options)
    rescue => e
      # Report on stderr so the CLI's JSON on stdout stays parseable.
      warn e.message
      nil
    end

    # @param url [String] profile URL to fetch
    # @param options [Hash] optional :proxy_ip / :proxy_port
    def initialize(url, options = {})
      @linkedin_url = url
      @options = options || {}
      @page = http_client.get(url)
    end

    # @return [String] first and last name joined by a space; missing parts are omitted
    def name
      [first_name, last_name].compact.join(" ")
    end

    # @return [String, nil] first whitespace-separated word of ".full-name"
    def first_name
      @first_name ||= name_parts[0]
    end

    # @return [String, nil] everything after the first word of ".full-name";
    #   nil for single-word names
    def last_name
      @last_name ||= name_parts[1]
    end

    # @return [String, nil] whitespace-normalized text of ".title"
    def title
      @title ||= text_at(@page, ".title")
    end

    # @return [String, nil] first comma-separated part of ".locality"
    def location
      @location ||= locality_parts.first
    end

    # @return [String, nil] leading digits (with an optional "+") found in ".member-connections"
    def number_of_connections
      @connections ||= begin
        node = @page.at(".member-connections")
        node && node.text[/[0-9]+\+?/]
      end
    end

    # @return [String, nil] last comma-separated part of ".locality"
    def country
      @country ||= locality_parts.last
    end

    # @return [String, nil] whitespace-normalized text of ".industry"
    def industry
      @industry ||= text_at(@page, ".industry")
    end

    # @return [String, nil] whitespace-normalized text of ".summary .description"
    def summary
      @summary ||= text_at(@page, ".summary .description")
    end

    # @return [String, nil] "src" attribute of the profile picture image
    def picture
      @picture ||= begin
        image = @page.at(".profile-picture img")
        source = image && image["src"]
        source && source.strip
      end
    end

    # @return [Array<String>] stripped skill names
    def skills
      @skills ||= @page.search(".skill-pill .endorse-item-name-text").map { |skill| skill.text.strip }
    end

    # @return [Array<Hash>] positions marked as past (see #get_companies)
    def past_companies
      @past_companies ||= get_companies("past")
    end

    # @return [Array<Hash>] positions marked as current (see #get_companies)
    def current_companies
      @current_companies ||= get_companies("current")
    end

    # @return [Array<Hash>] one hash per education entry with :name, :description,
    #   :degree, :major, :period, :start_date and :end_date (dates are left as strings)
    def education
      @education ||= @page.search(".background-education .education").map do |item|
        details = item.search("h5").last
        degree = text_at(details, ".degree")
        period = text_at(item, ".education-date")
        start_date, end_date = split_dates(period)

        {
          :name => text_at(item, "h4"),
          :description => details && squish(details.text),
          :degree => degree && degree.gsub(/,$/, ""),
          :major => text_at(details, ".major"),
          :period => period,
          :start_date => start_date,
          :end_date => end_date
        }
      end
    end

    # @return [Array<String>] values of the "url" query parameter of the website links
    def websites
      @websites ||= @page.search("#overview-summary-websites").flat_map do |site|
        anchor = site.at("a")
        next [] unless anchor

        query = URI.parse("#{BASE_URL}#{anchor["href"]}").query
        query ? CGI.parse(query)["url"] : []
      end
    end

    # @return [Array<Hash>] one hash per group with :name and :link
    #   (:link is nil when the entry has no anchor)
    def groups
      @groups ||= @page.search(".groups-name").map do |item|
        anchor = item.at("a")
        { :name => squish(item.text), :link => anchor && "#{BASE_URL}#{anchor["href"]}" }
      end
    end

    # @return [Array<Hash>] one hash per organization with :name, :start_date and
    #   :end_date (Date or nil)
    def organizations
      @organizations ||= @page.search("#background-organizations .section-item").map do |item|
        start_date, end_date = split_dates(text_at(item, ".organizations-date"))
        # parse_date (rather than bare Date.parse) so year-only values are handled
        # the same way as for projects and companies.
        { :name => text_at(item, ".summary"), :start_date => parse_date(start_date), :end_date => parse_date(end_date) }
      end
    end

    # @return [Array<Hash>] one hash per language with :language and :proficiency
    def languages
      @languages ||= @page.search(".background-languages #languages ol li").map do |item|
        heading = item.at("h4")
        { :language => heading && heading.text, :proficiency => text_at(item, "div.languages-proficiency") }
      end
    end

    # @return [Array<Hash>] one hash per certification with :name, :authority,
    #   :license and :start_date
    def certifications
      # NOTE(review): unlike the sibling sections this selector has no "." or "#"
      # prefix, so it matches elements by tag name — confirm the intended selector.
      @certifications ||= @page.search("background-certifications").map do |item|
        {
          :name => text_at(item, "h4"),
          :authority => text_at(item, "h5"),
          :license => text_at(item, ".specifics/.licence-number"),
          :start_date => text_at(item, ".certification-date")
        }
      end
    end

    # @return [Array<Hash>] entries of the browse map with :link, :name, :title and :company
    def recommended_visitors
      @recommended_visitors ||= @page.search(".insights-browse-map/ul/li").map do |visitor|
        anchor = visitor.at("a")
        heading = visitor.at("h4/a")
        headline = visitor.at(".browse-map-title")
        # Headline text has the shape "<title> at <company>"; either part may be missing.
        title, company = headline ? headline.text.gsub("...", " ").split(" at ") : []

        { :link => anchor && anchor["href"], :name => heading && heading.text, :title => title, :company => company }
      end
    end

    # @return [Array<Hash>] one hash per project with :title, :link, :start_date,
    #   :end_date, :description and :associates
    def projects
      @projects ||= @page.search(".background-projects/div").map do |container|
        project = container.at("div")
        start_date, end_date = split_dates(text_at(project, ".projects-date"))
        heading = node_at(project, "hgroup/h4 span:first-of-type")
        anchor = node_at(project, "hgroup/h4 a:first-of-type")
        description = node_at(project, ".description")
        associates = node_at(project, ".associated-list ul")

        {
          :title => heading && heading.text,
          :link => anchor && anchor["href"],
          :start_date => parse_date(start_date),
          :end_date => parse_date(end_date),
          :description => description && description.text,
          # Children without an anchor (e.g. whitespace text nodes) are skipped
          # instead of discarding the whole list.
          :associates => associates && associates.children.map { |child| child.at("a") }.compact.map(&:text)
        }
      end
    end

    # Serializes every reader listed in ATTRIBUTES.
    #
    # Accepts (and forwards) the generator state so the profile can also be
    # serialized when nested inside arrays or hashes.
    #
    # @return [String] JSON object keyed by attribute name
    def to_json(*args)
      require "json"
      ATTRIBUTES.each_with_object({}) { |attr, hash| hash[attr.to_sym] = send(attr) }.to_json(*args)
    end

    private

    # Collects the positions of the given kind from the experience section.
    #
    # @param type [String] "past" or "current"
    # @return [Array<Hash>] position data, merged with the company details when
    #   the position links to a company page
    def get_companies(type)
      @page.search(".background-experience .#{type}-position").map do |node|
        heading = node.at("h4")
        employer = heading && heading.next
        description = node.at(".description")
        dates = node.at(".experience-date-locale")
        start_date, end_date = split_dates(dates && dates.text.strip)

        company = {}
        company[:title] = squish(heading.text) if heading
        company[:company] = squish(employer.text) if employer
        company[:description] = squish(description.text) if description
        # Duration is the text inside the trailing parentheses of the date line.
        company[:duration] = dates && dates.text[/.*\((.*)\)/, 1]
        company[:start_date] = parse_date(start_date)
        company[:end_date] = parse_date(end_date)

        anchor = employer && employer.at("a")
        company_link = anchor && anchor["href"]
        # Without a company link there is nothing to look up; fetching anyway
        # would request the site root and record a bogus company URL.
        company_link ? company.merge!(get_company_details(company_link)) : company
      end
    end

    # Parses a scraped date, treating a bare four-digit year as January 1st.
    #
    # @param date [String, nil]
    # @return [Date, nil] nil when +date+ is nil or cannot be parsed
    def parse_date(date)
      return nil if date.nil?

      text = date.to_s.strip
      text = "#{text}-01-01" if text =~ /\A(19|20)\d{2}\z/
      Date.parse(text)
    rescue ArgumentError
      nil
    end

    # Fetches the company page behind +link+ and extracts its basic info.
    #
    # @param link [String] absolute or site-relative company link
    # @return [Hash] :linkedin_company_url plus whatever details the page provides
    def get_company_details(link)
      result = { :linkedin_company_url => get_linkedin_company_url(link) }
      page = http_client.get(result[:linkedin_company_url])

      website = page.at(".basic-info-about/ul/li/p/a")
      result[:url] = website.text if website

      info = page.at(".basic-info-about/ul")
      if info
        # Each <h4> label is paired positionally with a <p> value.
        info.search("p").zip(info.search("h4")).each do |value, label|
          next unless label
          result[label.text.gsub(" ", "_").downcase.to_sym] = value.text.strip
        end
      end

      headquarters = page.at(".vcard.hq")
      address = headquarters && headquarters.at(".adr")
      result[:address] = address.text.gsub("\n", " ").strip if address
      result
    end

    # @return [Mechanize] a fresh agent with a random user agent, no history and,
    #   when :proxy_ip is configured, the proxy applied
    def http_client
      options = @options || {}
      Mechanize.new do |agent|
        agent.user_agent_alias = USER_AGENTS.sample
        agent.set_proxy(options[:proxy_ip], options[:proxy_port]) if options[:proxy_ip]
        agent.max_history = 0
      end
    end

    # Turns a company link into an absolute URL.
    #
    # @param link [String, nil]
    # @return [String] +link+ itself when it already is an absolute linkedin.com
    #   URL, otherwise +link+ appended to BASE_URL
    def get_linkedin_company_url(link)
      link = link.to_s
      return link if link =~ %r{\Ahttps?://([a-z0-9-]+\.)*linkedin\.com/}i

      "#{BASE_URL}/#{link.sub(%r{\A/+}, "")}"
    end

    # @return [Array<String>] the ".full-name" text split into first word and remainder
    def name_parts
      node = @page.at(".full-name")
      node ? node.text.split(" ", 2).map(&:strip) : []
    end

    # @return [Array<String>] the ".locality" text split on commas
    def locality_parts
      node = @page.at(".locality")
      node ? node.text.split(",").map(&:strip) : []
    end

    # Collapses runs of whitespace into single spaces and trims both ends.
    def squish(text)
      text.gsub(/\s+/, " ").strip
    end

    # @return [Object, nil] first match of +selector+ below +node+; nil when +node+ is nil
    def node_at(node, selector)
      node && node.at(selector)
    end

    # @return [String, nil] whitespace-normalized text of the first match of +selector+
    def text_at(node, selector)
      found = node_at(node, selector)
      found && squish(found.text)
    end

    # @return [Array<String>] parts of a "start – end" range; empty when +text+ is nil
    def split_dates(text)
      text ? text.split(DATE_RANGE_SEPARATOR) : []
    end
  end
end
|