linkedin-scraper 0.1.5 → 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/README.md +17 -7
- data/bin/linkedin-scraper +1 -1
- data/lib/linkedin_scraper/profile.rb +243 -0
- data/lib/{linkedin-scraper → linkedin_scraper}/version.rb +1 -1
- data/lib/linkedin_scraper.rb +5 -0
- data/linkedin-scraper.gemspec +7 -6
- data/spec/fixtures/jeffweiner08.html +308 -0
- data/spec/linkedin_scraper/.DS_Store +0 -0
- data/spec/linkedin_scraper/profile_spec.rb +104 -0
- metadata +27 -24
- data/lib/linkedin-scraper/profile.rb +0 -225
- data/lib/linkedin-scraper.rb +0 -5
- data/spec/fixtures/jgrevich.html +0 -9300
- data/spec/linkedin-scraper/profile_spec.rb +0 -154
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2030446ef750ed1a95c9818d63d0cf97a0cbd60a
|
4
|
+
data.tar.gz: 1639e466dadbee02704a853fe13f0ae10bb42f94
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dd080bec613c77eb50a439ccd3628932ba0b9ed0ddf7b5e03781036d89722f423e1c439d8b9a08e49e10cf50744926cbae05cc5218ce71a6b923ff793b118b93
|
7
|
+
data.tar.gz: b796724e23fb34f49c3f1012c97c9bd2a0d38372d652699095a1e0592f918a9778caaf9ecacce51217289a6500a49d2319bdf1a8e5ab54fe4d0ccc5e9afc64b3
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -5,11 +5,11 @@ Linkedin Scraper
|
|
5
5
|
================
|
6
6
|
|
7
7
|
Linkedin-scraper is a gem for scraping linkedin public profiles.
|
8
|
-
Given the URL of the profile, it gets the name, country, title, area, current companies, past
|
8
|
+
Given the URL of the profile, it gets the name, country, title, area, current companies, past companies,
|
9
|
+
organizations, skills, groups, etc.
|
9
10
|
|
10
11
|
|
11
|
-
##Installation
|
12
|
-
|
12
|
+
## Installation
|
13
13
|
|
14
14
|
Install the gem from RubyGems:
|
15
15
|
|
@@ -17,7 +17,7 @@ Install the gem from RubyGems:
|
|
17
17
|
|
18
18
|
This gem is tested on 1.9.2, 1.9.3, 2.0.0, JRuby1.9, rbx1.9,
|
19
19
|
|
20
|
-
##Usage
|
20
|
+
## Usage
|
21
21
|
|
22
22
|
|
23
23
|
Initialize a scraper instance
|
@@ -59,7 +59,7 @@ The returning object responds to the following methods
|
|
59
59
|
|
60
60
|
profile.certifications # Array of certifications
|
61
61
|
|
62
|
-
For current and past
|
62
|
+
For current and past companies it also provides the details of the companies like company size, industry, address, etc.
|
63
63
|
|
64
64
|
profile.current_companies
|
65
65
|
|
@@ -252,8 +252,18 @@ For current and past comapnies it also provides the details of the companies lik
|
|
252
252
|
]
|
253
253
|
|
254
254
|
|
255
|
-
The gem also comes with a binary and can be used from the command line to get a json response of the scraped data.
|
255
|
+
The gem also comes with a binary and can be used from the command line to get a JSON response of the scraped data.
|
256
|
+
It takes the url as the first argument.
|
256
257
|
|
257
258
|
linkedin-scraper http://www.linkedin.com/in/jeffweiner08
|
258
259
|
|
259
|
-
|
260
|
+
## Contributing
|
261
|
+
|
262
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/yatish27/linkedin-scraper.
|
263
|
+
This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the
|
264
|
+
[Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
265
|
+
|
266
|
+
|
267
|
+
## License
|
268
|
+
|
269
|
+
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
data/bin/linkedin-scraper
CHANGED
@@ -0,0 +1,243 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
module Linkedin
  # Scrapes a public LinkedIn profile page and exposes the extracted fields
  # as memoized reader methods (see ATTRIBUTES for the full list).
  #
  # The page is fetched once in #initialize; every reader afterwards queries
  # the parsed document held in @page with CSS selectors. Readers return nil
  # (or an empty Array for list readers) when the corresponding markup is
  # missing instead of raising.
  class Profile
    # User agent aliases handed to Mechanize; one is picked at random for
    # every HTTP client that is built.
    USER_AGENTS = [
      "Windows IE 6", "Windows IE 7", "Windows Mozilla",
      "Mac Safari", "Mac FireFox", "Mac Mozilla",
      "Linux Mozilla", "Linux Firefox", "Linux Konqueror"
    ].freeze

    # Names of the public readers serialized by #to_json.
    ATTRIBUTES = %w(
      name
      first_name
      last_name
      title
      location
      country
      industry
      summary
      picture
      projects
      linkedin_url
      education
      groups
      websites
      languages
      skills
      certifications
      organizations
      past_companies
      current_companies
      recommended_visitors
    ).freeze

    attr_reader :page, :linkedin_url

    # Builds a profile for +url+. Any StandardError raised while fetching is
    # printed to stdout and nil is returned instead (best-effort entry point).
    def self.get_profile(url)
      Linkedin::Profile.new(url)
    rescue => e
      puts e
    end

    # Fetches +url+ immediately; raises whatever the HTTP client raises.
    def initialize(url)
      @linkedin_url = url
      @page = http_client.get(url)
    end

    # Full name built from the first and last name; missing parts are
    # dropped so a single-word name has no trailing space.
    def name
      [first_name, last_name].compact.join(" ")
    end

    # First whitespace-separated word of the ".full-name" element, or nil.
    def first_name
      @first_name ||= name_parts[0]
    end

    # Everything after the first word of the ".full-name" element, or nil
    # when the name consists of a single word.
    def last_name
      @last_name ||= name_parts[1]
    end

    def title
      @title ||= clean_text(@page.at(".title"))
    end

    # Text before the first comma of the ".locality" element.
    def location
      @location ||= locality_parts.first
    end

    # Text after the last comma of the ".locality" element.
    def country
      @country ||= locality_parts.last
    end

    def industry
      @industry ||= clean_text(@page.at(".industry"))
    end

    def summary
      @summary ||= clean_text(@page.at(".summary .description"))
    end

    # Value of the profile picture's src attribute, or nil.
    def picture
      @picture ||= begin
        image = @page.at(".profile-picture img")
        source = image && image["src"]
        source.strip if source
      end
    end

    # Array of skill names; nil if the lookup fails (kept best-effort, as
    # callers may rely on this reader never raising).
    def skills
      @skills ||= begin
        @page.search(".skill-pill .endorse-item-name-text").map { |skill| skill.text.strip }
      rescue StandardError
        nil
      end
    end

    def past_companies
      @past_companies ||= get_companies("past")
    end

    def current_companies
      @current_companies ||= get_companies("current")
    end

    # Array of hashes, one per education entry. Dates are returned as the
    # raw strings found on the page (they are not parsed).
    def education
      @education ||= @page.search(".background-education .education").map do |item|
        details = item.search("h5").last
        date_node = item.at(".education-date")
        degree = clean_text(details && details.at(".degree"))
        start_date, end_date = date_range(date_node)

        {
          :name => clean_text(item.at("h4")),
          :description => clean_text(details),
          :degree => (degree.sub(/,\z/, "") if degree),
          :major => clean_text(details && details.at(".major")),
          :period => clean_text(date_node),
          :start_date => start_date,
          :end_date => end_date
        }
      end
    end

    # Target URLs taken from the "url" query parameter of the first link in
    # each "#overview-summary-websites" element. Elements without a link or
    # without a query string contribute nothing.
    def websites
      @websites ||= @page.search("#overview-summary-websites").flat_map do |site|
        anchor = site.at("a")
        href = anchor && anchor["href"]
        next [] unless href

        query = URI.parse("http://www.linkedin.com#{href}").query
        query ? CGI.parse(query)["url"] : []
      end
    end

    # Array of { :name, :link } hashes; :link is nil when the group entry
    # has no anchor.
    def groups
      @groups ||= @page.search(".groups-name").map do |item|
        anchor = item.at("a")
        href = anchor && anchor["href"]
        { :name => clean_text(item), :link => ("http://www.linkedin.com#{href}" if href) }
      end
    end

    # Array of { :name, :start_date, :end_date } hashes. Dates go through
    # parse_date so that year-only values ("2010") are handled the same way
    # as for companies and projects; unparsable dates become nil.
    def organizations
      @organizations ||= @page.search("#background-organizations .section-item").map do |item|
        start_date, end_date = date_range(item.at(".organizations-date"))
        {
          :name => clean_text(item.at(".summary")),
          :start_date => safe_parse_date(start_date),
          :end_date => safe_parse_date(end_date)
        }
      end
    end

    def languages
      @languages ||= @page.search(".background-languages #languages ol li").map do |item|
        heading = item.at("h4")
        {
          :language => (heading.text if heading),
          :proficiency => clean_text(item.at("div.languages-proficiency"))
        }
      end
    end

    # NOTE(review): unlike every other selector in this class,
    # "background-certifications" has no "." or "#" prefix, so it selects
    # elements with that *tag name*. This looks like a typo, but the intended
    # selector cannot be confirmed from this file, so it is left unchanged.
    def certifications
      @certifications ||= @page.search("background-certifications").map do |item|
        {
          :name => clean_text(item.at("h4")),
          :authority => clean_text(item.at("h5")),
          :license => clean_text(item.at(".specifics/.licence-number")),
          :start_date => clean_text(item.at(".certification-date"))
        }
      end
    end

    # Array of { :link, :name, :title, :company } hashes taken from the
    # browse-map list. The headline is split on " at " into title/company.
    def recommended_visitors
      @recommended_visitors ||= @page.search(".insights-browse-map/ul/li").map do |visitor|
        anchor = visitor.at("a")
        name_node = visitor.at("h4/a")
        headline = visitor.at(".browse-map-title")
        title, company = headline ? headline.text.gsub("...", " ").split(" at ") : []

        {
          :link => (anchor["href"] if anchor),
          :name => (name_node.text if name_node),
          :title => title,
          :company => company
        }
      end
    end

    # Array of project hashes. Every value is nil when its markup is absent.
    def projects
      @projects ||= @page.search(".background-projects/div").map do |wrapper|
        project = wrapper.at("div")
        find = lambda { |selector| project && project.at(selector) }

        start_date, end_date = date_range(find.call(".projects-date"))
        title = find.call("hgroup/h4 span:first-of-type")
        link = find.call("hgroup/h4 a:first-of-type")
        description = find.call(".description")

        {
          :title => (title.text if title),
          :link => (link["href"] if link),
          :start_date => safe_parse_date(start_date),
          :end_date => safe_parse_date(end_date),
          :description => (description.text if description),
          :associates => associate_names(find.call(".associated-list ul"))
        }
      end
    end

    # Serializes every attribute in ATTRIBUTES as a JSON object.
    # Accepts (and forwards) the optional generator arguments so the profile
    # can also be serialized as part of a larger structure, e.g.
    # [profile].to_json.
    def to_json(*args)
      require "json" # loaded lazily: only needed when serializing
      ATTRIBUTES.each_with_object({}) { |attr, hash| hash[attr.to_sym] = public_send(attr) }.to_json(*args)
    end

    private

    # Returns [first, rest] of the ".full-name" text, or [] when absent.
    def name_parts
      node = @page.at(".full-name")
      node ? node.text.split(" ", 2).map(&:strip) : []
    end

    # Returns the comma-separated pieces of the ".locality" text, stripped.
    def locality_parts
      node = @page.at(".locality")
      node ? node.text.split(",").map(&:strip) : []
    end

    # Returns the node's text with whitespace runs collapsed to one space
    # and surrounding whitespace removed; nil when +node+ is nil.
    def clean_text(node)
      node.text.gsub(/\s+/, " ").strip if node
    end

    # Splits a "start – end" date node into [start, end] strings
    # (en dash separated); [] when +node+ is nil.
    def date_range(node)
      text = clean_text(node)
      text ? text.split(" – ") : []
    end

    # parse_date that returns nil instead of raising on missing or
    # unparsable input.
    def safe_parse_date(value)
      parse_date(value)
    rescue StandardError
      nil
    end

    # Names of the associates linked in +list+; children without a link
    # (e.g. whitespace between items) are skipped. nil when +list+ is nil.
    def associate_names(list)
      return nil unless list

      list.children.map { |child| child.at("a") }.compact.map(&:text)
    rescue StandardError
      nil
    end

    # Builds one hash per "<type>-position" entry ("past" or "current"),
    # merged with the details scraped from the company's own page.
    # NOTE: performs one extra HTTP request per position that links to a
    # company page; positions without a link get no company details.
    def get_companies(type)
      @page.search(".background-experience .#{type}-position").map do |node|
        company = {}
        heading = node.at("h4")
        company_node = heading && heading.next
        description = node.at(".description")
        date_node = node.at(".experience-date-locale")

        company[:title] = clean_text(heading) if heading
        company[:company] = clean_text(company_node) if company_node
        company[:description] = clean_text(description) if description

        start_date, end_date = date_node ? date_node.text.strip.split(" – ") : []
        # Duration is whatever is inside the last pair of parentheses.
        company[:duration] = date_node && date_node.text[/.*\((.*)\)/, 1]
        company[:start_date] = safe_parse_date(start_date)
        company[:end_date] = safe_parse_date(end_date)

        link_node = company_node && company_node.at("a")
        company_link = link_node && link_node["href"]
        details = company_link ? get_company_details(company_link) : {}

        company.merge!(details)
      end
    end

    # Parses +date+ into a Date. A bare four-digit year (19xx/20xx) is
    # treated as January 1st of that year, because Date.parse does not read
    # a lone "2010" as a year. Raises on nil or unparsable input.
    def parse_date(date)
      date = "#{date}-01-01" if date =~ /\A(19|20)\d{2}\z/
      Date.parse(date)
    end

    # Fetches the company page behind +link+ and returns a hash with its
    # URL, website, address and every heading/value pair of the
    # "basic info" list (keys are the downcased, underscored headings).
    def get_company_details(link)
      result = { :linkedin_company_url => get_linkedin_company_url(link) }
      page = http_client.get(result[:linkedin_company_url])

      website = page.at(".basic-info-about/ul/li/p/a")
      result[:url] = website.text if website

      info = page.at(".basic-info-about/ul")
      if info
        info.search("p").zip(info.search("h4")).each do |value, heading|
          next unless heading # zip pads with nil when there are fewer headings

          result[heading.text.gsub(" ", "_").downcase.to_sym] = value.text.strip
        end
      end

      headquarters = page.at(".vcard.hq")
      address = headquarters && headquarters.at(".adr")
      result[:address] = address.text.gsub("\n", " ").strip if address
      result
    end

    # Builds a fresh Mechanize agent with a random user agent alias and
    # page history disabled.
    def http_client
      Mechanize.new do |agent|
        agent.user_agent_alias = USER_AGENTS.sample
        agent.max_history = 0
      end
    end

    # Returns +link+ unchanged when it already is an absolute
    # www.linkedin.com URL (http or https); otherwise resolves it against
    # http://www.linkedin.com/ without producing a double slash.
    def get_linkedin_company_url(link)
      return link if link =~ %r{\Ahttps?://www\.linkedin\.com/}

      "http://www.linkedin.com/#{link.to_s.sub(%r{\A/+}, "")}"
    end
  end
end
|
data/linkedin-scraper.gemspec
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
|
-
require File.expand_path('../lib/
|
2
|
+
require File.expand_path('../lib/linkedin_scraper/version', __FILE__)
|
3
3
|
|
4
4
|
Gem::Specification.new do |gem|
|
5
5
|
gem.authors = ['Yatish Mehta']
|
6
|
-
gem.description = %q{Scrapes the
|
6
|
+
gem.description = %q{Scrapes the LinkedIn profile using the public url }
|
7
7
|
gem.summary = %q{when a url of public linkedin profile page is given it scrapes the entire page and converts into a accessible object}
|
8
8
|
gem.homepage = 'https://github.com/yatishmehta27/linkedin-scraper'
|
9
9
|
gem.files = `git ls-files`.split($\)
|
@@ -13,9 +13,10 @@ Gem::Specification.new do |gem|
|
|
13
13
|
gem.require_paths = ['lib']
|
14
14
|
gem.version = Linkedin::Scraper::VERSION
|
15
15
|
|
16
|
-
gem.
|
17
|
-
|
18
|
-
gem.add_development_dependency 'rspec', '>=0'
|
19
|
-
gem.add_development_dependency 'rake'
|
16
|
+
gem.license = "MIT"
|
20
17
|
|
18
|
+
gem.add_dependency 'mechanize', '~> 2'
|
19
|
+
|
20
|
+
gem.add_development_dependency 'rspec', '~> 3'
|
21
|
+
gem.add_development_dependency 'rake', '~> 10'
|
21
22
|
end
|