linkedin-scraper 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -1
- data/README.md +17 -7
- data/bin/linkedin-scraper +1 -1
- data/lib/linkedin_scraper/profile.rb +243 -0
- data/lib/{linkedin-scraper → linkedin_scraper}/version.rb +1 -1
- data/lib/linkedin_scraper.rb +5 -0
- data/linkedin-scraper.gemspec +7 -6
- data/spec/fixtures/jeffweiner08.html +308 -0
- data/spec/linkedin_scraper/.DS_Store +0 -0
- data/spec/linkedin_scraper/profile_spec.rb +104 -0
- metadata +27 -24
- data/lib/linkedin-scraper/profile.rb +0 -225
- data/lib/linkedin-scraper.rb +0 -5
- data/spec/fixtures/jgrevich.html +0 -9300
- data/spec/linkedin-scraper/profile_spec.rb +0 -154
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2030446ef750ed1a95c9818d63d0cf97a0cbd60a
|
4
|
+
data.tar.gz: 1639e466dadbee02704a853fe13f0ae10bb42f94
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: dd080bec613c77eb50a439ccd3628932ba0b9ed0ddf7b5e03781036d89722f423e1c439d8b9a08e49e10cf50744926cbae05cc5218ce71a6b923ff793b118b93
|
7
|
+
data.tar.gz: b796724e23fb34f49c3f1012c97c9bd2a0d38372d652699095a1e0592f918a9778caaf9ecacce51217289a6500a49d2319bdf1a8e5ab54fe4d0ccc5e9afc64b3
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -5,11 +5,11 @@ Linkedin Scraper
|
|
5
5
|
================
|
6
6
|
|
7
7
|
Linkedin-scraper is a gem for scraping linkedin public profiles.
|
8
|
-
Given the URL of the profile, it gets the name, country, title, area, current companies, past
|
8
|
+
Given the URL of the profile, it gets the name, country, title, area, current companies, past companies,
|
9
|
+
organizations, skills, groups, etc
|
9
10
|
|
10
11
|
|
11
|
-
##Installation
|
12
|
-
|
12
|
+
## Installation
|
13
13
|
|
14
14
|
Install the gem from RubyGems:
|
15
15
|
|
@@ -17,7 +17,7 @@ Install the gem from RubyGems:
|
|
17
17
|
|
18
18
|
This gem is tested on 1.9.2, 1.9.3, 2.0.0, JRuby1.9, and rbx1.9.
|
19
19
|
|
20
|
-
##Usage
|
20
|
+
## Usage
|
21
21
|
|
22
22
|
|
23
23
|
Initialize a scraper instance
|
@@ -59,7 +59,7 @@ The returning object responds to the following methods
|
|
59
59
|
|
60
60
|
profile.certifications # Array of certifications
|
61
61
|
|
62
|
-
For current and past comapnies it also provides the details of the companies like company size, industry, address, etc
|
62
|
+
For current and past companies it also provides the details of the companies like company size, industry, address, etc
|
63
63
|
|
64
64
|
profile.current_companies
|
65
65
|
|
@@ -252,8 +252,18 @@ For current and past comapnies it also provides the details of the companies lik
|
|
252
252
|
]
|
253
253
|
|
254
254
|
|
255
|
-
The gem also comes with a binary and can be used from the command line to get a json response of the scraped data.
|
255
|
+
The gem also comes with a binary and can be used from the command line to get a json response of the scraped data.
|
256
|
+
It takes the url as the first argument.
|
256
257
|
|
257
258
|
linkedin-scraper http://www.linkedin.com/in/jeffweiner08
|
258
259
|
|
259
|
-
|
260
|
+
## Contributing
|
261
|
+
|
262
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/yatish27/linkedin-scraper.
|
263
|
+
This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the
|
264
|
+
[Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
265
|
+
|
266
|
+
|
267
|
+
## License
|
268
|
+
|
269
|
+
The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
|
data/bin/linkedin-scraper
CHANGED
@@ -0,0 +1,243 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
module Linkedin
  # Scrapes a public LinkedIn profile page and exposes its sections as plain
  # Ruby values (strings, arrays and hashes).
  #
  # The profile page is downloaded once, when the object is built. Every reader
  # parses the already-downloaded document and memoizes its result. The company
  # readers (#current_companies / #past_companies) additionally download one
  # company page for each position that links to one.
  class Profile

    # Aliases handed to Mechanize's +user_agent_alias=+; one is picked at random
    # for every HTTP client that gets built.
    USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"].freeze

    # Names of the public readers that #to_json serializes, in output order.
    ATTRIBUTES = %w(
      name
      first_name
      last_name
      title
      location
      country
      industry
      summary
      picture
      projects
      linkedin_url
      education
      groups
      websites
      languages
      skills
      certifications
      organizations
      past_companies
      current_companies
      recommended_visitors).freeze

    attr_reader :page, :linkedin_url

    # Builds a profile for +url+ without letting errors escape.
    #
    # @param url [String] address of the public profile page
    # @return [Linkedin::Profile, nil] the profile, or nil when building it
    #   raised; in that case the error is printed to standard output
    def self.get_profile(url)
      Linkedin::Profile.new(url)
    rescue => e
      puts e
      nil
    end

    # Downloads the profile page right away.
    #
    # @param url [String] address of the public profile page
    def initialize(url)
      @linkedin_url = url
      @page = http_client.get(url)
    end

    # @return [String] first and last name joined by a single space; parts that
    #   are missing are left out, so the result carries no stray whitespace
    def name
      [first_name, last_name].compact.join(" ")
    end

    # @return [String, nil] text before the first whitespace of ".full-name"
    def first_name
      @first_name ||= name_parts[0]
    end

    # @return [String, nil] everything after the first name; nil when the
    #   name consists of a single word
    def last_name
      @last_name ||= name_parts[1]
    end

    # @return [String, nil] whitespace-normalized text of ".title"
    def title
      @title ||= text_at(@page, ".title")
    end

    # @return [String, nil] first comma-separated part of ".locality"
    def location
      @location ||= locality_parts.first
    end

    # @return [String, nil] last comma-separated part of ".locality"
    def country
      @country ||= locality_parts.last
    end

    # @return [String, nil] whitespace-normalized text of ".industry"
    def industry
      @industry ||= text_at(@page, ".industry")
    end

    # @return [String, nil] whitespace-normalized text of ".summary .description"
    def summary
      @summary ||= text_at(@page, ".summary .description")
    end

    # @return [String, nil] +src+ attribute of the profile picture, if any
    def picture
      @picture ||= begin
        image  = @page.at(".profile-picture img")
        source = image && image.attributes["src"]
        source.value.strip if source
      end
    end

    # @return [Array<String>, nil] skill names; nil when extraction raised
    def skills
      @skills ||= (@page.search(".skill-pill .endorse-item-name-text").map { |skill| skill.text.strip if skill.text } rescue nil)
    end

    # @return [Array<Hash>] positions marked as past (see #get_companies)
    def past_companies
      @past_companies ||= get_companies("past")
    end

    # @return [Array<Hash>] positions marked as current (see #get_companies)
    def current_companies
      @current_companies ||= get_companies("current")
    end

    # @return [Array<Hash>] one hash per education entry with the keys :name,
    #   :description, :degree, :major, :period, :start_date and :end_date.
    #   Dates are returned as the strings found on the page, not as Date objects.
    def education
      @education ||= @page.search(".background-education .education").map do |item|
        # The last <h5> holds degree and major; it may be absent altogether.
        details = item.search("h5").last
        degree  = text_at(details, ".degree")
        degree  = degree.gsub(/,$/, "") if degree
        period  = text_at(item, ".education-date")
        start_date, end_date = split_date_range(period) if period

        {
          :name        => text_at(item, "h4"),
          :description => (squish(details.text) if details),
          :degree      => degree,
          :major       => text_at(details, ".major"),
          :period      => period,
          :start_date  => start_date,
          :end_date    => end_date
        }
      end
    end

    # @return [Array<String>] target URLs taken from the +url+ query parameter
    #   of the first link inside "#overview-summary-websites"; empty when there
    #   is no link or the link carries no query string
    def websites
      @websites ||= @page.search("#overview-summary-websites").flat_map do |site|
        anchor = site.at("a")
        next [] unless anchor

        query = URI.parse("http://www.linkedin.com#{anchor["href"]}").query
        query ? Array(CGI.parse(query)["url"]) : []
      end
    end

    # @return [Array<Hash>] one { :name, :link } hash per ".groups-name" element
    def groups
      @groups ||= @page.search(".groups-name").map do |item|
        {
          :name => squish(item.text),
          :link => "http://www.linkedin.com#{item.at("a")["href"]}"
        }
      end
    end

    # @return [Array<Hash>] one { :name, :start_date, :end_date } hash per
    #   organization; dates are Date objects, or nil when missing/unparseable
    def organizations
      @organizations ||= @page.search("#background-organizations .section-item").map do |item|
        period = text_at(item, ".organizations-date")
        start_date, end_date = split_date_range(period) if period

        # Dates go through #parse_date so that a bare year ("2010") is
        # understood here exactly as it is for projects and companies.
        {
          :name       => text_at(item, ".summary"),
          :start_date => parse_date_or_nil(start_date),
          :end_date   => parse_date_or_nil(end_date)
        }
      end
    end

    # @return [Array<Hash>] one { :language, :proficiency } hash per language
    def languages
      @languages ||= @page.search(".background-languages #languages ol li").map do |item|
        heading = item.at("h4")
        {
          :language    => (heading.text if heading),
          :proficiency => text_at(item, "div.languages-proficiency")
        }
      end
    end

    # @return [Array<Hash>] one { :name, :authority, :license, :start_date }
    #   hash per matched element
    def certifications
      # NOTE(review): unlike every other section this selector has no "." or "#"
      # prefix, so it matches elements whose *tag name* is
      # "background-certifications". That looks unintended -- confirm the markup
      # and adjust the selector. The ".specifics/.licence-number" selector below
      # deserves the same check.
      @certifications ||= @page.search("background-certifications").map do |item|
        license = item.at(".specifics/.licence-number").text.gsub(/\s+|\n/, " ").strip rescue nil

        {
          :name       => text_at(item, "h4"),
          :authority  => text_at(item, "h5"),
          :license    => license,
          :start_date => text_at(item, ".certification-date")
        }
      end
    end

    # @return [Array<Hash>] one { :link, :name, :title, :company } hash per
    #   entry of the ".insights-browse-map" list
    def recommended_visitors
      @recommended_visitors ||= @page.search(".insights-browse-map/ul/li").map do |visitor|
        link = visitor.at("a")["href"]
        name = visitor.at("h4/a").text
        # The headline reads "<title> at <company>", with "..." where truncated.
        headline = visitor.at(".browse-map-title").text.gsub("...", " ").split(" at ")

        { :link => link, :name => name, :title => headline.first, :company => headline[1] }
      end
    end

    # @return [Array<Hash>] one hash per project with the keys :title, :link,
    #   :start_date, :end_date, :description and :associates; any field that
    #   cannot be extracted is nil
    def projects
      @projects ||= @page.search(".background-projects/div").map do |container|
        project = container.at("div")
        start_date, end_date = split_date_range(squish(project.at(".projects-date").text)) rescue nil

        {
          :title       => (project.at("hgroup/h4 span:first-of-type").text rescue nil),
          :link        => (project.at("hgroup/h4 a:first-of-type")['href'] rescue nil),
          :start_date  => parse_date_or_nil(start_date),
          :end_date    => parse_date_or_nil(end_date),
          :description => (project.at(".description").text rescue nil),
          :associates  => (project.at(".associated-list ul").children.map { |c| c.at("a").text } rescue nil)
        }
      end
    end

    # Serializes every reader listed in ATTRIBUTES.
    #
    # Accepts (and forwards) the optional generator arguments the JSON library
    # passes along, so a profile can also be embedded in a larger structure
    # given to JSON.generate.
    #
    # @return [String] JSON object keyed by attribute name
    def to_json(*args)
      require "json" # loaded lazily, only when serialization is requested
      ATTRIBUTES.each_with_object({}) { |attribute, hash| hash[attribute.to_sym] = send(attribute) }.to_json(*args)
    end

    private

    # Collects the positions of the given kind from the experience section.
    #
    # @param type [String] "past" or "current"; interpolated into the
    #   ".<type>-position" selector
    # @return [Array<Hash>] one hash per position with :title, :company and
    #   :description (each only when present), :duration, :start_date and
    #   :end_date, merged with the result of #get_company_details
    def get_companies(type)
      @page.search(".background-experience .#{type}-position").map do |node|
        heading      = node.at("h4")
        company_node = heading && heading.next # the company name follows the heading
        date_node    = node.at(".experience-date-locale")
        date_text    = date_node && date_node.text
        start_date, end_date = split_date_range(date_text.strip) if date_text

        company = {}
        company[:title]   = squish(heading.text) if heading
        company[:company] = squish(company_node.text) if company_node
        description = text_at(node, ".description")
        company[:description] = description if description

        # The duration is whatever sits between the last pair of parentheses.
        company[:duration]   = (date_text[/.*\((.*)\)/, 1] if date_text)
        company[:start_date] = parse_date_or_nil(start_date)
        company[:end_date]   = parse_date_or_nil(end_date)

        anchor = company_node && company_node.at("a")
        company.merge!(get_company_details(anchor && anchor["href"]))
      end
    end

    # Turns a date string into a Date. A bare four-digit year (19xx/20xx) is
    # read as January 1st of that year, because Date.parse would otherwise
    # interpret four digits as month and day.
    #
    # @param date [String] date text; surrounding whitespace is ignored
    # @return [Date]
    # @raise [ArgumentError] when the text is not a recognizable date
    def parse_date(date)
      text = date.to_s.strip
      text = "#{text}-01-01" if text =~ /\A(19|20)\d{2}\z/
      Date.parse(text)
    end

    # Lenient wrapper around #parse_date.
    #
    # @return [Date, nil] nil for nil input or unparseable text
    def parse_date_or_nil(text)
      parse_date(text) unless text.nil?
    rescue ArgumentError, TypeError
      nil
    end

    # Downloads a company page and extracts its details.
    #
    # @param link [String, nil] absolute or site-relative company link
    # @return [Hash] empty when +link+ is nil (nothing is downloaded then);
    #   otherwise :linkedin_company_url plus, when present on the page, :url,
    #   :address and one key per heading of the ".basic-info-about" list
    #   (heading text lower-cased, spaces replaced by underscores)
    def get_company_details(link)
      return {} unless link

      result = { :linkedin_company_url => get_linkedin_company_url(link) }
      page = http_client.get(result[:linkedin_company_url])

      website = page.at(".basic-info-about/ul/li/p/a")
      result[:url] = website.text if website

      info = page.at(".basic-info-about/ul")
      if info
        info.search("p").zip(info.search("h4")).each do |value, heading|
          next unless heading # zip pads with nil when there are fewer headings than values
          result[heading.text.gsub(" ", "_").downcase.to_sym] = value.text.strip
        end
      end

      headquarters = page.at(".vcard.hq")
      address = headquarters && headquarters.at(".adr")
      result[:address] = address.text.gsub("\n", " ").strip if address
      result
    end

    # @return [Mechanize] a fresh agent with a randomly chosen user agent alias
    #   and page history disabled
    def http_client
      Mechanize.new do |agent|
        agent.user_agent_alias = USER_AGENTS.sample
        agent.max_history = 0
      end
    end

    # Makes a company link absolute.
    #
    # @param link [String, nil] link as found on the page
    # @return [String] +link+ itself when it already starts with
    #   http(s)://www.linkedin.com/, otherwise the link appended to
    #   "http://www.linkedin.com/" without doubling the slash
    def get_linkedin_company_url(link)
      target = link.to_s
      return target if target =~ %r{\Ahttps?://www\.linkedin\.com/}

      "http://www.linkedin.com/#{target.sub(%r{\A/+}, "")}"
    end

    # Collapses every run of whitespace into one space and trims the ends.
    def squish(text)
      text.gsub(/\s+/, " ").strip
    end

    # Whitespace-normalized text of the first match of +selector+ below +node+,
    # or nil when +node+ is nil or nothing matches.
    def text_at(node, selector)
      found = node && node.at(selector)
      squish(found.text) if found
    end

    # Splits a "<start> – <end>" period into its two halves.
    def split_date_range(text)
      text.split(" – ")
    end

    # The ".full-name" text split at the first whitespace: [first, rest].
    # Empty when the element is missing or blank.
    def name_parts
      node = @page.at(".full-name")
      node ? node.text.split(" ", 2).map(&:strip) : []
    end

    # The ".locality" text split on commas, each part trimmed.
    # Empty when the element is missing or blank.
    def locality_parts
      node = @page.at(".locality")
      node ? node.text.split(",").map(&:strip) : []
    end
  end
end
|
data/linkedin-scraper.gemspec
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
# -*- encoding: utf-8 -*-
|
2
|
-
require File.expand_path('../lib/
|
2
|
+
require File.expand_path('../lib/linkedin_scraper/version', __FILE__)
|
3
3
|
|
4
4
|
Gem::Specification.new do |gem|
|
5
5
|
gem.authors = ['Yatish Mehta']
|
6
|
-
gem.description = %q{Scrapes the
|
6
|
+
gem.description = %q{Scrapes the LinkedIn profile using the public url }
|
7
7
|
gem.summary = %q{when a url of public linkedin profile page is given it scrapes the entire page and converts into a accessible object}
|
8
8
|
gem.homepage = 'https://github.com/yatishmehta27/linkedin-scraper'
|
9
9
|
gem.files = `git ls-files`.split($\)
|
@@ -13,9 +13,10 @@ Gem::Specification.new do |gem|
|
|
13
13
|
gem.require_paths = ['lib']
|
14
14
|
gem.version = Linkedin::Scraper::VERSION
|
15
15
|
|
16
|
-
gem.
|
17
|
-
|
18
|
-
gem.add_development_dependency 'rspec', '>=0'
|
19
|
-
gem.add_development_dependency 'rake'
|
16
|
+
gem.license = "MIT"
|
20
17
|
|
18
|
+
gem.add_dependency 'mechanize', '~> 2'
|
19
|
+
|
20
|
+
gem.add_development_dependency 'rspec', '~> 3'
|
21
|
+
gem.add_development_dependency 'rake', '~> 10'
|
21
22
|
end
|