linkedin-scraper 0.1.5 → 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
metadata CHANGED
@@ -1,58 +1,58 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linkedin-scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yatish Mehta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-04 00:00:00.000000000 Z
11
+ date: 2015-09-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ">="
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '0'
19
+ version: '2'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ">="
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '0'
26
+ version: '2'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rspec
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ">="
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '0'
33
+ version: '3'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ">="
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '0'
40
+ version: '3'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rake
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ">="
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '0'
47
+ version: '10'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ">="
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '0'
55
- description: 'Scrapes the linkedin profile when a url is given '
54
+ version: '10'
55
+ description: 'Scrapes the LinkedIn profile using the public url '
56
56
  email:
57
57
  executables:
58
58
  - linkedin-scraper
@@ -67,15 +67,17 @@ files:
67
67
  - README.md
68
68
  - Rakefile
69
69
  - bin/linkedin-scraper
70
- - lib/linkedin-scraper.rb
71
- - lib/linkedin-scraper/profile.rb
72
- - lib/linkedin-scraper/version.rb
70
+ - lib/linkedin_scraper.rb
71
+ - lib/linkedin_scraper/profile.rb
72
+ - lib/linkedin_scraper/version.rb
73
73
  - linkedin-scraper.gemspec
74
- - spec/fixtures/jgrevich.html
75
- - spec/linkedin-scraper/profile_spec.rb
74
+ - spec/fixtures/jeffweiner08.html
75
+ - spec/linkedin_scraper/.DS_Store
76
+ - spec/linkedin_scraper/profile_spec.rb
76
77
  - spec/spec_helper.rb
77
78
  homepage: https://github.com/yatishmehta27/linkedin-scraper
78
- licenses: []
79
+ licenses:
80
+ - MIT
79
81
  metadata: {}
80
82
  post_install_message:
81
83
  rdoc_options: []
@@ -93,12 +95,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
93
95
  version: '0'
94
96
  requirements: []
95
97
  rubyforge_project:
96
- rubygems_version: 2.4.5
98
+ rubygems_version: 2.4.6
97
99
  signing_key:
98
100
  specification_version: 4
99
101
  summary: when a url of public linkedin profile page is given it scrapes the entire
100
102
  page and converts into a accessible object
101
103
  test_files:
102
- - spec/fixtures/jgrevich.html
103
- - spec/linkedin-scraper/profile_spec.rb
104
+ - spec/fixtures/jeffweiner08.html
105
+ - spec/linkedin_scraper/.DS_Store
106
+ - spec/linkedin_scraper/profile_spec.rb
104
107
  - spec/spec_helper.rb
@@ -1,225 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- module Linkedin
3
-
4
- class Profile
5
-
6
- USER_AGENTS = ['Windows IE 6', 'Windows IE 7', 'Windows Mozilla', 'Mac Safari', 'Mac FireFox', 'Mac Mozilla', 'Linux Mozilla', 'Linux Firefox', 'Linux Konqueror']
7
-
8
- ATTRIBUTES = %w(
9
- name
10
- first_name
11
- last_name
12
- title
13
- location
14
- country
15
- industry
16
- summary
17
- picture
18
- linkedin_url
19
- education
20
- groups
21
- websites
22
- languages
23
- skills
24
- certifications
25
- organizations
26
- past_companies
27
- current_companies
28
- recommended_visitors)
29
-
30
- attr_reader :page, :linkedin_url
31
-
32
- def self.get_profile(url)
33
- Linkedin::Profile.new(url)
34
- rescue => e
35
- puts e
36
- end
37
-
38
- def initialize(url)
39
- @linkedin_url = url
40
- @page = http_client.get(url)
41
- end
42
-
43
- def name
44
- "#{first_name} #{last_name}"
45
- end
46
-
47
- def first_name
48
- @first_name ||= (@page.at('.full-name').text.split(' ', 2)[0].strip if @page.at('.full-name'))
49
- end
50
-
51
- def last_name
52
- @last_name ||= (@page.at('.full-name').text.split(' ', 2)[1].strip if @page.at('.full-name'))
53
- end
54
-
55
- def title
56
- @title ||= (@page.at('.title').text.gsub(/\s+/, ' ').strip if @page.at('.title'))
57
- end
58
-
59
- def location
60
- @location ||= (@page.at('.locality').text.split(',').first.strip if @page.at('.locality'))
61
- end
62
-
63
- def country
64
- @country ||= (@page.at('.locality').text.split(',').last.strip if @page.at('.locality'))
65
- end
66
-
67
- def industry
68
- @industry ||= (@page.at('.industry').text.gsub(/\s+/, ' ').strip if @page.at('.industry'))
69
- end
70
-
71
- def summary
72
- @summary ||= (@page.at('.summary .description').text.gsub(/\s+/, ' ').strip if @page.at('.summary .description'))
73
- end
74
-
75
- def picture
76
- @picture ||= (@page.at('.profile-picture img').attributes['src'].value.strip if @page.at('.profile-picture img'))
77
- end
78
-
79
- def skills
80
- @skills ||= (@page.search('.skill-pill .endorse-item-name-text').map { |skill| skill.text.strip if skill.text } rescue nil)
81
- end
82
-
83
- def past_companies
84
- @past_companies ||= get_companies('past')
85
- end
86
-
87
- def current_companies
88
- @current_companies ||= get_companies('current')
89
- end
90
-
91
- def education
92
- @education ||= @page.search('.background-education .education').map do |item|
93
- name = item.at('h4').text.gsub(/\s+|\n/, ' ').strip if item.at('h4')
94
- desc = item.at('h5').text.gsub(/\s+|\n/, ' ').strip if item.at('h5')
95
- period = item.at('.education-date').text.gsub(/\s+|\n/, ' ').strip if item.at('.education-date')
96
-
97
- {:name => name, :description => desc, :period => period }
98
- end
99
- end
100
-
101
- def websites
102
- @websites ||= @page.search('#overview-summary-websites').flat_map do |site|
103
- url = "http://www.linkedin.com#{site.at('a')['href']}"
104
- CGI.parse(URI.parse(url).query)['url']
105
- end
106
-
107
- end
108
-
109
- def groups
110
- @groups ||= @page.search('.groups-name').map do |item|
111
- name = item.text.gsub(/\s+|\n/, ' ').strip
112
- link = "http://www.linkedin.com#{item.at('a')['href']}"
113
- { :name => name, :link => link }
114
- end
115
- end
116
-
117
- def organizations
118
- @organizations ||= @page.search('#background-organizations .section-item').map do |item|
119
- name = item.at('.summary').text.gsub(/\s+|\n/, ' ').strip rescue nil
120
- start_date, end_date = item.at('.organizations-date').text.gsub(/\s+|\n/, ' ').strip.split(' – ') rescue nil
121
- start_date = Date.parse(start_date) rescue nil
122
- end_date = Date.parse(end_date) rescue nil
123
- { :name => name, :start_date => start_date, :end_date => end_date }
124
- end
125
- end
126
-
127
- def languages
128
- @languages ||= @page.search('.background-languages #languages ol li').map do |item|
129
- language = item.at('h4').text rescue nil
130
- proficiency = item.at('div.languages-proficiency').text.gsub(/\s+|\n/, ' ').strip rescue nil
131
- { :language => language, :proficiency => proficiency }
132
- end
133
- end
134
-
135
- def certifications
136
- @certifications ||= @page.search('background-certifications').map do |item|
137
- name = item.at('h4').text.gsub(/\s+|\n/, ' ').strip rescue nil
138
- authority = item.at('h5').text.gsub(/\s+|\n/, ' ').strip rescue nil
139
- license = item.at('.specifics/.licence-number').text.gsub(/\s+|\n/, ' ').strip rescue nil
140
- start_date = item.at('.certification-date').text.gsub(/\s+|\n/, ' ').strip rescue nil
141
-
142
- { :name => name, :authority => authority, :license => license, :start_date => start_date }
143
- end
144
- end
145
-
146
-
147
- def recommended_visitors
148
- @recommended_visitors ||= @page.search('.insights-browse-map/ul/li').map do |visitor|
149
- v = {}
150
- v[:link] = visitor.at('a')['href']
151
- v[:name] = visitor.at('h4/a').text
152
- v[:title] = visitor.at('.browse-map-title').text.gsub('...', ' ').split(' at ').first
153
- v[:company] = visitor.at('.browse-map-title').text.gsub('...', ' ').split(' at ')[1]
154
- v
155
- end
156
- end
157
-
158
- def to_json
159
- require 'json'
160
- ATTRIBUTES.reduce({}){ |hash,attr| hash[attr.to_sym] = self.send(attr.to_sym);hash }.to_json
161
- end
162
-
163
- private
164
-
165
- def get_companies(type)
166
- companies = []
167
- if @page.search(".background-experience .#{type}-position").first
168
- @page.search(".background-experience .#{type}-position").each do |node|
169
-
170
- company = {}
171
- company[:title] = node.at('h4').text.gsub(/\s+|\n/, ' ').strip if node.at('h4')
172
- company[:company] = node.at('h4').next.text.gsub(/\s+|\n/, ' ').strip if node.at('h4').next
173
- company[:description] = node.at(".description").text.gsub(/\s+|\n/, ' ').strip if node.at(".description")
174
-
175
- start_date, end_date = node.at('.experience-date-locale').text.strip.split(" – ") rescue nil
176
- company[:start_date] = parse_date(start_date) rescue nil
177
- company[:end_date] = parse_date(end_date) rescue nil
178
-
179
- company_link = node.at('h4').next.at('a')['href'] if node.at('h4').next.at('a')
180
-
181
- result = get_company_details(company_link)
182
- companies << company.merge!(result)
183
- end
184
- end
185
- companies
186
- end
187
-
188
- def parse_date(date)
189
- date = "#{date}-01-01" if date =~ /^(19|20)\d{2}$/
190
- Date.parse(date)
191
- end
192
-
193
- def get_company_details(link)
194
- result = { :linkedin_company_url => get_linkedin_company_url(link) }
195
- page = http_client.get(result[:linkedin_company_url])
196
-
197
- result[:url] = page.at('.basic-info-about/ul/li/p/a').text if page.at('.basic-info-about/ul/li/p/a')
198
- node_2 = page.at('.basic-info-about/ul')
199
- if node_2
200
- node_2.search('p').zip(node_2.search('h4')).each do |value, title|
201
- result[title.text.gsub(' ', '_').downcase.to_sym] = value.text.strip
202
- end
203
- end
204
- result[:address] = page.at('.vcard.hq').at('.adr').text.gsub("\n", ' ').strip if page.at('.vcard.hq')
205
- result
206
- end
207
-
208
- def http_client
209
- Mechanize.new do |agent|
210
- agent.user_agent_alias = USER_AGENTS.sample
211
- agent.max_history = 0
212
- end
213
- end
214
-
215
- def get_linkedin_company_url(link)
216
- http = %r{http://www.linkedin.com/}
217
- https = %r{https://www.linkedin.com/}
218
- if http.match(link) || https.match(link)
219
- link
220
- else
221
- "http://www.linkedin.com/#{link}"
222
- end
223
- end
224
- end
225
- end
@@ -1,5 +0,0 @@
1
- require 'rubygems'
2
- require 'mechanize'
3
- require 'cgi'
4
- require 'net/http'
5
- Dir["#{File.expand_path(File.dirname(__FILE__))}/linkedin-scraper/*.rb"].each { |file| require file }