linkedin-scraper 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
metadata CHANGED
@@ -1,58 +1,58 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linkedin-scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.5
4
+ version: 0.1.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Yatish Mehta
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-01-04 00:00:00.000000000 Z
11
+ date: 2015-09-14 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: mechanize
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - ">="
17
+ - - "~>"
18
18
  - !ruby/object:Gem::Version
19
- version: '0'
19
+ version: '2'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
- - - ">="
24
+ - - "~>"
25
25
  - !ruby/object:Gem::Version
26
- version: '0'
26
+ version: '2'
27
27
  - !ruby/object:Gem::Dependency
28
28
  name: rspec
29
29
  requirement: !ruby/object:Gem::Requirement
30
30
  requirements:
31
- - - ">="
31
+ - - "~>"
32
32
  - !ruby/object:Gem::Version
33
- version: '0'
33
+ version: '3'
34
34
  type: :development
35
35
  prerelease: false
36
36
  version_requirements: !ruby/object:Gem::Requirement
37
37
  requirements:
38
- - - ">="
38
+ - - "~>"
39
39
  - !ruby/object:Gem::Version
40
- version: '0'
40
+ version: '3'
41
41
  - !ruby/object:Gem::Dependency
42
42
  name: rake
43
43
  requirement: !ruby/object:Gem::Requirement
44
44
  requirements:
45
- - - ">="
45
+ - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '0'
47
+ version: '10'
48
48
  type: :development
49
49
  prerelease: false
50
50
  version_requirements: !ruby/object:Gem::Requirement
51
51
  requirements:
52
- - - ">="
52
+ - - "~>"
53
53
  - !ruby/object:Gem::Version
54
- version: '0'
55
- description: 'Scrapes the linkedin profile when a url is given '
54
+ version: '10'
55
+ description: 'Scrapes the LinkedIn profile using the public url '
56
56
  email:
57
57
  executables:
58
58
  - linkedin-scraper
@@ -67,15 +67,17 @@ files:
67
67
  - README.md
68
68
  - Rakefile
69
69
  - bin/linkedin-scraper
70
- - lib/linkedin-scraper.rb
71
- - lib/linkedin-scraper/profile.rb
72
- - lib/linkedin-scraper/version.rb
70
+ - lib/linkedin_scraper.rb
71
+ - lib/linkedin_scraper/profile.rb
72
+ - lib/linkedin_scraper/version.rb
73
73
  - linkedin-scraper.gemspec
74
- - spec/fixtures/jgrevich.html
75
- - spec/linkedin-scraper/profile_spec.rb
74
+ - spec/fixtures/jeffweiner08.html
75
+ - spec/linkedin_scraper/.DS_Store
76
+ - spec/linkedin_scraper/profile_spec.rb
76
77
  - spec/spec_helper.rb
77
78
  homepage: https://github.com/yatishmehta27/linkedin-scraper
78
- licenses: []
79
+ licenses:
80
+ - MIT
79
81
  metadata: {}
80
82
  post_install_message:
81
83
  rdoc_options: []
@@ -93,12 +95,13 @@ required_rubygems_version: !ruby/object:Gem::Requirement
93
95
  version: '0'
94
96
  requirements: []
95
97
  rubyforge_project:
96
- rubygems_version: 2.4.5
98
+ rubygems_version: 2.4.6
97
99
  signing_key:
98
100
  specification_version: 4
99
101
  summary: when a url of public linkedin profile page is given it scrapes the entire
100
102
  page and converts into a accessible object
101
103
  test_files:
102
- - spec/fixtures/jgrevich.html
103
- - spec/linkedin-scraper/profile_spec.rb
104
+ - spec/fixtures/jeffweiner08.html
105
+ - spec/linkedin_scraper/.DS_Store
106
+ - spec/linkedin_scraper/profile_spec.rb
104
107
  - spec/spec_helper.rb
@@ -1,225 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- module Linkedin
3
-
4
- class Profile
5
-
6
- USER_AGENTS = ['Windows IE 6', 'Windows IE 7', 'Windows Mozilla', 'Mac Safari', 'Mac FireFox', 'Mac Mozilla', 'Linux Mozilla', 'Linux Firefox', 'Linux Konqueror']
7
-
8
- ATTRIBUTES = %w(
9
- name
10
- first_name
11
- last_name
12
- title
13
- location
14
- country
15
- industry
16
- summary
17
- picture
18
- linkedin_url
19
- education
20
- groups
21
- websites
22
- languages
23
- skills
24
- certifications
25
- organizations
26
- past_companies
27
- current_companies
28
- recommended_visitors)
29
-
30
- attr_reader :page, :linkedin_url
31
-
32
- def self.get_profile(url)
33
- Linkedin::Profile.new(url)
34
- rescue => e
35
- puts e
36
- end
37
-
38
- def initialize(url)
39
- @linkedin_url = url
40
- @page = http_client.get(url)
41
- end
42
-
43
- def name
44
- "#{first_name} #{last_name}"
45
- end
46
-
47
- def first_name
48
- @first_name ||= (@page.at('.full-name').text.split(' ', 2)[0].strip if @page.at('.full-name'))
49
- end
50
-
51
- def last_name
52
- @last_name ||= (@page.at('.full-name').text.split(' ', 2)[1].strip if @page.at('.full-name'))
53
- end
54
-
55
- def title
56
- @title ||= (@page.at('.title').text.gsub(/\s+/, ' ').strip if @page.at('.title'))
57
- end
58
-
59
- def location
60
- @location ||= (@page.at('.locality').text.split(',').first.strip if @page.at('.locality'))
61
- end
62
-
63
- def country
64
- @country ||= (@page.at('.locality').text.split(',').last.strip if @page.at('.locality'))
65
- end
66
-
67
- def industry
68
- @industry ||= (@page.at('.industry').text.gsub(/\s+/, ' ').strip if @page.at('.industry'))
69
- end
70
-
71
- def summary
72
- @summary ||= (@page.at('.summary .description').text.gsub(/\s+/, ' ').strip if @page.at('.summary .description'))
73
- end
74
-
75
- def picture
76
- @picture ||= (@page.at('.profile-picture img').attributes['src'].value.strip if @page.at('.profile-picture img'))
77
- end
78
-
79
- def skills
80
- @skills ||= (@page.search('.skill-pill .endorse-item-name-text').map { |skill| skill.text.strip if skill.text } rescue nil)
81
- end
82
-
83
- def past_companies
84
- @past_companies ||= get_companies('past')
85
- end
86
-
87
- def current_companies
88
- @current_companies ||= get_companies('current')
89
- end
90
-
91
- def education
92
- @education ||= @page.search('.background-education .education').map do |item|
93
- name = item.at('h4').text.gsub(/\s+|\n/, ' ').strip if item.at('h4')
94
- desc = item.at('h5').text.gsub(/\s+|\n/, ' ').strip if item.at('h5')
95
- period = item.at('.education-date').text.gsub(/\s+|\n/, ' ').strip if item.at('.education-date')
96
-
97
- {:name => name, :description => desc, :period => period }
98
- end
99
- end
100
-
101
- def websites
102
- @websites ||= @page.search('#overview-summary-websites').flat_map do |site|
103
- url = "http://www.linkedin.com#{site.at('a')['href']}"
104
- CGI.parse(URI.parse(url).query)['url']
105
- end
106
-
107
- end
108
-
109
- def groups
110
- @groups ||= @page.search('.groups-name').map do |item|
111
- name = item.text.gsub(/\s+|\n/, ' ').strip
112
- link = "http://www.linkedin.com#{item.at('a')['href']}"
113
- { :name => name, :link => link }
114
- end
115
- end
116
-
117
- def organizations
118
- @organizations ||= @page.search('#background-organizations .section-item').map do |item|
119
- name = item.at('.summary').text.gsub(/\s+|\n/, ' ').strip rescue nil
120
- start_date, end_date = item.at('.organizations-date').text.gsub(/\s+|\n/, ' ').strip.split(' – ') rescue nil
121
- start_date = Date.parse(start_date) rescue nil
122
- end_date = Date.parse(end_date) rescue nil
123
- { :name => name, :start_date => start_date, :end_date => end_date }
124
- end
125
- end
126
-
127
- def languages
128
- @languages ||= @page.search('.background-languages #languages ol li').map do |item|
129
- language = item.at('h4').text rescue nil
130
- proficiency = item.at('div.languages-proficiency').text.gsub(/\s+|\n/, ' ').strip rescue nil
131
- { :language => language, :proficiency => proficiency }
132
- end
133
- end
134
-
135
- def certifications
136
- @certifications ||= @page.search('background-certifications').map do |item|
137
- name = item.at('h4').text.gsub(/\s+|\n/, ' ').strip rescue nil
138
- authority = item.at('h5').text.gsub(/\s+|\n/, ' ').strip rescue nil
139
- license = item.at('.specifics/.licence-number').text.gsub(/\s+|\n/, ' ').strip rescue nil
140
- start_date = item.at('.certification-date').text.gsub(/\s+|\n/, ' ').strip rescue nil
141
-
142
- { :name => name, :authority => authority, :license => license, :start_date => start_date }
143
- end
144
- end
145
-
146
-
147
- def recommended_visitors
148
- @recommended_visitors ||= @page.search('.insights-browse-map/ul/li').map do |visitor|
149
- v = {}
150
- v[:link] = visitor.at('a')['href']
151
- v[:name] = visitor.at('h4/a').text
152
- v[:title] = visitor.at('.browse-map-title').text.gsub('...', ' ').split(' at ').first
153
- v[:company] = visitor.at('.browse-map-title').text.gsub('...', ' ').split(' at ')[1]
154
- v
155
- end
156
- end
157
-
158
- def to_json
159
- require 'json'
160
- ATTRIBUTES.reduce({}){ |hash,attr| hash[attr.to_sym] = self.send(attr.to_sym);hash }.to_json
161
- end
162
-
163
- private
164
-
165
- def get_companies(type)
166
- companies = []
167
- if @page.search(".background-experience .#{type}-position").first
168
- @page.search(".background-experience .#{type}-position").each do |node|
169
-
170
- company = {}
171
- company[:title] = node.at('h4').text.gsub(/\s+|\n/, ' ').strip if node.at('h4')
172
- company[:company] = node.at('h4').next.text.gsub(/\s+|\n/, ' ').strip if node.at('h4').next
173
- company[:description] = node.at(".description").text.gsub(/\s+|\n/, ' ').strip if node.at(".description")
174
-
175
- start_date, end_date = node.at('.experience-date-locale').text.strip.split(" – ") rescue nil
176
- company[:start_date] = parse_date(start_date) rescue nil
177
- company[:end_date] = parse_date(end_date) rescue nil
178
-
179
- company_link = node.at('h4').next.at('a')['href'] if node.at('h4').next.at('a')
180
-
181
- result = get_company_details(company_link)
182
- companies << company.merge!(result)
183
- end
184
- end
185
- companies
186
- end
187
-
188
- def parse_date(date)
189
- date = "#{date}-01-01" if date =~ /^(19|20)\d{2}$/
190
- Date.parse(date)
191
- end
192
-
193
- def get_company_details(link)
194
- result = { :linkedin_company_url => get_linkedin_company_url(link) }
195
- page = http_client.get(result[:linkedin_company_url])
196
-
197
- result[:url] = page.at('.basic-info-about/ul/li/p/a').text if page.at('.basic-info-about/ul/li/p/a')
198
- node_2 = page.at('.basic-info-about/ul')
199
- if node_2
200
- node_2.search('p').zip(node_2.search('h4')).each do |value, title|
201
- result[title.text.gsub(' ', '_').downcase.to_sym] = value.text.strip
202
- end
203
- end
204
- result[:address] = page.at('.vcard.hq').at('.adr').text.gsub("\n", ' ').strip if page.at('.vcard.hq')
205
- result
206
- end
207
-
208
- def http_client
209
- Mechanize.new do |agent|
210
- agent.user_agent_alias = USER_AGENTS.sample
211
- agent.max_history = 0
212
- end
213
- end
214
-
215
- def get_linkedin_company_url(link)
216
- http = %r{http://www.linkedin.com/}
217
- https = %r{https://www.linkedin.com/}
218
- if http.match(link) || https.match(link)
219
- link
220
- else
221
- "http://www.linkedin.com/#{link}"
222
- end
223
- end
224
- end
225
- end
@@ -1,5 +0,0 @@
1
- require 'rubygems'
2
- require 'mechanize'
3
- require 'cgi'
4
- require 'net/http'
5
- Dir["#{File.expand_path(File.dirname(__FILE__))}/linkedin-scraper/*.rb"].each { |file| require file }