linkedin-scraper 0.0.6 → 0.0.7
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +76 -23
- data/lib/linkedin-scraper.rb +2 -1
- data/lib/linkedin-scraper/profile.rb +136 -26
- data/lib/linkedin-scraper/version.rb +1 -1
- metadata +2 -2
data/README.rdoc
CHANGED
@@ -32,34 +32,87 @@ Then you can see the scraped data like this:
|
|
32
32
|
profile.industry #the domain to which the contact belongs
|
33
33
|
|
34
34
|
profile.past_companies
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
35
|
+
#Array of hashes containing the contact's past companies and job profiles
|
36
|
+
#Example
|
37
|
+
# [
|
38
|
+
# [0] {
|
39
|
+
# :past_company => "Consumyze Software",
|
40
|
+
# :past_title => "Trainee",
|
41
|
+
# :past_company_website => "http://www.consumyze.com",
|
42
|
+
# :description => "Responsible for design and development"
|
43
|
+
# },
|
44
|
+
# [1] {
|
45
|
+
# :past_company => "SunGard Global Services",
|
46
|
+
# :past_title => "Project Intern",
|
47
|
+
# :past_company_website => "http://www.sungard.com/globalservices/learnmore",
|
48
|
+
# :description => "Fame PassPoint. Developed an entirely Ajax based online control panel for user management and Data access for Fame"
|
49
|
+
# }
|
50
|
+
# ]
|
47
51
|
profile.current_companies
|
48
52
|
#Array of hashes containing the contact's current companies and job profiles
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
53
|
+
#Example
|
54
|
+
# [
|
55
|
+
# [0] {
|
56
|
+
# :current_title => "Intern",
|
57
|
+
# :current_company => "Sungard"
|
58
|
+
# :current_company_url=>"http://www.betterlabs.net",
|
59
|
+
# :description=>"Responsible for design and development of projects on Ruby on Rails."
|
60
|
+
# },
|
61
|
+
# [1] {
|
62
|
+
# :current_title => "Software Developer",
|
63
|
+
# :current_company => "Microsoft"
|
64
|
+
# :current_company_url =>"http://www.microsoft.net",
|
65
|
+
# :description =>"Development and design"
|
66
|
+
|
67
|
+
# }
|
68
|
+
# ]
|
60
69
|
|
61
70
|
|
62
71
|
profile.linkedin_url #url of the profile
|
72
|
+
|
73
|
+
profile.websites
|
74
|
+
#Array of websites
|
75
|
+
#[
|
76
|
+
# [0] "http://www.yatishmehta.in"
|
77
|
+
#]
|
78
|
+
|
79
|
+
profile.groups
|
80
|
+
#array of hashes containing group name and link
|
81
|
+
# [
|
82
|
+
# [ 0] {
|
83
|
+
# :name => "Business on Rails",
|
84
|
+
# :link => "http://www.linkedin.com/groups/Business-on-Rails-27822"
|
85
|
+
# },
|
86
|
+
# [ 1] {
|
87
|
+
# :name => "HTML5 Technologies",
|
88
|
+
# :link => "http://www.linkedin.com/groups/HTML5-Technologies-2868882"
|
89
|
+
# },
|
90
|
+
# [ 2] {
|
91
|
+
# :name => "India on Rails",
|
92
|
+
# :link => "http://www.linkedin.com/groups/India-on-Rails-149940"
|
93
|
+
# :name => "Open Source",
|
94
|
+
# :link => "http://www.linkedin.com/groups?gid=43875"
|
95
|
+
# },
|
96
|
+
# [ 4] {
|
97
|
+
# :name => "Rails Developers",
|
98
|
+
# :link => "http://www.linkedin.com/groups?gid=77764"
|
99
|
+
# },
|
100
|
+
# ]
|
101
|
+
|
102
|
+
profile.education
|
103
|
+
#Array of hashes for education
|
104
|
+
# [
|
105
|
+
# [0] {
|
106
|
+
# :name => "Vishwakarma Institute of Technology",
|
107
|
+
# :description => "B.Tech, Computer Engineering",
|
108
|
+
# :period => "2007 – 2011"
|
109
|
+
# },
|
110
|
+
# [1] {
|
111
|
+
# :name => "St Ursula's High School",
|
112
|
+
# :description => "Secondary School Education",
|
113
|
+
# :period => nil
|
114
|
+
# }
|
115
|
+
# ]
|
63
116
|
|
64
117
|
profile.recommended_visitors
|
65
118
|
#It's the list of visitors "Viewers of this profile also viewed..."
|
data/lib/linkedin-scraper/profile.rb
CHANGED
@@ -3,22 +3,68 @@ module Linkedin
|
|
3
3
|
class Profile
|
4
4
|
#the First name of the contact
|
5
5
|
attr_accessor :first_name,:last_name,:title,:location,:country,
|
6
|
-
|
7
|
-
|
6
|
+
:industry, :linkedin_url,:recommended_visitors,:page
|
7
|
+
#Array of hashes for education
|
8
|
+
# [
|
9
|
+
# [0] {
|
10
|
+
# :name => "Vishwakarma Institute of Technology",
|
11
|
+
# :description => "B.Tech, Computer Engineering",
|
12
|
+
# :period => "2007 – 2011"
|
13
|
+
# },
|
14
|
+
# [1] {
|
15
|
+
# :name => "St Ursula's High School",
|
16
|
+
# :description => "Secondary School Education",
|
17
|
+
# :period => nil
|
18
|
+
# }
|
19
|
+
# ]
|
20
|
+
attr_accessor :education
|
21
|
+
|
22
|
+
#Array of websites
|
23
|
+
#[
|
24
|
+
#[0] "http://www.yatishmehta.in"
|
25
|
+
#]
|
26
|
+
attr_accessor :websites
|
27
|
+
#array of hashes containing group name and link
|
28
|
+
# [
|
29
|
+
# [ 0] {
|
30
|
+
# :name => "Business on Rails",
|
31
|
+
# :link => "http://www.linkedin.com/groups/Business-on-Rails-27822"
|
32
|
+
# },
|
33
|
+
# [ 1] {
|
34
|
+
# :name => "HTML5 Technologies",
|
35
|
+
# :link => "http://www.linkedin.com/groups/HTML5-Technologies-2868882"
|
36
|
+
# },
|
37
|
+
# [ 2] {
|
38
|
+
# :name => "India on Rails",
|
39
|
+
# :link => "http://www.linkedin.com/groups/India-on-Rails-149940"
|
40
|
+
# :name => "Open Source",
|
41
|
+
# :link => "http://www.linkedin.com/groups?gid=43875"
|
42
|
+
# },
|
43
|
+
# [ 4] {
|
44
|
+
# :name => "Rails Developers",
|
45
|
+
# :link => "http://www.linkedin.com/groups?gid=77764"
|
46
|
+
# },
|
47
|
+
# ]
|
48
|
+
attr_accessor:groups
|
8
49
|
|
9
50
|
|
10
51
|
#Array of hashes containing the contact's past companies and job profiles
|
11
52
|
#Example
|
12
53
|
# [
|
13
|
-
#
|
14
|
-
#
|
15
|
-
#
|
16
|
-
#
|
17
|
-
#
|
18
|
-
#
|
19
|
-
#
|
20
|
-
#
|
21
|
-
#
|
54
|
+
# [0] {
|
55
|
+
# :past_company => "Consumyze Software",
|
56
|
+
# :past_title => "Trainee",
|
57
|
+
# :past_company_website => "http://www.consumyze.com",
|
58
|
+
# :description => "Responsible for design and development"
|
59
|
+
# },
|
60
|
+
# [1] {
|
61
|
+
# :past_company => "SunGard Global Services",
|
62
|
+
# :past_title => "Project Intern",
|
63
|
+
# :past_company_website => "http://www.sungard.com/globalservices/learnmore",
|
64
|
+
# :description => "Fame PassPoint. Developed an entirely Ajax based online control panel for user management and Data access for Fame"
|
65
|
+
# }
|
66
|
+
# ]
|
67
|
+
|
22
68
|
|
23
69
|
attr_accessor :past_companies
|
24
70
|
#Array of hashes containing the contact's current companies and job profiles
|
@@ -27,10 +73,15 @@ module Linkedin
|
|
27
73
|
# [0] {
|
28
74
|
# :current_title => "Intern",
|
29
75
|
# :current_company => "Sungard"
|
76
|
+
# :current_company_url=>"http://www.betterlabs.net",
|
77
|
+
# :description=>"Responsible for design and development of projects on Ruby on Rails."
|
30
78
|
# },
|
31
79
|
# [1] {
|
32
|
-
# :current_title
|
33
|
-
# :current_company
|
80
|
+
# :current_title => "Software Developer",
|
81
|
+
# :current_company => "Microsoft"
|
82
|
+
# :current_company_url =>"http://www.microsoft.net",
|
83
|
+
# :description =>"Development and design"
|
84
|
+
|
34
85
|
# }
|
35
86
|
# ]
|
36
87
|
attr_accessor :current_companies
|
@@ -47,10 +98,14 @@ module Linkedin
|
|
47
98
|
@current_companies=get_current_companies page
|
48
99
|
@past_companies=get_past_companies page
|
49
100
|
@recommended_visitors=get_recommended_visitors page
|
101
|
+
@education=get_education page
|
50
102
|
@linkedin_url=url
|
103
|
+
@websites=get_websites page
|
104
|
+
@groups=get_groups page
|
51
105
|
@page=page
|
52
106
|
end
|
53
107
|
#returns nil if the request results in a 404
|
108
|
+
|
54
109
|
def self.get_profile url
|
55
110
|
begin
|
56
111
|
@agent=Mechanize.new
|
@@ -63,6 +118,17 @@ module Linkedin
|
|
63
118
|
end
|
64
119
|
end
|
65
120
|
|
121
|
+
def get_company_url node
|
122
|
+
if node.at("h4/strong/a")
|
123
|
+
link=node.at("h4/strong/a")["href"]
|
124
|
+
@agent=Mechanize.new
|
125
|
+
@agent.user_agent_alias = USER_AGENTS.sample
|
126
|
+
@agent.max_history = 0
|
127
|
+
page=@agent.get("http://www.linkedin.com"+link)
|
128
|
+
url=page.at(".basic-info/div/dl/dd/a").text if page.at(".basic-info/div/dl/dd/a")
|
129
|
+
end
|
130
|
+
end
|
131
|
+
|
66
132
|
private
|
67
133
|
|
68
134
|
def get_first_name page
|
@@ -91,12 +157,13 @@ module Linkedin
|
|
91
157
|
|
92
158
|
def get_past_companies page
|
93
159
|
past_cs=[]
|
94
|
-
if page.search(".past").first
|
95
|
-
page.search(".past").
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
160
|
+
if page.search(".position.experience.vevent.vcard.summary-past").first
|
161
|
+
page.search(".position.experience.vevent.vcard.summary-past").each do |past_company|
|
162
|
+
url=get_company_url past_company
|
163
|
+
title=past_company.at("h3").text.gsub(/\s+|\n/, " ").strip if past_company.at("h3")
|
164
|
+
company=past_company.at("h4").text.gsub(/\s+|\n/, " ").strip if past_company.at("h4")
|
165
|
+
description=past_company.at(".description.past-position").text.gsub(/\s+|\n/, " ").strip if past_company.at(".description.past-position")
|
166
|
+
past_company={:past_company=>company,:past_title=> title,:past_company_website=>url,:description=>description}
|
100
167
|
past_cs<<past_company
|
101
168
|
end
|
102
169
|
return past_cs
|
@@ -105,18 +172,62 @@ module Linkedin
|
|
105
172
|
|
106
173
|
def get_current_companies page
|
107
174
|
current_cs=[]
|
108
|
-
if page.search(".current").first
|
109
|
-
page.search(".current").
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
175
|
+
if page.search(".position.experience.vevent.vcard.summary-current").first
|
176
|
+
page.search(".position.experience.vevent.vcard.summary-current").each do |current_company|
|
177
|
+
url=get_company_url current_company
|
178
|
+
title=current_company.at("h3").text.gsub(/\s+|\n/, " ").strip if current_company.at("h3")
|
179
|
+
company=current_company.at("h4").text.gsub(/\s+|\n/, " ").strip if current_company.at("h4")
|
180
|
+
description=current_company.at(".description.current-position").text.gsub(/\s+|\n/, " ").strip if current_company.at(".description.current-position")
|
181
|
+
current_company={:current_company=>company,:current_title=> title,:current_company_url=>url,:description=>description}
|
114
182
|
current_cs<<current_company
|
115
183
|
end
|
116
184
|
return current_cs
|
117
185
|
end
|
118
186
|
end
|
119
187
|
|
188
|
+
def get_education page
|
189
|
+
education=[]
|
190
|
+
if page.search(".position.education.vevent.vcard").first
|
191
|
+
page.search(".position.education.vevent.vcard").each do |item|
|
192
|
+
name=item.at("h3").text.gsub(/\s+|\n/, " ").strip if item.at("h3")
|
193
|
+
desc=item.at("h4").text.gsub(/\s+|\n/, " ").strip if item.at("h4")
|
194
|
+
period=item.at(".period").text.gsub(/\s+|\n/, " ").strip if item.at(".period")
|
195
|
+
edu={:name=>name,:description=>desc,:period=>period}
|
196
|
+
education<<edu
|
197
|
+
end
|
198
|
+
return education
|
199
|
+
end
|
200
|
+
end
|
201
|
+
|
202
|
+
def get_websites page
|
203
|
+
websites=[]
|
204
|
+
if page.search(".website").first
|
205
|
+
page.search(".website").each do |site|
|
206
|
+
url=site.at("a")["href"]
|
207
|
+
url="http://www.linkedin.com"+url
|
208
|
+
url=CGI.parse(URI.parse(url).query)["url"]
|
209
|
+
websites<<url
|
210
|
+
end
|
211
|
+
return websites.flatten!
|
212
|
+
end
|
213
|
+
end
|
214
|
+
|
215
|
+
def get_groups page
|
216
|
+
groups=[]
|
217
|
+
if page.search(".group-data").first
|
218
|
+
page.search(".group-data").each do |item|
|
219
|
+
name=item.text.gsub(/\s+|\n/, " ").strip
|
220
|
+
link="http://www.linkedin.com"+item.at("a")["href"]
|
221
|
+
groups<<{:name=>name,:link=>link}
|
222
|
+
end
|
223
|
+
return groups
|
224
|
+
end
|
225
|
+
|
226
|
+
end
|
227
|
+
|
228
|
+
|
229
|
+
|
230
|
+
|
120
231
|
def get_recommended_visitors page
|
121
232
|
recommended_vs=[]
|
122
233
|
if page.search(".browsemap").first
|
@@ -130,7 +241,6 @@ module Linkedin
|
|
130
241
|
end
|
131
242
|
return recommended_vs
|
132
243
|
end
|
133
|
-
|
134
244
|
end
|
135
245
|
end
|
136
246
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedin-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.6
|
4
|
+
version: 0.0.7
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-08-02 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|