linkedin-scraper 0.0.7 → 0.0.8
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +256 -0
- data/lib/linkedin-scraper/profile.rb +93 -146
- data/lib/linkedin-scraper/version.rb +1 -1
- data/linkedin-scraper.gemspec +1 -4
- data/spec/linkedin-scraper/profile_spec.rb +13 -0
- data/spec/spec_helper.rb +18 -0
- metadata +17 -18
- data/README.rdoc +0 -134
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 4821d7e30aa48eac2fe54f5feaebe664f8c7e65f
|
4
|
+
data.tar.gz: 55f77b9d3dc2cee5a4e0f27b50116b058b585a72
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 83aa94cbaa5de9e2711d2cbb8672c9a81986ac48eb379fdcf99d9a64e3791bd5a945d76ea5266daa9781886de5feb5b1c3a0905107e3ea71894937e455e2aa7f
|
7
|
+
data.tar.gz: 6dbcdab792fb1551d9e2fc79909e94ed5f4fa2da5ece3ef8e0b37307f1bbcf69b1d1c9e8d773fb9ece146f5ff64eba7b04716b71a735ecf998740ecd34865a41
|
data/README.md
ADDED
@@ -0,0 +1,256 @@
|
|
1
|
+
Linkedin Scraper
|
2
|
+
================
|
3
|
+
|
4
|
+
Linkedin-scraper is a gem for scraping linkedin public profiles.
|
5
|
+
You give it a URL, and it lets you easily get its title, name, country, area, current_companies and much more.
|
6
|
+
|
7
|
+
Installation
|
8
|
+
------------
|
9
|
+
|
10
|
+
Install the gem from RubyGems:
|
11
|
+
|
12
|
+
gem install linkedin-scraper
|
13
|
+
|
14
|
+
This gem is tested on Ruby versions 1.8.7, 1.9.2, 1.9.3 and 2.0.0.
|
15
|
+
|
16
|
+
Usage
|
17
|
+
-----
|
18
|
+
|
19
|
+
Initialize a scraper instance for a URL, like this:
|
20
|
+
|
21
|
+
profile = Linkedin::Profile.get_profile("http://www.linkedin.com/in/jeffweiner08")
|
22
|
+
|
23
|
+
Then you can see the scraped data like this:
|
24
|
+
|
25
|
+
|
26
|
+
profile.first_name #the First name of the contact
|
27
|
+
|
28
|
+
profile.last_name #the last name of the contact
|
29
|
+
|
30
|
+
profile.title #the linkedin job title
|
31
|
+
|
32
|
+
profile.location #the location of the contact
|
33
|
+
|
34
|
+
profile.country #the country of the contact
|
35
|
+
|
36
|
+
profile.industry #the industry to which the contact belongs
|
37
|
+
|
38
|
+
profile.picture #the profile pic url of contact
|
39
|
+
|
40
|
+
profile.current_companies
|
41
|
+
|
42
|
+
[
|
43
|
+
[0] {
|
44
|
+
:current_company => "LinkedIn",
|
45
|
+
:current_title => "CEO",
|
46
|
+
:current_company_url => "http://www.linkedin.com",
|
47
|
+
:description => nil,
|
48
|
+
:linkedin_company_url => "http://www.linkedin.com/company/linkedin?trk=ppro_cprof",
|
49
|
+
:url => "http://www.linkedin.com",
|
50
|
+
:type => "Public Company",
|
51
|
+
:company_size => "1001-5000 employees",
|
52
|
+
:website => "http://www.linkedin.com",
|
53
|
+
:industry => "Internet",
|
54
|
+
:founded => "2003",
|
55
|
+
:address => "2029 Stierlin Court Mountain View, CA 94043 United States"
|
56
|
+
},
|
57
|
+
[1] {
|
58
|
+
:current_company => "Intuit",
|
59
|
+
:current_title => "Member, Board of Directors",
|
60
|
+
:current_company_url => "http://network.intuit.com/",
|
61
|
+
:description => nil,
|
62
|
+
:linkedin_company_url => "http://www.linkedin.com/company/intuit?trk=ppro_cprof",
|
63
|
+
:url => "http://network.intuit.com/",
|
64
|
+
:type => "Public Company",
|
65
|
+
:company_size => "5001-10,000 employees",
|
66
|
+
:website => "http://network.intuit.com/",
|
67
|
+
:industry => "Computer Software",
|
68
|
+
:founded => "1983",
|
69
|
+
:address => "2632 Marine Way Mountain View, CA 94043 United States"
|
70
|
+
},
|
71
|
+
[2] {
|
72
|
+
:current_company => "DonorsChoose",
|
73
|
+
:current_title => "Member, Board of Directors",
|
74
|
+
:current_company_url => "http://www.donorschoose.org",
|
75
|
+
:description => nil,
|
76
|
+
:linkedin_company_url => "http://www.linkedin.com/company/donorschoose.org?trk=ppro_cprof",
|
77
|
+
:url => "http://www.donorschoose.org",
|
78
|
+
:type => "Nonprofit",
|
79
|
+
:company_size => "51-200 employees",
|
80
|
+
:website => "http://www.donorschoose.org",
|
81
|
+
:industry => "Nonprofit Organization Management",
|
82
|
+
:founded => "2000",
|
83
|
+
:address => "213 West 35th Street 2nd Floor East New York, NY 10001 United States"
|
84
|
+
},
|
85
|
+
[3] {
|
86
|
+
:current_company => "Malaria No More",
|
87
|
+
:current_title => "Member, Board of Directors",
|
88
|
+
:current_company_url => nil,
|
89
|
+
:description => nil
|
90
|
+
},
|
91
|
+
[4] {
|
92
|
+
:current_company => "Venture For America",
|
93
|
+
:current_title => "Member, Advisory Board",
|
94
|
+
:current_company_url => "http://ventureforamerica.org/",
|
95
|
+
:description => nil,
|
96
|
+
:linkedin_company_url => "http://www.linkedin.com/company/venture-for-america?trk=ppro_cprof",
|
97
|
+
:url => "http://ventureforamerica.org/",
|
98
|
+
:type => "Nonprofit",
|
99
|
+
:company_size => "1-10 employees",
|
100
|
+
:website => "http://ventureforamerica.org/",
|
101
|
+
:industry => "Nonprofit Organization Management",
|
102
|
+
:founded => "2011"
|
103
|
+
}
|
104
|
+
]
|
105
|
+
|
106
|
+
|
107
|
+
profile.past_companies
|
108
|
+
#Array of hashes containing the contact's past companies and job titles
|
109
|
+
#Example
|
110
|
+
[
|
111
|
+
[0] {
|
112
|
+
:past_company => "Accel Partners",
|
113
|
+
:past_title => "Executive in Residence",
|
114
|
+
:past_company_website => "http://www.facebook.com/accel",
|
115
|
+
:description => nil,
|
116
|
+
:linkedin_company_url => "http://www.linkedin.com/company/accel-partners?trk=ppro_cprof",
|
117
|
+
:url => "http://www.facebook.com/accel",
|
118
|
+
:type => "Partnership",
|
119
|
+
:company_size => "51-200 employees",
|
120
|
+
:website => "http://www.facebook.com/accel",
|
121
|
+
:industry => "Venture Capital & Private Equity",
|
122
|
+
:address => "428 University Palo Alto, CA 94301 United States"
|
123
|
+
},
|
124
|
+
[1] {
|
125
|
+
:past_company => "Greylock",
|
126
|
+
:past_title => "Executive in Residence",
|
127
|
+
:past_company_website => "http://www.greylock.com",
|
128
|
+
:description => nil,
|
129
|
+
:linkedin_company_url => "http://www.linkedin.com/company/greylock-partners?trk=ppro_cprof",
|
130
|
+
:url => "http://www.greylock.com",
|
131
|
+
:type => "Partnership",
|
132
|
+
:company_size => "51-200 employees",
|
133
|
+
:website => "http://www.greylock.com",
|
134
|
+
:industry => "Venture Capital & Private Equity",
|
135
|
+
:address => "2550 Sand Hill Road Menlo Park, CA 94025 United States"
|
136
|
+
},
|
137
|
+
[2] {
|
138
|
+
:past_company => "Yahoo!",
|
139
|
+
:past_title => "Executive Vice President Network Division",
|
140
|
+
:past_company_website => "http://www.yahoo.com",
|
141
|
+
:description => nil,
|
142
|
+
:linkedin_company_url => "http://www.linkedin.com/company/yahoo?trk=ppro_cprof",
|
143
|
+
:url => "http://www.yahoo.com",
|
144
|
+
:type => "Public Company",
|
145
|
+
:company_size => "10,001+ employees",
|
146
|
+
:website => "http://www.yahoo.com",
|
147
|
+
:industry => "Internet",
|
148
|
+
:founded => "1994",
|
149
|
+
:address => "701 First Avenue Sunnyvale, CA 94089 United States"
|
150
|
+
},
|
151
|
+
[3] {
|
152
|
+
:past_company => "Windsor Media",
|
153
|
+
:past_title => "Founding Partner",
|
154
|
+
:past_company_website => nil,
|
155
|
+
:description => nil
|
156
|
+
},
|
157
|
+
[4] {
|
158
|
+
:past_company => "Warner Bros.",
|
159
|
+
:past_title => "Vice President Online",
|
160
|
+
:past_company_website => "http://www.warnerbros.com/",
|
161
|
+
:description => nil,
|
162
|
+
:linkedin_company_url => "http://www.linkedin.com/company/warner-bros.-entertainment-group-of-companies?trk=ppro_cprof",
|
163
|
+
:url => "http://www.warnerbros.com/",
|
164
|
+
:type => "Public Company",
|
165
|
+
:company_size => "10,001+ employees",
|
166
|
+
:website => "http://www.warnerbros.com/",
|
167
|
+
:industry => "Entertainment",
|
168
|
+
:address => "4000 Warner Boulevard Burbank, CA 91522 United States"
|
169
|
+
}
|
170
|
+
]
|
171
|
+
|
172
|
+
|
173
|
+
profile.linkedin_url #url of the profile
|
174
|
+
|
175
|
+
profile.websites
|
176
|
+
#Array of websites
|
177
|
+
[
|
178
|
+
[0] "http://www.linkedin.com/"
|
179
|
+
]
|
180
|
+
|
181
|
+
profile.groups
|
182
|
+
#Array of hashes containing group name and link
|
183
|
+
|
184
|
+
|
185
|
+
profile.education
|
186
|
+
#Array of hashes for education
|
187
|
+
|
188
|
+
profile.skills
|
189
|
+
#Array of skills
|
190
|
+
|
191
|
+
profile.picture
|
192
|
+
#url of the profile picture
|
193
|
+
|
194
|
+
|
195
|
+
profile.recommended_visitors
|
196
|
+
#The list of profiles shown under "Viewers of this profile also viewed..."
|
197
|
+
[
|
198
|
+
[0] {
|
199
|
+
:link => "http://www.linkedin.com/in/barackobama?trk=pub-pbmap",
|
200
|
+
:name => "Barack Obama",
|
201
|
+
:title => "President of the United States of ",
|
202
|
+
:company => nil
|
203
|
+
},
|
204
|
+
[1] {
|
205
|
+
:link => "http://www.linkedin.com/in/marissamayer?trk=pub-pbmap",
|
206
|
+
:name => "Marissa Mayer",
|
207
|
+
:title => "Yahoo!, President & CEO",
|
208
|
+
:company => nil
|
209
|
+
},
|
210
|
+
[2] {
|
211
|
+
:link => "http://www.linkedin.com/pub/sean-parker/0/1/826?trk=pub-pbmap",
|
212
|
+
:name => "Sean Parker",
|
213
|
+
:title => nil,
|
214
|
+
:company => nil
|
215
|
+
},
|
216
|
+
[3] {
|
217
|
+
:link => "http://www.linkedin.com/pub/eduardo-saverin/0/70a/31b?trk=pub-pbmap",
|
218
|
+
:name => "Eduardo Saverin",
|
219
|
+
:title => nil,
|
220
|
+
:company => nil
|
221
|
+
},
|
222
|
+
[4] {
|
223
|
+
:link => "http://www.linkedin.com/in/rbranson?trk=pub-pbmap",
|
224
|
+
:name => "Richard Branson",
|
225
|
+
:title => "Founder",
|
226
|
+
:company => "Virgin Group"
|
227
|
+
},
|
228
|
+
[5] {
|
229
|
+
:link => "http://www.linkedin.com/in/reidhoffman?trk=pub-pbmap",
|
230
|
+
:name => "Reid Hoffman",
|
231
|
+
:title => "Entrepreneur. Product Strategist. ",
|
232
|
+
:company => nil
|
233
|
+
},
|
234
|
+
[6] {
|
235
|
+
:link => "http://www.linkedin.com/in/mdell?trk=pub-pbmap",
|
236
|
+
:name => "Michael Dell",
|
237
|
+
:title => "Chairman and CEO",
|
238
|
+
:company => "Dell"
|
239
|
+
},
|
240
|
+
[7] {
|
241
|
+
:link => "http://www.linkedin.com/in/mittromney?trk=pub-pbmap",
|
242
|
+
:name => "Mitt Romney",
|
243
|
+
:title => "Believe in America",
|
244
|
+
:company => nil
|
245
|
+
},
|
246
|
+
[8] {
|
247
|
+
:link => "http://www.linkedin.com/pub/sheryl-sandberg/2/665/512?trk=pub-pbmap",
|
248
|
+
:name => "Sheryl Sandberg",
|
249
|
+
:title => nil,
|
250
|
+
:company => nil
|
251
|
+
}
|
252
|
+
]
|
253
|
+
|
254
|
+
|
255
|
+
|
256
|
+
You're welcome to fork this project and send pull requests. I want to thank specially:
|
@@ -1,132 +1,76 @@
|
|
1
|
-
|
1
|
+
# -*- coding: utf-8 -*-
|
2
2
|
module Linkedin
|
3
|
-
class Profile
|
4
|
-
#the First name of the contact
|
5
|
-
attr_accessor :first_name,:last_name,:title,:location,:country,
|
6
|
-
:industry, :linkedin_url,:recommended_visitors,:page
|
7
|
-
#Array of hashes for eduction
|
8
|
-
# [
|
9
|
-
# [0] {
|
10
|
-
# :name => "Vishwakarma Institute of Technology",
|
11
|
-
# :description => "B.Tech, Computer Engineering",
|
12
|
-
# :period => "2007 – 2011"
|
13
|
-
# },
|
14
|
-
# [1] {
|
15
|
-
# :name => "St Ursula's High School",
|
16
|
-
# :description => "Secondary School Education",
|
17
|
-
# :period => nil
|
18
|
-
# }
|
19
|
-
# ]
|
20
|
-
attr_accessor :education
|
3
|
+
class Profile
|
21
4
|
|
22
|
-
|
23
|
-
#[
|
24
|
-
#[0] "http://www.yatishmehta.in"
|
25
|
-
#]
|
26
|
-
attr_accessor :websites
|
27
|
-
#array of hashes containing group name and link
|
28
|
-
# [
|
29
|
-
# [ 0] {
|
30
|
-
# :name => "Business on Rails",
|
31
|
-
# :link => "http://www.linkedin.com/groups/Business-on-Rails-27822"
|
32
|
-
# },
|
33
|
-
# [ 1] {
|
34
|
-
# :name => "HTML5 Technologies",
|
35
|
-
# :link => "http://www.linkedin.com/groups/HTML5-Technologies-2868882"
|
36
|
-
# },
|
37
|
-
# [ 2] {
|
38
|
-
# :name => "India on Rails",
|
39
|
-
# :link => "http://www.linkedin.com/groups/India-on-Rails-149940"
|
40
|
-
# :name => "Open Source",
|
41
|
-
# :link => "http://www.linkedin.com/groups?gid=43875"
|
42
|
-
# },
|
43
|
-
# [ 4] {
|
44
|
-
# :name => "Rails Developers",
|
45
|
-
# :link => "http://www.linkedin.com/groups?gid=77764"
|
46
|
-
# },
|
47
|
-
# ]
|
48
|
-
attr_accessor:groups
|
5
|
+
USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"]
|
49
6
|
|
7
|
+
attr_accessor :first_name,:last_name,:title,:location,:country, :industry,:picture,:linkedin_url,:recommended_visitors,:page
|
8
|
+
|
9
|
+
attr_accessor :education
|
50
10
|
|
51
|
-
|
52
|
-
#Example
|
53
|
-
# [
|
54
|
-
# [0] {
|
55
|
-
# :past_company => "Consumyze Software",
|
56
|
-
# :past_title => "Trainee",
|
57
|
-
# :past_company_website => "http://www.consumyze.com",
|
58
|
-
# :description => "Responsible for design and development"
|
59
|
-
# },
|
60
|
-
# [1] {
|
61
|
-
# :past_company => "SunGard Global Services",
|
62
|
-
# :past_title => "Project Intern",
|
63
|
-
# :past_company_website => "http://www.sungard.com/globalservices/learnmore",
|
64
|
-
# :description => "Fame PassPoint. Developed an entirely Ajax based online control panel for user management and Data access for Fame"
|
65
|
-
# }
|
66
|
-
# ]
|
11
|
+
attr_accessor :websites
|
67
12
|
|
13
|
+
attr_accessor:groups
|
68
14
|
|
69
15
|
attr_accessor :past_companies
|
70
|
-
|
71
|
-
#Example
|
72
|
-
# [
|
73
|
-
# [0] {
|
74
|
-
# :current_title => "Intern",
|
75
|
-
# :current_company => "Sungard"
|
76
|
-
# :current_company_url=>"http://www.betterlabs.net",
|
77
|
-
# :description=>"Responsible for design and development of projects on Ruby on Rails."
|
78
|
-
# },
|
79
|
-
# [1] {
|
80
|
-
# :current_title => "Software Developer",
|
81
|
-
# :current_company => "Microsoft"
|
82
|
-
# :current_company_url =>"http://www.microsoft.net",
|
83
|
-
# :description =>"Development and design"
|
84
|
-
|
85
|
-
# }
|
86
|
-
# ]
|
16
|
+
|
87
17
|
attr_accessor :current_companies
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
def initialize(page,url)
|
92
|
-
@first_name=get_first_name(page)
|
93
|
-
@last_name=get_last_name(page)
|
94
|
-
@title=get_title(page)
|
95
|
-
@location=get_location(page)
|
96
|
-
@country=get_country(page)
|
97
|
-
@industry=get_industry(page)
|
98
|
-
@
|
99
|
-
@
|
100
|
-
@
|
101
|
-
@
|
102
|
-
@
|
103
|
-
@
|
104
|
-
@
|
105
|
-
@
|
18
|
+
|
19
|
+
attr_accessor :skills
|
20
|
+
|
21
|
+
def initialize(page,url)
|
22
|
+
@first_name = get_first_name(page)
|
23
|
+
@last_name = get_last_name(page)
|
24
|
+
@title = get_title(page)
|
25
|
+
@location = get_location(page)
|
26
|
+
@country = get_country(page)
|
27
|
+
@industry = get_industry(page)
|
28
|
+
@picture = get_picture(page)
|
29
|
+
@current_companies = get_current_companies(page)
|
30
|
+
@past_companies = get_past_companies(page)
|
31
|
+
@recommended_visitors = get_recommended_visitors(page)
|
32
|
+
@education = get_education(page)
|
33
|
+
@linkedin_url = url
|
34
|
+
@websites = get_websites(page)
|
35
|
+
@groups = get_groups(page)
|
36
|
+
@skills = get_skills(page)
|
37
|
+
@page = page
|
106
38
|
end
|
107
39
|
#returns:nil if it gives a 404 request
|
108
40
|
|
109
|
-
def self.get_profile
|
41
|
+
def self.get_profile(url)
|
110
42
|
begin
|
111
|
-
@agent=Mechanize.new
|
43
|
+
@agent = Mechanize.new
|
112
44
|
@agent.user_agent_alias = USER_AGENTS.sample
|
113
45
|
@agent.max_history = 0
|
114
|
-
page
|
46
|
+
page = @agent.get(url)
|
115
47
|
return Linkedin::Profile.new(page, url)
|
116
48
|
rescue=>e
|
117
49
|
puts e
|
118
50
|
end
|
119
51
|
end
|
120
52
|
|
121
|
-
def
|
53
|
+
def get_skills(page)
|
54
|
+
page.search('.competency.show-bean').map{|skill|skill.text.strip if skill.text}
|
55
|
+
end
|
56
|
+
|
57
|
+
def get_company_url(node)
|
58
|
+
result={}
|
122
59
|
if node.at("h4/strong/a")
|
123
|
-
link=node.at("h4/strong/a")["href"]
|
124
|
-
@agent=Mechanize.new
|
60
|
+
link = node.at("h4/strong/a")["href"]
|
61
|
+
@agent = Mechanize.new
|
125
62
|
@agent.user_agent_alias = USER_AGENTS.sample
|
126
63
|
@agent.max_history = 0
|
127
|
-
page
|
128
|
-
|
64
|
+
page = @agent.get("http://www.linkedin.com"+link)
|
65
|
+
result[:linkedin_company_url] = "http://www.linkedin.com"+link
|
66
|
+
result[:url] = page.at(".basic-info/div/dl/dd/a").text if page.at(".basic-info/div/dl/dd/a")
|
67
|
+
node_2 = page.at(".basic-info").at(".content.inner-mod")
|
68
|
+
node_2.search("dd").zip(node_2.search("dt")).each do |value,title|
|
69
|
+
result[title.text.gsub(" ","_").downcase.to_sym] = value.text.strip
|
70
|
+
end
|
71
|
+
result[:address] = page.at(".vcard.hq").at(".adr").text.gsub("\n"," ").strip if page.at(".vcard.hq")
|
129
72
|
end
|
73
|
+
result
|
130
74
|
end
|
131
75
|
|
132
76
|
private
|
@@ -155,89 +99,92 @@ module Linkedin
|
|
155
99
|
return page.at(".industry").text.gsub(/\s+/, " ").strip if page.search(".industry").first
|
156
100
|
end
|
157
101
|
|
102
|
+
def get_picture page
|
103
|
+
return page.at("#profile-picture/img.photo").attributes['src'].value.strip if page.search("#profile-picture/img.photo").first
|
104
|
+
end
|
105
|
+
|
158
106
|
def get_past_companies page
|
159
107
|
past_cs=[]
|
160
108
|
if page.search(".position.experience.vevent.vcard.summary-past").first
|
161
109
|
page.search(".position.experience.vevent.vcard.summary-past").each do |past_company|
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
110
|
+
result = get_company_url past_company
|
111
|
+
url = result[:url]
|
112
|
+
title = past_company.at("h3").text.gsub(/\s+|\n/, " ").strip if past_company.at("h3")
|
113
|
+
company = past_company.at("h4").text.gsub(/\s+|\n/, " ").strip if past_company.at("h4")
|
114
|
+
description = past_company.at(".description.past-position").text.gsub(/\s+|\n/, " ").strip if past_company.at(".description.past-position")
|
115
|
+
p_company = {:past_company=>company,:past_title=> title,:past_company_website=>url,:description=>description}
|
116
|
+
p_company = p_company.merge(result)
|
117
|
+
past_cs << p_company
|
168
118
|
end
|
169
119
|
return past_cs
|
170
120
|
end
|
171
121
|
end
|
172
122
|
|
173
123
|
def get_current_companies page
|
174
|
-
current_cs=[]
|
124
|
+
current_cs = []
|
175
125
|
if page.search(".position.experience.vevent.vcard.summary-current").first
|
176
126
|
page.search(".position.experience.vevent.vcard.summary-current").each do |current_company|
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
127
|
+
result = get_company_url current_company
|
128
|
+
url = result[:url]
|
129
|
+
title = current_company.at("h3").text.gsub(/\s+|\n/, " ").strip if current_company.at("h3")
|
130
|
+
company = current_company.at("h4").text.gsub(/\s+|\n/, " ").strip if current_company.at("h4")
|
131
|
+
description = current_company.at(".description.current-position").text.gsub(/\s+|\n/, " ").strip if current_company.at(".description.current-position")
|
132
|
+
current_company = {:current_company=>company,:current_title=> title,:current_company_url=>url,:description=>description}
|
133
|
+
current_cs << current_company.merge(result)
|
183
134
|
end
|
184
135
|
return current_cs
|
185
136
|
end
|
186
137
|
end
|
187
138
|
|
188
|
-
def get_education
|
139
|
+
def get_education(page)
|
189
140
|
education=[]
|
190
141
|
if page.search(".position.education.vevent.vcard").first
|
191
142
|
page.search(".position.education.vevent.vcard").each do |item|
|
192
|
-
name=item.at("h3").text.gsub(/\s+|\n/, " ").strip if item.at("h3")
|
193
|
-
desc=item.at("h4").text.gsub(/\s+|\n/, " ").strip if item.at("h4")
|
194
|
-
period=item.at(".period").text.gsub(/\s+|\n/, " ").strip if item.at(".period")
|
195
|
-
edu={:name=>name,:description=>desc,:period=>period}
|
196
|
-
education<<edu
|
143
|
+
name = item.at("h3").text.gsub(/\s+|\n/, " ").strip if item.at("h3")
|
144
|
+
desc = item.at("h4").text.gsub(/\s+|\n/, " ").strip if item.at("h4")
|
145
|
+
period = item.at(".period").text.gsub(/\s+|\n/, " ").strip if item.at(".period")
|
146
|
+
edu = {:name => name,:description => desc,:period => period}
|
147
|
+
education << edu
|
197
148
|
end
|
198
149
|
return education
|
199
150
|
end
|
200
151
|
end
|
201
152
|
|
202
|
-
def get_websites
|
153
|
+
def get_websites(page)
|
203
154
|
websites=[]
|
204
155
|
if page.search(".website").first
|
205
156
|
page.search(".website").each do |site|
|
206
|
-
url=site.at("a")["href"]
|
207
|
-
url="http://www.linkedin.com"+url
|
208
|
-
url=CGI.parse(URI.parse(url).query)["url"]
|
209
|
-
websites<<url
|
157
|
+
url = site.at("a")["href"]
|
158
|
+
url = "http://www.linkedin.com"+url
|
159
|
+
url = CGI.parse(URI.parse(url).query)["url"]
|
160
|
+
websites << url
|
210
161
|
end
|
211
162
|
return websites.flatten!
|
212
|
-
end
|
163
|
+
end
|
213
164
|
end
|
214
165
|
|
215
|
-
def get_groups
|
216
|
-
groups=[]
|
166
|
+
def get_groups(page)
|
167
|
+
groups = []
|
217
168
|
if page.search(".group-data").first
|
218
169
|
page.search(".group-data").each do |item|
|
219
|
-
name=item.text.gsub(/\s+|\n/, " ").strip
|
220
|
-
link="http://www.linkedin.com"+item.at("a")["href"]
|
221
|
-
groups<<{:name=>name,:link=>link}
|
170
|
+
name = item.text.gsub(/\s+|\n/, " ").strip
|
171
|
+
link = "http://www.linkedin.com"+item.at("a")["href"]
|
172
|
+
groups << {:name=>name,:link=>link}
|
222
173
|
end
|
223
174
|
return groups
|
224
175
|
end
|
225
|
-
|
226
176
|
end
|
227
177
|
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
def get_recommended_visitors page
|
178
|
+
def get_recommended_visitors(page)
|
232
179
|
recommended_vs=[]
|
233
180
|
if page.search(".browsemap").first
|
234
181
|
page.at(".browsemap").at("ul").search("li").each do |visitor|
|
235
|
-
v={}
|
236
|
-
v[:link]=visitor.at('a')
|
237
|
-
v[:name]=visitor.at('a').text
|
238
|
-
v[:title]=visitor.at('.headline').text.split(" at ").first
|
239
|
-
v[:company]=visitor.at('.headline').text.split(" at ")
|
240
|
-
recommended_vs<<v
|
182
|
+
v = {}
|
183
|
+
v[:link] = visitor.at('a')["href"]
|
184
|
+
v[:name] = visitor.at('strong/a').text
|
185
|
+
v[:title] = visitor.at('.headline').text.gsub("..."," ").split(" at ").first
|
186
|
+
v[:company] = visitor.at('.headline').text.gsub("..."," ").split(" at ")[1]
|
187
|
+
recommended_vs << v
|
241
188
|
end
|
242
189
|
return recommended_vs
|
243
190
|
end
|
data/linkedin-scraper.gemspec
CHANGED
@@ -3,13 +3,10 @@ require File.expand_path('../lib/linkedin-scraper/version', __FILE__)
|
|
3
3
|
|
4
4
|
Gem::Specification.new do |gem|
|
5
5
|
gem.authors = ["Yatish Mehta"]
|
6
|
-
gem.email = ["yatishmehta27@gmail.com"]
|
7
6
|
gem.description = %q{Scrapes the linkedin profile when a url is given }
|
8
7
|
gem.summary = %q{when a url of public linkedin profile page is given it scrapes the entire page and converts into a accessible object}
|
9
8
|
gem.homepage = "https://github.com/yatishmehta27/linkedin-scraper"
|
10
|
-
|
11
|
-
gem.add_dependency(%q<mechanize>, [">= 0"])
|
12
|
-
|
9
|
+
gem.add_dependency(%q<mechanize>, [">= 0"])
|
13
10
|
gem.files = `git ls-files`.split($\)
|
14
11
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
15
12
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
require 'linkedin-scraper'
|
3
|
+
|
4
|
+
|
5
|
+
describe Linkedin::Profile do
|
6
|
+
describe "::get_profile" do
|
7
|
+
it "Create an instance of profile class and populate it will all details" do
|
8
|
+
@profile = Linkedin::Profile.get_profile("http://www.linkedin.com/in/jeffweiner08")
|
9
|
+
@profile.first_name.should == "Jeff"
|
10
|
+
#other parameters may change with time
|
11
|
+
end
|
12
|
+
end
|
13
|
+
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
$: << File.join(File.dirname(__FILE__), '../lib')
|
2
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
3
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
4
|
+
# Require this file using `require "spec_helper"` to ensure that it is only
|
5
|
+
# loaded once.
|
6
|
+
#
|
7
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
8
|
+
RSpec.configure do |config|
|
9
|
+
config.treat_symbols_as_metadata_keys_with_true_values = true
|
10
|
+
config.run_all_when_everything_filtered = true
|
11
|
+
config.filter_run :focus
|
12
|
+
|
13
|
+
# Run specs in random order to surface order dependencies. If you find an
|
14
|
+
# order dependency and want to debug it, you can fix the order by providing
|
15
|
+
# the seed, which is printed after each run.
|
16
|
+
# --seed 1234
|
17
|
+
config.order = 'random'
|
18
|
+
end
|
metadata
CHANGED
@@ -1,35 +1,31 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedin-scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
5
|
-
prerelease:
|
4
|
+
version: 0.0.8
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Yatish Mehta
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date:
|
11
|
+
date: 2013-03-12 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: mechanize
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
|
-
- -
|
17
|
+
- - '>='
|
20
18
|
- !ruby/object:Gem::Version
|
21
19
|
version: '0'
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
|
-
- -
|
24
|
+
- - '>='
|
28
25
|
- !ruby/object:Gem::Version
|
29
26
|
version: '0'
|
30
|
-
description:
|
31
|
-
email:
|
32
|
-
- yatishmehta27@gmail.com
|
27
|
+
description: 'Scrapes the linkedin profile when a url is given '
|
28
|
+
email:
|
33
29
|
executables: []
|
34
30
|
extensions: []
|
35
31
|
extra_rdoc_files: []
|
@@ -37,35 +33,38 @@ files:
|
|
37
33
|
- .gitignore
|
38
34
|
- Gemfile
|
39
35
|
- LICENSE
|
40
|
-
- README.
|
36
|
+
- README.md
|
41
37
|
- Rakefile
|
42
38
|
- lib/linkedin-scraper.rb
|
43
39
|
- lib/linkedin-scraper/profile.rb
|
44
40
|
- lib/linkedin-scraper/version.rb
|
45
41
|
- linkedin-scraper.gemspec
|
42
|
+
- spec/linkedin-scraper/profile_spec.rb
|
43
|
+
- spec/spec_helper.rb
|
46
44
|
homepage: https://github.com/yatishmehta27/linkedin-scraper
|
47
45
|
licenses: []
|
46
|
+
metadata: {}
|
48
47
|
post_install_message:
|
49
48
|
rdoc_options: []
|
50
49
|
require_paths:
|
51
50
|
- lib
|
52
51
|
required_ruby_version: !ruby/object:Gem::Requirement
|
53
|
-
none: false
|
54
52
|
requirements:
|
55
|
-
- -
|
53
|
+
- - '>='
|
56
54
|
- !ruby/object:Gem::Version
|
57
55
|
version: '0'
|
58
56
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
59
|
-
none: false
|
60
57
|
requirements:
|
61
|
-
- -
|
58
|
+
- - '>='
|
62
59
|
- !ruby/object:Gem::Version
|
63
60
|
version: '0'
|
64
61
|
requirements: []
|
65
62
|
rubyforge_project:
|
66
|
-
rubygems_version:
|
63
|
+
rubygems_version: 2.0.0
|
67
64
|
signing_key:
|
68
|
-
specification_version:
|
65
|
+
specification_version: 4
|
69
66
|
summary: when a url of public linkedin profile page is given it scrapes the entire
|
70
67
|
page and converts into a accessible object
|
71
|
-
test_files:
|
68
|
+
test_files:
|
69
|
+
- spec/linkedin-scraper/profile_spec.rb
|
70
|
+
- spec/spec_helper.rb
|
data/README.rdoc
DELETED
@@ -1,134 +0,0 @@
|
|
1
|
-
= Linkedin-Scraper {<img src="http://travis-ci.org/jaimeiniesta/metainspector.png" />}[http://travis-ci.org/jaimeiniesta/metainspector]
|
2
|
-
|
3
|
-
Linkedin-scraper is a gem for scraping linkedin public profiles. You give it an URL, and it lets you easily get its title,name,country,area,current_companies .
|
4
|
-
|
5
|
-
= Installation
|
6
|
-
|
7
|
-
Install the gem from RubyGems:
|
8
|
-
|
9
|
-
gem install linkedin-scraper
|
10
|
-
|
11
|
-
This gem is tested on Ruby versions 1.8.7, 1.9.2 and 1.9.3.
|
12
|
-
|
13
|
-
= Usage
|
14
|
-
|
15
|
-
Initialize a scraper instance for an URL, like this:
|
16
|
-
|
17
|
-
profile = Linkedin::Profile.get_profile('http://in.linkedin.com/pub/yatish-mehta/22/460/a86')
|
18
|
-
|
19
|
-
Then you can see the scraped data like this:
|
20
|
-
|
21
|
-
|
22
|
-
profile.first_name #the First name of the contact
|
23
|
-
|
24
|
-
profile.last_name #the last name of the contact
|
25
|
-
|
26
|
-
profile.title #the linkedin job title
|
27
|
-
|
28
|
-
profile.location #the location of the contact
|
29
|
-
|
30
|
-
profile.country #the country of the contact
|
31
|
-
|
32
|
-
profile.industry #the domain for which the contact belongs
|
33
|
-
|
34
|
-
profile.past_companies
|
35
|
-
#Array of hash containing its past job companies and job profile
|
36
|
-
#Example
|
37
|
-
# [
|
38
|
-
# [0] {
|
39
|
-
# :past_company => "Consumyze Software",
|
40
|
-
# :past_title => "Trainee",
|
41
|
-
# :past_company_website => "http://www.consumyze.com",
|
42
|
-
# :description => "Responsible for design and development"
|
43
|
-
# },
|
44
|
-
# [1] {
|
45
|
-
# :past_company => "SunGard Global Services",
|
46
|
-
# :past_title => "Project Intern",
|
47
|
-
# :past_company_website => "http://www.sungard.com/globalservices/learnmore",
|
48
|
-
# :description => "Fame PassPoint. Developed an entirely Ajax based online control panel for user management and Data access for Fame"
|
49
|
-
# }
|
50
|
-
# ]
|
51
|
-
profile.current_companies
|
52
|
-
#Array of hash containing its current job companies and job profile
|
53
|
-
#Example
|
54
|
-
# [
|
55
|
-
# [0] {
|
56
|
-
# :current_title => "Intern",
|
57
|
-
# :current_company => "Sungard"
|
58
|
-
# :current_company_url=>"http://www.betterlabs.net",
|
59
|
-
# :description=>"Responsible for design and development of projects on Ruby on Rails."
|
60
|
-
# },
|
61
|
-
# [1] {
|
62
|
-
# :current_title => "Software Developer",
|
63
|
-
# :current_company => "Microsoft"
|
64
|
-
# :current_company_url =>"http://www.microsoft.net",
|
65
|
-
# :description =>"Development and design"
|
66
|
-
|
67
|
-
# }
|
68
|
-
# ]
|
69
|
-
|
70
|
-
|
71
|
-
profile.linkedin_url #url of the profile
|
72
|
-
|
73
|
-
profile.websites
|
74
|
-
#Array of websites
|
75
|
-
#[
|
76
|
-
# [0] "http://www.yatishmehta.in"
|
77
|
-
#]
|
78
|
-
|
79
|
-
profile.groups
|
80
|
-
#array of hashes containing group name and link
|
81
|
-
# [
|
82
|
-
# [ 0] {
|
83
|
-
# :name => "Business on Rails",
|
84
|
-
# :link => "http://www.linkedin.com/groups/Business-on-Rails-27822"
|
85
|
-
# },
|
86
|
-
# [ 1] {
|
87
|
-
# :name => "HTML5 Technologies",
|
88
|
-
# :link => "http://www.linkedin.com/groups/HTML5-Technologies-2868882"
|
89
|
-
# },
|
90
|
-
# [ 2] {
|
91
|
-
# :name => "India on Rails",
|
92
|
-
# :link => "http://www.linkedin.com/groups/India-on-Rails-149940"
|
93
|
-
# :name => "Open Source",
|
94
|
-
# :link => "http://www.linkedin.com/groups?gid=43875"
|
95
|
-
# },
|
96
|
-
# [ 4] {
|
97
|
-
# :name => "Rails Developers",
|
98
|
-
# :link => "http://www.linkedin.com/groups?gid=77764"
|
99
|
-
# },
|
100
|
-
# ]
|
101
|
-
|
102
|
-
profile.education
|
103
|
-
#Array of hashes for eduction
|
104
|
-
# [
|
105
|
-
# [0] {
|
106
|
-
# :name => "Vishwakarma Institute of Technology",
|
107
|
-
# :description => "B.Tech, Computer Engineering",
|
108
|
-
# :period => "2007 – 2011"
|
109
|
-
# },
|
110
|
-
# [1] {
|
111
|
-
# :name => "St Ursula's High School",
|
112
|
-
# :description => "Secondary School Education",
|
113
|
-
# :period => nil
|
114
|
-
# }
|
115
|
-
# ]
|
116
|
-
|
117
|
-
profile.recommended_visitors
|
118
|
-
#Its the list of visitors "Viewers of this profile also viewed..."
|
119
|
-
#attr_accessor :recommended_visitors = [
|
120
|
-
# [0] {
|
121
|
-
# :link => href="http://in.linkedin.com/in/nileshavhad?trk=pub-pbmap",
|
122
|
-
# :name => "Nilesh Avhad",
|
123
|
-
# :title => "Engineering Manager",
|
124
|
-
# :company => "Better Labs"
|
125
|
-
# },
|
126
|
-
|
127
|
-
|
128
|
-
= ZOMG Fork! Thank you!
|
129
|
-
|
130
|
-
You're welcome to fork this project and send pull requests. I want to thank specially:
|
131
|
-
|
132
|
-
= To Do
|
133
|
-
*
|
134
|
-
Copyright (c) 2009-2012 Yatish Mehta, released under the MIT license
|