linkedin-scraper 0.0.7 → 0.0.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +256 -0
- data/lib/linkedin-scraper/profile.rb +93 -146
- data/lib/linkedin-scraper/version.rb +1 -1
- data/linkedin-scraper.gemspec +1 -4
- data/spec/linkedin-scraper/profile_spec.rb +13 -0
- data/spec/spec_helper.rb +18 -0
- metadata +17 -18
- data/README.rdoc +0 -134
checksums.yaml
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
---
|
|
2
|
+
SHA1:
|
|
3
|
+
metadata.gz: 4821d7e30aa48eac2fe54f5feaebe664f8c7e65f
|
|
4
|
+
data.tar.gz: 55f77b9d3dc2cee5a4e0f27b50116b058b585a72
|
|
5
|
+
SHA512:
|
|
6
|
+
metadata.gz: 83aa94cbaa5de9e2711d2cbb8672c9a81986ac48eb379fdcf99d9a64e3791bd5a945d76ea5266daa9781886de5feb5b1c3a0905107e3ea71894937e455e2aa7f
|
|
7
|
+
data.tar.gz: 6dbcdab792fb1551d9e2fc79909e94ed5f4fa2da5ece3ef8e0b37307f1bbcf69b1d1c9e8d773fb9ece146f5ff64eba7b04716b71a735ecf998740ecd34865a41
|
data/README.md
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
Linkedin Scraper
|
|
2
|
+
================
|
|
3
|
+
|
|
4
|
+
Linkedin-scraper is a gem for scraping linkedin public profiles.
|
|
5
|
+
You give it a URL, and it lets you easily get the profile's title, name, country, area, current_companies and much more.
|
|
6
|
+
|
|
7
|
+
Installation
|
|
8
|
+
------------
|
|
9
|
+
|
|
10
|
+
Install the gem from RubyGems:
|
|
11
|
+
|
|
12
|
+
gem install linkedin-scraper
|
|
13
|
+
|
|
14
|
+
This gem is tested on Ruby versions 1.8.7, 1.9.2, 1.9.3 and 2.0.0.
|
|
15
|
+
|
|
16
|
+
Usage
|
|
17
|
+
-----
|
|
18
|
+
|
|
19
|
+
Initialize a scraper instance for a URL, like this:
|
|
20
|
+
|
|
21
|
+
profile = Linkedin::Profile.get_profile("http://www.linkedin.com/in/jeffweiner08")
|
|
22
|
+
|
|
23
|
+
Then you can see the scraped data like this:
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
profile.first_name #the First name of the contact
|
|
27
|
+
|
|
28
|
+
profile.last_name #the last name of the contact
|
|
29
|
+
|
|
30
|
+
profile.title #the linkedin job title
|
|
31
|
+
|
|
32
|
+
profile.location #the location of the contact
|
|
33
|
+
|
|
34
|
+
profile.country #the country of the contact
|
|
35
|
+
|
|
36
|
+
profile.industry #the industry to which the contact belongs
|
|
37
|
+
|
|
38
|
+
profile.picture #the profile pic url of contact
|
|
39
|
+
|
|
40
|
+
profile.current_companies
|
|
41
|
+
|
|
42
|
+
[
|
|
43
|
+
[0] {
|
|
44
|
+
:current_company => "LinkedIn",
|
|
45
|
+
:current_title => "CEO",
|
|
46
|
+
:current_company_url => "http://www.linkedin.com",
|
|
47
|
+
:description => nil,
|
|
48
|
+
:linkedin_company_url => "http://www.linkedin.com/company/linkedin?trk=ppro_cprof",
|
|
49
|
+
:url => "http://www.linkedin.com",
|
|
50
|
+
:type => "Public Company",
|
|
51
|
+
:company_size => "1001-5000 employees",
|
|
52
|
+
:website => "http://www.linkedin.com",
|
|
53
|
+
:industry => "Internet",
|
|
54
|
+
:founded => "2003",
|
|
55
|
+
:address => "2029 Stierlin Court Mountain View, CA 94043 United States"
|
|
56
|
+
},
|
|
57
|
+
[1] {
|
|
58
|
+
:current_company => "Intuit",
|
|
59
|
+
:current_title => "Member, Board of Directors",
|
|
60
|
+
:current_company_url => "http://network.intuit.com/",
|
|
61
|
+
:description => nil,
|
|
62
|
+
:linkedin_company_url => "http://www.linkedin.com/company/intuit?trk=ppro_cprof",
|
|
63
|
+
:url => "http://network.intuit.com/",
|
|
64
|
+
:type => "Public Company",
|
|
65
|
+
:company_size => "5001-10,000 employees",
|
|
66
|
+
:website => "http://network.intuit.com/",
|
|
67
|
+
:industry => "Computer Software",
|
|
68
|
+
:founded => "1983",
|
|
69
|
+
:address => "2632 Marine Way Mountain View, CA 94043 United States"
|
|
70
|
+
},
|
|
71
|
+
[2] {
|
|
72
|
+
:current_company => "DonorsChoose",
|
|
73
|
+
:current_title => "Member, Board of Directors",
|
|
74
|
+
:current_company_url => "http://www.donorschoose.org",
|
|
75
|
+
:description => nil,
|
|
76
|
+
:linkedin_company_url => "http://www.linkedin.com/company/donorschoose.org?trk=ppro_cprof",
|
|
77
|
+
:url => "http://www.donorschoose.org",
|
|
78
|
+
:type => "Nonprofit",
|
|
79
|
+
:company_size => "51-200 employees",
|
|
80
|
+
:website => "http://www.donorschoose.org",
|
|
81
|
+
:industry => "Nonprofit Organization Management",
|
|
82
|
+
:founded => "2000",
|
|
83
|
+
:address => "213 West 35th Street 2nd Floor East New York, NY 10001 United States"
|
|
84
|
+
},
|
|
85
|
+
[3] {
|
|
86
|
+
:current_company => "Malaria No More",
|
|
87
|
+
:current_title => "Member, Board of Directors",
|
|
88
|
+
:current_company_url => nil,
|
|
89
|
+
:description => nil
|
|
90
|
+
},
|
|
91
|
+
[4] {
|
|
92
|
+
:current_company => "Venture For America",
|
|
93
|
+
:current_title => "Member, Advisory Board",
|
|
94
|
+
:current_company_url => "http://ventureforamerica.org/",
|
|
95
|
+
:description => nil,
|
|
96
|
+
:linkedin_company_url => "http://www.linkedin.com/company/venture-for-america?trk=ppro_cprof",
|
|
97
|
+
:url => "http://ventureforamerica.org/",
|
|
98
|
+
:type => "Nonprofit",
|
|
99
|
+
:company_size => "1-10 employees",
|
|
100
|
+
:website => "http://ventureforamerica.org/",
|
|
101
|
+
:industry => "Nonprofit Organization Management",
|
|
102
|
+
:founded => "2011"
|
|
103
|
+
}
|
|
104
|
+
]
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
profile.past_companies
|
|
108
|
+
#Array of hashes containing the contact's past companies and job titles
|
|
109
|
+
#Example
|
|
110
|
+
[
|
|
111
|
+
[0] {
|
|
112
|
+
:past_company => "Accel Partners",
|
|
113
|
+
:past_title => "Executive in Residence",
|
|
114
|
+
:past_company_website => "http://www.facebook.com/accel",
|
|
115
|
+
:description => nil,
|
|
116
|
+
:linkedin_company_url => "http://www.linkedin.com/company/accel-partners?trk=ppro_cprof",
|
|
117
|
+
:url => "http://www.facebook.com/accel",
|
|
118
|
+
:type => "Partnership",
|
|
119
|
+
:company_size => "51-200 employees",
|
|
120
|
+
:website => "http://www.facebook.com/accel",
|
|
121
|
+
:industry => "Venture Capital & Private Equity",
|
|
122
|
+
:address => "428 University Palo Alto, CA 94301 United States"
|
|
123
|
+
},
|
|
124
|
+
[1] {
|
|
125
|
+
:past_company => "Greylock",
|
|
126
|
+
:past_title => "Executive in Residence",
|
|
127
|
+
:past_company_website => "http://www.greylock.com",
|
|
128
|
+
:description => nil,
|
|
129
|
+
:linkedin_company_url => "http://www.linkedin.com/company/greylock-partners?trk=ppro_cprof",
|
|
130
|
+
:url => "http://www.greylock.com",
|
|
131
|
+
:type => "Partnership",
|
|
132
|
+
:company_size => "51-200 employees",
|
|
133
|
+
:website => "http://www.greylock.com",
|
|
134
|
+
:industry => "Venture Capital & Private Equity",
|
|
135
|
+
:address => "2550 Sand Hill Road Menlo Park, CA 94025 United States"
|
|
136
|
+
},
|
|
137
|
+
[2] {
|
|
138
|
+
:past_company => "Yahoo!",
|
|
139
|
+
:past_title => "Executive Vice President Network Division",
|
|
140
|
+
:past_company_website => "http://www.yahoo.com",
|
|
141
|
+
:description => nil,
|
|
142
|
+
:linkedin_company_url => "http://www.linkedin.com/company/yahoo?trk=ppro_cprof",
|
|
143
|
+
:url => "http://www.yahoo.com",
|
|
144
|
+
:type => "Public Company",
|
|
145
|
+
:company_size => "10,001+ employees",
|
|
146
|
+
:website => "http://www.yahoo.com",
|
|
147
|
+
:industry => "Internet",
|
|
148
|
+
:founded => "1994",
|
|
149
|
+
:address => "701 First Avenue Sunnyvale, CA 94089 United States"
|
|
150
|
+
},
|
|
151
|
+
[3] {
|
|
152
|
+
:past_company => "Windsor Media",
|
|
153
|
+
:past_title => "Founding Partner",
|
|
154
|
+
:past_company_website => nil,
|
|
155
|
+
:description => nil
|
|
156
|
+
},
|
|
157
|
+
[4] {
|
|
158
|
+
:past_company => "Warner Bros.",
|
|
159
|
+
:past_title => "Vice President Online",
|
|
160
|
+
:past_company_website => "http://www.warnerbros.com/",
|
|
161
|
+
:description => nil,
|
|
162
|
+
:linkedin_company_url => "http://www.linkedin.com/company/warner-bros.-entertainment-group-of-companies?trk=ppro_cprof",
|
|
163
|
+
:url => "http://www.warnerbros.com/",
|
|
164
|
+
:type => "Public Company",
|
|
165
|
+
:company_size => "10,001+ employees",
|
|
166
|
+
:website => "http://www.warnerbros.com/",
|
|
167
|
+
:industry => "Entertainment",
|
|
168
|
+
:address => "4000 Warner Boulevard Burbank, CA 91522 United States"
|
|
169
|
+
}
|
|
170
|
+
]
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
profile.linkedin_url #url of the profile
|
|
174
|
+
|
|
175
|
+
profile.websites
|
|
176
|
+
#Array of websites
|
|
177
|
+
[
|
|
178
|
+
[0] "http://www.linkedin.com/"
|
|
179
|
+
]
|
|
180
|
+
|
|
181
|
+
profile.groups
|
|
182
|
+
#Array of hashes containing group name and link
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
profile.education
|
|
186
|
+
#Array of hashes for education
|
|
187
|
+
|
|
188
|
+
profile.skills
|
|
189
|
+
#Array of skills
|
|
190
|
+
|
|
191
|
+
profile.picture
|
|
192
|
+
#url of the profile picture
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
profile.recommended_visitors
|
|
196
|
+
#The list of profiles shown under "Viewers of this profile also viewed..."
|
|
197
|
+
[
|
|
198
|
+
[0] {
|
|
199
|
+
:link => "http://www.linkedin.com/in/barackobama?trk=pub-pbmap",
|
|
200
|
+
:name => "Barack Obama",
|
|
201
|
+
:title => "President of the United States of ",
|
|
202
|
+
:company => nil
|
|
203
|
+
},
|
|
204
|
+
[1] {
|
|
205
|
+
:link => "http://www.linkedin.com/in/marissamayer?trk=pub-pbmap",
|
|
206
|
+
:name => "Marissa Mayer",
|
|
207
|
+
:title => "Yahoo!, President & CEO",
|
|
208
|
+
:company => nil
|
|
209
|
+
},
|
|
210
|
+
[2] {
|
|
211
|
+
:link => "http://www.linkedin.com/pub/sean-parker/0/1/826?trk=pub-pbmap",
|
|
212
|
+
:name => "Sean Parker",
|
|
213
|
+
:title => nil,
|
|
214
|
+
:company => nil
|
|
215
|
+
},
|
|
216
|
+
[3] {
|
|
217
|
+
:link => "http://www.linkedin.com/pub/eduardo-saverin/0/70a/31b?trk=pub-pbmap",
|
|
218
|
+
:name => "Eduardo Saverin",
|
|
219
|
+
:title => nil,
|
|
220
|
+
:company => nil
|
|
221
|
+
},
|
|
222
|
+
[4] {
|
|
223
|
+
:link => "http://www.linkedin.com/in/rbranson?trk=pub-pbmap",
|
|
224
|
+
:name => "Richard Branson",
|
|
225
|
+
:title => "Founder",
|
|
226
|
+
:company => "Virgin Group"
|
|
227
|
+
},
|
|
228
|
+
[5] {
|
|
229
|
+
:link => "http://www.linkedin.com/in/reidhoffman?trk=pub-pbmap",
|
|
230
|
+
:name => "Reid Hoffman",
|
|
231
|
+
:title => "Entrepreneur. Product Strategist. ",
|
|
232
|
+
:company => nil
|
|
233
|
+
},
|
|
234
|
+
[6] {
|
|
235
|
+
:link => "http://www.linkedin.com/in/mdell?trk=pub-pbmap",
|
|
236
|
+
:name => "Michael Dell",
|
|
237
|
+
:title => "Chairman and CEO",
|
|
238
|
+
:company => "Dell"
|
|
239
|
+
},
|
|
240
|
+
[7] {
|
|
241
|
+
:link => "http://www.linkedin.com/in/mittromney?trk=pub-pbmap",
|
|
242
|
+
:name => "Mitt Romney",
|
|
243
|
+
:title => "Believe in America",
|
|
244
|
+
:company => nil
|
|
245
|
+
},
|
|
246
|
+
[8] {
|
|
247
|
+
:link => "http://www.linkedin.com/pub/sheryl-sandberg/2/665/512?trk=pub-pbmap",
|
|
248
|
+
:name => "Sheryl Sandberg",
|
|
249
|
+
:title => nil,
|
|
250
|
+
:company => nil
|
|
251
|
+
}
|
|
252
|
+
]
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
You're welcome to fork this project and send pull requests. I want to thank specially:
|
|
@@ -1,132 +1,76 @@
|
|
|
1
|
-
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
2
|
module Linkedin
|
|
3
|
-
class Profile
|
|
4
|
-
#the First name of the contact
|
|
5
|
-
attr_accessor :first_name,:last_name,:title,:location,:country,
|
|
6
|
-
:industry, :linkedin_url,:recommended_visitors,:page
|
|
7
|
-
#Array of hashes for eduction
|
|
8
|
-
# [
|
|
9
|
-
# [0] {
|
|
10
|
-
# :name => "Vishwakarma Institute of Technology",
|
|
11
|
-
# :description => "B.Tech, Computer Engineering",
|
|
12
|
-
# :period => "2007 – 2011"
|
|
13
|
-
# },
|
|
14
|
-
# [1] {
|
|
15
|
-
# :name => "St Ursula's High School",
|
|
16
|
-
# :description => "Secondary School Education",
|
|
17
|
-
# :period => nil
|
|
18
|
-
# }
|
|
19
|
-
# ]
|
|
20
|
-
attr_accessor :education
|
|
3
|
+
class Profile
|
|
21
4
|
|
|
22
|
-
|
|
23
|
-
#[
|
|
24
|
-
#[0] "http://www.yatishmehta.in"
|
|
25
|
-
#]
|
|
26
|
-
attr_accessor :websites
|
|
27
|
-
#array of hashes containing group name and link
|
|
28
|
-
# [
|
|
29
|
-
# [ 0] {
|
|
30
|
-
# :name => "Business on Rails",
|
|
31
|
-
# :link => "http://www.linkedin.com/groups/Business-on-Rails-27822"
|
|
32
|
-
# },
|
|
33
|
-
# [ 1] {
|
|
34
|
-
# :name => "HTML5 Technologies",
|
|
35
|
-
# :link => "http://www.linkedin.com/groups/HTML5-Technologies-2868882"
|
|
36
|
-
# },
|
|
37
|
-
# [ 2] {
|
|
38
|
-
# :name => "India on Rails",
|
|
39
|
-
# :link => "http://www.linkedin.com/groups/India-on-Rails-149940"
|
|
40
|
-
# :name => "Open Source",
|
|
41
|
-
# :link => "http://www.linkedin.com/groups?gid=43875"
|
|
42
|
-
# },
|
|
43
|
-
# [ 4] {
|
|
44
|
-
# :name => "Rails Developers",
|
|
45
|
-
# :link => "http://www.linkedin.com/groups?gid=77764"
|
|
46
|
-
# },
|
|
47
|
-
# ]
|
|
48
|
-
attr_accessor:groups
|
|
5
|
+
USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"]
|
|
49
6
|
|
|
7
|
+
attr_accessor :first_name,:last_name,:title,:location,:country, :industry,:picture,:linkedin_url,:recommended_visitors,:page
|
|
8
|
+
|
|
9
|
+
attr_accessor :education
|
|
50
10
|
|
|
51
|
-
|
|
52
|
-
#Example
|
|
53
|
-
# [
|
|
54
|
-
# [0] {
|
|
55
|
-
# :past_company => "Consumyze Software",
|
|
56
|
-
# :past_title => "Trainee",
|
|
57
|
-
# :past_company_website => "http://www.consumyze.com",
|
|
58
|
-
# :description => "Responsible for design and development"
|
|
59
|
-
# },
|
|
60
|
-
# [1] {
|
|
61
|
-
# :past_company => "SunGard Global Services",
|
|
62
|
-
# :past_title => "Project Intern",
|
|
63
|
-
# :past_company_website => "http://www.sungard.com/globalservices/learnmore",
|
|
64
|
-
# :description => "Fame PassPoint. Developed an entirely Ajax based online control panel for user management and Data access for Fame"
|
|
65
|
-
# }
|
|
66
|
-
# ]
|
|
11
|
+
attr_accessor :websites
|
|
67
12
|
|
|
13
|
+
attr_accessor:groups
|
|
68
14
|
|
|
69
15
|
attr_accessor :past_companies
|
|
70
|
-
|
|
71
|
-
#Example
|
|
72
|
-
# [
|
|
73
|
-
# [0] {
|
|
74
|
-
# :current_title => "Intern",
|
|
75
|
-
# :current_company => "Sungard"
|
|
76
|
-
# :current_company_url=>"http://www.betterlabs.net",
|
|
77
|
-
# :description=>"Responsible for design and development of projects on Ruby on Rails."
|
|
78
|
-
# },
|
|
79
|
-
# [1] {
|
|
80
|
-
# :current_title => "Software Developer",
|
|
81
|
-
# :current_company => "Microsoft"
|
|
82
|
-
# :current_company_url =>"http://www.microsoft.net",
|
|
83
|
-
# :description =>"Development and design"
|
|
84
|
-
|
|
85
|
-
# }
|
|
86
|
-
# ]
|
|
16
|
+
|
|
87
17
|
attr_accessor :current_companies
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
def initialize(page,url)
|
|
92
|
-
@first_name=get_first_name(page)
|
|
93
|
-
@last_name=get_last_name(page)
|
|
94
|
-
@title=get_title(page)
|
|
95
|
-
@location=get_location(page)
|
|
96
|
-
@country=get_country(page)
|
|
97
|
-
@industry=get_industry(page)
|
|
98
|
-
@
|
|
99
|
-
@
|
|
100
|
-
@
|
|
101
|
-
@
|
|
102
|
-
@
|
|
103
|
-
@
|
|
104
|
-
@
|
|
105
|
-
@
|
|
18
|
+
|
|
19
|
+
attr_accessor :skills
|
|
20
|
+
|
|
21
|
+
def initialize(page,url)
|
|
22
|
+
@first_name = get_first_name(page)
|
|
23
|
+
@last_name = get_last_name(page)
|
|
24
|
+
@title = get_title(page)
|
|
25
|
+
@location = get_location(page)
|
|
26
|
+
@country = get_country(page)
|
|
27
|
+
@industry = get_industry(page)
|
|
28
|
+
@picture = get_picture(page)
|
|
29
|
+
@current_companies = get_current_companies(page)
|
|
30
|
+
@past_companies = get_past_companies(page)
|
|
31
|
+
@recommended_visitors = get_recommended_visitors(page)
|
|
32
|
+
@education = get_education(page)
|
|
33
|
+
@linkedin_url = url
|
|
34
|
+
@websites = get_websites(page)
|
|
35
|
+
@groups = get_groups(page)
|
|
36
|
+
@skills = get_skills(page)
|
|
37
|
+
@page = page
|
|
106
38
|
end
|
|
107
39
|
#returns nil if the request fails (for example, a 404 response); the error is printed
|
|
108
40
|
|
|
109
|
-
def self.get_profile
|
|
41
|
+
def self.get_profile(url)
|
|
110
42
|
begin
|
|
111
|
-
@agent=Mechanize.new
|
|
43
|
+
@agent = Mechanize.new
|
|
112
44
|
@agent.user_agent_alias = USER_AGENTS.sample
|
|
113
45
|
@agent.max_history = 0
|
|
114
|
-
page
|
|
46
|
+
page = @agent.get(url)
|
|
115
47
|
return Linkedin::Profile.new(page, url)
|
|
116
48
|
rescue=>e
|
|
117
49
|
puts e
|
|
118
50
|
end
|
|
119
51
|
end
|
|
120
52
|
|
|
121
|
-
def
|
|
53
|
+
def get_skills(page)
|
|
54
|
+
page.search('.competency.show-bean').map{|skill|skill.text.strip if skill.text}
|
|
55
|
+
end
|
|
56
|
+
|
|
57
|
+
def get_company_url(node)
|
|
58
|
+
result={}
|
|
122
59
|
if node.at("h4/strong/a")
|
|
123
|
-
link=node.at("h4/strong/a")["href"]
|
|
124
|
-
@agent=Mechanize.new
|
|
60
|
+
link = node.at("h4/strong/a")["href"]
|
|
61
|
+
@agent = Mechanize.new
|
|
125
62
|
@agent.user_agent_alias = USER_AGENTS.sample
|
|
126
63
|
@agent.max_history = 0
|
|
127
|
-
page
|
|
128
|
-
|
|
64
|
+
page = @agent.get("http://www.linkedin.com"+link)
|
|
65
|
+
result[:linkedin_company_url] = "http://www.linkedin.com"+link
|
|
66
|
+
result[:url] = page.at(".basic-info/div/dl/dd/a").text if page.at(".basic-info/div/dl/dd/a")
|
|
67
|
+
node_2 = page.at(".basic-info").at(".content.inner-mod")
|
|
68
|
+
node_2.search("dd").zip(node_2.search("dt")).each do |value,title|
|
|
69
|
+
result[title.text.gsub(" ","_").downcase.to_sym] = value.text.strip
|
|
70
|
+
end
|
|
71
|
+
result[:address] = page.at(".vcard.hq").at(".adr").text.gsub("\n"," ").strip if page.at(".vcard.hq")
|
|
129
72
|
end
|
|
73
|
+
result
|
|
130
74
|
end
|
|
131
75
|
|
|
132
76
|
private
|
|
@@ -155,89 +99,92 @@ module Linkedin
|
|
|
155
99
|
return page.at(".industry").text.gsub(/\s+/, " ").strip if page.search(".industry").first
|
|
156
100
|
end
|
|
157
101
|
|
|
102
|
+
def get_picture page
|
|
103
|
+
return page.at("#profile-picture/img.photo").attributes['src'].value.strip if page.search("#profile-picture/img.photo").first
|
|
104
|
+
end
|
|
105
|
+
|
|
158
106
|
def get_past_companies page
|
|
159
107
|
past_cs=[]
|
|
160
108
|
if page.search(".position.experience.vevent.vcard.summary-past").first
|
|
161
109
|
page.search(".position.experience.vevent.vcard.summary-past").each do |past_company|
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
110
|
+
result = get_company_url past_company
|
|
111
|
+
url = result[:url]
|
|
112
|
+
title = past_company.at("h3").text.gsub(/\s+|\n/, " ").strip if past_company.at("h3")
|
|
113
|
+
company = past_company.at("h4").text.gsub(/\s+|\n/, " ").strip if past_company.at("h4")
|
|
114
|
+
description = past_company.at(".description.past-position").text.gsub(/\s+|\n/, " ").strip if past_company.at(".description.past-position")
|
|
115
|
+
p_company = {:past_company=>company,:past_title=> title,:past_company_website=>url,:description=>description}
|
|
116
|
+
p_company = p_company.merge(result)
|
|
117
|
+
past_cs << p_company
|
|
168
118
|
end
|
|
169
119
|
return past_cs
|
|
170
120
|
end
|
|
171
121
|
end
|
|
172
122
|
|
|
173
123
|
def get_current_companies page
|
|
174
|
-
current_cs=[]
|
|
124
|
+
current_cs = []
|
|
175
125
|
if page.search(".position.experience.vevent.vcard.summary-current").first
|
|
176
126
|
page.search(".position.experience.vevent.vcard.summary-current").each do |current_company|
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
127
|
+
result = get_company_url current_company
|
|
128
|
+
url = result[:url]
|
|
129
|
+
title = current_company.at("h3").text.gsub(/\s+|\n/, " ").strip if current_company.at("h3")
|
|
130
|
+
company = current_company.at("h4").text.gsub(/\s+|\n/, " ").strip if current_company.at("h4")
|
|
131
|
+
description = current_company.at(".description.current-position").text.gsub(/\s+|\n/, " ").strip if current_company.at(".description.current-position")
|
|
132
|
+
current_company = {:current_company=>company,:current_title=> title,:current_company_url=>url,:description=>description}
|
|
133
|
+
current_cs << current_company.merge(result)
|
|
183
134
|
end
|
|
184
135
|
return current_cs
|
|
185
136
|
end
|
|
186
137
|
end
|
|
187
138
|
|
|
188
|
-
def get_education
|
|
139
|
+
def get_education(page)
|
|
189
140
|
education=[]
|
|
190
141
|
if page.search(".position.education.vevent.vcard").first
|
|
191
142
|
page.search(".position.education.vevent.vcard").each do |item|
|
|
192
|
-
name=item.at("h3").text.gsub(/\s+|\n/, " ").strip if item.at("h3")
|
|
193
|
-
desc=item.at("h4").text.gsub(/\s+|\n/, " ").strip if item.at("h4")
|
|
194
|
-
period=item.at(".period").text.gsub(/\s+|\n/, " ").strip if item.at(".period")
|
|
195
|
-
edu={:name=>name,:description=>desc,:period=>period}
|
|
196
|
-
education<<edu
|
|
143
|
+
name = item.at("h3").text.gsub(/\s+|\n/, " ").strip if item.at("h3")
|
|
144
|
+
desc = item.at("h4").text.gsub(/\s+|\n/, " ").strip if item.at("h4")
|
|
145
|
+
period = item.at(".period").text.gsub(/\s+|\n/, " ").strip if item.at(".period")
|
|
146
|
+
edu = {:name => name,:description => desc,:period => period}
|
|
147
|
+
education << edu
|
|
197
148
|
end
|
|
198
149
|
return education
|
|
199
150
|
end
|
|
200
151
|
end
|
|
201
152
|
|
|
202
|
-
def get_websites
|
|
153
|
+
def get_websites(page)
|
|
203
154
|
websites=[]
|
|
204
155
|
if page.search(".website").first
|
|
205
156
|
page.search(".website").each do |site|
|
|
206
|
-
url=site.at("a")["href"]
|
|
207
|
-
url="http://www.linkedin.com"+url
|
|
208
|
-
url=CGI.parse(URI.parse(url).query)["url"]
|
|
209
|
-
websites<<url
|
|
157
|
+
url = site.at("a")["href"]
|
|
158
|
+
url = "http://www.linkedin.com"+url
|
|
159
|
+
url = CGI.parse(URI.parse(url).query)["url"]
|
|
160
|
+
websites << url
|
|
210
161
|
end
|
|
211
162
|
return websites.flatten!
|
|
212
|
-
end
|
|
163
|
+
end
|
|
213
164
|
end
|
|
214
165
|
|
|
215
|
-
def get_groups
|
|
216
|
-
groups=[]
|
|
166
|
+
def get_groups(page)
|
|
167
|
+
groups = []
|
|
217
168
|
if page.search(".group-data").first
|
|
218
169
|
page.search(".group-data").each do |item|
|
|
219
|
-
name=item.text.gsub(/\s+|\n/, " ").strip
|
|
220
|
-
link="http://www.linkedin.com"+item.at("a")["href"]
|
|
221
|
-
groups<<{:name=>name,:link=>link}
|
|
170
|
+
name = item.text.gsub(/\s+|\n/, " ").strip
|
|
171
|
+
link = "http://www.linkedin.com"+item.at("a")["href"]
|
|
172
|
+
groups << {:name=>name,:link=>link}
|
|
222
173
|
end
|
|
223
174
|
return groups
|
|
224
175
|
end
|
|
225
|
-
|
|
226
176
|
end
|
|
227
177
|
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
def get_recommended_visitors page
|
|
178
|
+
def get_recommended_visitors(page)
|
|
232
179
|
recommended_vs=[]
|
|
233
180
|
if page.search(".browsemap").first
|
|
234
181
|
page.at(".browsemap").at("ul").search("li").each do |visitor|
|
|
235
|
-
v={}
|
|
236
|
-
v[:link]=visitor.at('a')
|
|
237
|
-
v[:name]=visitor.at('a').text
|
|
238
|
-
v[:title]=visitor.at('.headline').text.split(" at ").first
|
|
239
|
-
v[:company]=visitor.at('.headline').text.split(" at ")
|
|
240
|
-
recommended_vs<<v
|
|
182
|
+
v = {}
|
|
183
|
+
v[:link] = visitor.at('a')["href"]
|
|
184
|
+
v[:name] = visitor.at('strong/a').text
|
|
185
|
+
v[:title] = visitor.at('.headline').text.gsub("..."," ").split(" at ").first
|
|
186
|
+
v[:company] = visitor.at('.headline').text.gsub("..."," ").split(" at ")[1]
|
|
187
|
+
recommended_vs << v
|
|
241
188
|
end
|
|
242
189
|
return recommended_vs
|
|
243
190
|
end
|
data/linkedin-scraper.gemspec
CHANGED
|
@@ -3,13 +3,10 @@ require File.expand_path('../lib/linkedin-scraper/version', __FILE__)
|
|
|
3
3
|
|
|
4
4
|
Gem::Specification.new do |gem|
|
|
5
5
|
gem.authors = ["Yatish Mehta"]
|
|
6
|
-
gem.email = ["yatishmehta27@gmail.com"]
|
|
7
6
|
gem.description = %q{Scrapes the linkedin profile when a url is given }
|
|
8
7
|
gem.summary = %q{when a url of public linkedin profile page is given it scrapes the entire page and converts into a accessible object}
|
|
9
8
|
gem.homepage = "https://github.com/yatishmehta27/linkedin-scraper"
|
|
10
|
-
|
|
11
|
-
gem.add_dependency(%q<mechanize>, [">= 0"])
|
|
12
|
-
|
|
9
|
+
gem.add_dependency(%q<mechanize>, [">= 0"])
|
|
13
10
|
gem.files = `git ls-files`.split($\)
|
|
14
11
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
|
15
12
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
require 'spec_helper'
|
|
2
|
+
require 'linkedin-scraper'
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
describe Linkedin::Profile do
|
|
6
|
+
describe "::get_profile" do
|
|
7
|
+
it "Create an instance of profile class and populate it will all details" do
|
|
8
|
+
@profile = Linkedin::Profile.get_profile("http://www.linkedin.com/in/jeffweiner08")
|
|
9
|
+
@profile.first_name.should == "Jeff"
|
|
10
|
+
#other parameters may change with time
|
|
11
|
+
end
|
|
12
|
+
end
|
|
13
|
+
end
|
data/spec/spec_helper.rb
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
$: << File.join(File.dirname(__FILE__), '../lib')
|
|
2
|
+
# This file was generated by the `rspec --init` command. Conventionally, all
|
|
3
|
+
# specs live under a `spec` directory, which RSpec adds to the `$LOAD_PATH`.
|
|
4
|
+
# Require this file using `require "spec_helper"` to ensure that it is only
|
|
5
|
+
# loaded once.
|
|
6
|
+
#
|
|
7
|
+
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
|
8
|
+
RSpec.configure do |config|
|
|
9
|
+
config.treat_symbols_as_metadata_keys_with_true_values = true
|
|
10
|
+
config.run_all_when_everything_filtered = true
|
|
11
|
+
config.filter_run :focus
|
|
12
|
+
|
|
13
|
+
# Run specs in random order to surface order dependencies. If you find an
|
|
14
|
+
# order dependency and want to debug it, you can fix the order by providing
|
|
15
|
+
# the seed, which is printed after each run.
|
|
16
|
+
# --seed 1234
|
|
17
|
+
config.order = 'random'
|
|
18
|
+
end
|
metadata
CHANGED
|
@@ -1,35 +1,31 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: linkedin-scraper
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.0.
|
|
5
|
-
prerelease:
|
|
4
|
+
version: 0.0.8
|
|
6
5
|
platform: ruby
|
|
7
6
|
authors:
|
|
8
7
|
- Yatish Mehta
|
|
9
8
|
autorequire:
|
|
10
9
|
bindir: bin
|
|
11
10
|
cert_chain: []
|
|
12
|
-
date:
|
|
11
|
+
date: 2013-03-12 00:00:00.000000000 Z
|
|
13
12
|
dependencies:
|
|
14
13
|
- !ruby/object:Gem::Dependency
|
|
15
14
|
name: mechanize
|
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
|
17
|
-
none: false
|
|
18
16
|
requirements:
|
|
19
|
-
- -
|
|
17
|
+
- - '>='
|
|
20
18
|
- !ruby/object:Gem::Version
|
|
21
19
|
version: '0'
|
|
22
20
|
type: :runtime
|
|
23
21
|
prerelease: false
|
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
25
|
-
none: false
|
|
26
23
|
requirements:
|
|
27
|
-
- -
|
|
24
|
+
- - '>='
|
|
28
25
|
- !ruby/object:Gem::Version
|
|
29
26
|
version: '0'
|
|
30
|
-
description:
|
|
31
|
-
email:
|
|
32
|
-
- yatishmehta27@gmail.com
|
|
27
|
+
description: 'Scrapes the linkedin profile when a url is given '
|
|
28
|
+
email:
|
|
33
29
|
executables: []
|
|
34
30
|
extensions: []
|
|
35
31
|
extra_rdoc_files: []
|
|
@@ -37,35 +33,38 @@ files:
|
|
|
37
33
|
- .gitignore
|
|
38
34
|
- Gemfile
|
|
39
35
|
- LICENSE
|
|
40
|
-
- README.
|
|
36
|
+
- README.md
|
|
41
37
|
- Rakefile
|
|
42
38
|
- lib/linkedin-scraper.rb
|
|
43
39
|
- lib/linkedin-scraper/profile.rb
|
|
44
40
|
- lib/linkedin-scraper/version.rb
|
|
45
41
|
- linkedin-scraper.gemspec
|
|
42
|
+
- spec/linkedin-scraper/profile_spec.rb
|
|
43
|
+
- spec/spec_helper.rb
|
|
46
44
|
homepage: https://github.com/yatishmehta27/linkedin-scraper
|
|
47
45
|
licenses: []
|
|
46
|
+
metadata: {}
|
|
48
47
|
post_install_message:
|
|
49
48
|
rdoc_options: []
|
|
50
49
|
require_paths:
|
|
51
50
|
- lib
|
|
52
51
|
required_ruby_version: !ruby/object:Gem::Requirement
|
|
53
|
-
none: false
|
|
54
52
|
requirements:
|
|
55
|
-
- -
|
|
53
|
+
- - '>='
|
|
56
54
|
- !ruby/object:Gem::Version
|
|
57
55
|
version: '0'
|
|
58
56
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
59
|
-
none: false
|
|
60
57
|
requirements:
|
|
61
|
-
- -
|
|
58
|
+
- - '>='
|
|
62
59
|
- !ruby/object:Gem::Version
|
|
63
60
|
version: '0'
|
|
64
61
|
requirements: []
|
|
65
62
|
rubyforge_project:
|
|
66
|
-
rubygems_version:
|
|
63
|
+
rubygems_version: 2.0.0
|
|
67
64
|
signing_key:
|
|
68
|
-
specification_version:
|
|
65
|
+
specification_version: 4
|
|
69
66
|
summary: when a url of public linkedin profile page is given it scrapes the entire
|
|
70
67
|
page and converts into a accessible object
|
|
71
|
-
test_files:
|
|
68
|
+
test_files:
|
|
69
|
+
- spec/linkedin-scraper/profile_spec.rb
|
|
70
|
+
- spec/spec_helper.rb
|
data/README.rdoc
DELETED
|
@@ -1,134 +0,0 @@
|
|
|
1
|
-
= Linkedin-Scraper {<img src="http://travis-ci.org/jaimeiniesta/metainspector.png" />}[http://travis-ci.org/jaimeiniesta/metainspector]
|
|
2
|
-
|
|
3
|
-
Linkedin-scraper is a gem for scraping linkedin public profiles. You give it an URL, and it lets you easily get its title,name,country,area,current_companies .
|
|
4
|
-
|
|
5
|
-
= Installation
|
|
6
|
-
|
|
7
|
-
Install the gem from RubyGems:
|
|
8
|
-
|
|
9
|
-
gem install linkedin-scraper
|
|
10
|
-
|
|
11
|
-
This gem is tested on Ruby versions 1.8.7, 1.9.2 and 1.9.3.
|
|
12
|
-
|
|
13
|
-
= Usage
|
|
14
|
-
|
|
15
|
-
Initialize a scraper instance for an URL, like this:
|
|
16
|
-
|
|
17
|
-
profile = Linkedin::Profile.get_profile('http://in.linkedin.com/pub/yatish-mehta/22/460/a86')
|
|
18
|
-
|
|
19
|
-
Then you can see the scraped data like this:
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
profile.first_name #the First name of the contact
|
|
23
|
-
|
|
24
|
-
profile.last_name #the last name of the contact
|
|
25
|
-
|
|
26
|
-
profile.title #the linkedin job title
|
|
27
|
-
|
|
28
|
-
profile.location #the location of the contact
|
|
29
|
-
|
|
30
|
-
profile.country #the country of the contact
|
|
31
|
-
|
|
32
|
-
profile.industry #the domain for which the contact belongs
|
|
33
|
-
|
|
34
|
-
profile.past_companies
|
|
35
|
-
#Array of hash containing its past job companies and job profile
|
|
36
|
-
#Example
|
|
37
|
-
# [
|
|
38
|
-
# [0] {
|
|
39
|
-
# :past_company => "Consumyze Software",
|
|
40
|
-
# :past_title => "Trainee",
|
|
41
|
-
# :past_company_website => "http://www.consumyze.com",
|
|
42
|
-
# :description => "Responsible for design and development"
|
|
43
|
-
# },
|
|
44
|
-
# [1] {
|
|
45
|
-
# :past_company => "SunGard Global Services",
|
|
46
|
-
# :past_title => "Project Intern",
|
|
47
|
-
# :past_company_website => "http://www.sungard.com/globalservices/learnmore",
|
|
48
|
-
# :description => "Fame PassPoint. Developed an entirely Ajax based online control panel for user management and Data access for Fame"
|
|
49
|
-
# }
|
|
50
|
-
# ]
|
|
51
|
-
profile.current_companies
|
|
52
|
-
#Array of hash containing its current job companies and job profile
|
|
53
|
-
#Example
|
|
54
|
-
# [
|
|
55
|
-
# [0] {
|
|
56
|
-
# :current_title => "Intern",
|
|
57
|
-
# :current_company => "Sungard"
|
|
58
|
-
# :current_company_url=>"http://www.betterlabs.net",
|
|
59
|
-
# :description=>"Responsible for design and development of projects on Ruby on Rails."
|
|
60
|
-
# },
|
|
61
|
-
# [1] {
|
|
62
|
-
# :current_title => "Software Developer",
|
|
63
|
-
# :current_company => "Microsoft"
|
|
64
|
-
# :current_company_url =>"http://www.microsoft.net",
|
|
65
|
-
# :description =>"Development and design"
|
|
66
|
-
|
|
67
|
-
# }
|
|
68
|
-
# ]
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
profile.linkedin_url #url of the profile
|
|
72
|
-
|
|
73
|
-
profile.websites
|
|
74
|
-
#Array of websites
|
|
75
|
-
#[
|
|
76
|
-
# [0] "http://www.yatishmehta.in"
|
|
77
|
-
#]
|
|
78
|
-
|
|
79
|
-
profile.groups
|
|
80
|
-
#array of hashes containing group name and link
|
|
81
|
-
# [
|
|
82
|
-
# [ 0] {
|
|
83
|
-
# :name => "Business on Rails",
|
|
84
|
-
# :link => "http://www.linkedin.com/groups/Business-on-Rails-27822"
|
|
85
|
-
# },
|
|
86
|
-
# [ 1] {
|
|
87
|
-
# :name => "HTML5 Technologies",
|
|
88
|
-
# :link => "http://www.linkedin.com/groups/HTML5-Technologies-2868882"
|
|
89
|
-
# },
|
|
90
|
-
# [ 2] {
|
|
91
|
-
# :name => "India on Rails",
|
|
92
|
-
# :link => "http://www.linkedin.com/groups/India-on-Rails-149940"
|
|
93
|
-
# :name => "Open Source",
|
|
94
|
-
# :link => "http://www.linkedin.com/groups?gid=43875"
|
|
95
|
-
# },
|
|
96
|
-
# [ 4] {
|
|
97
|
-
# :name => "Rails Developers",
|
|
98
|
-
# :link => "http://www.linkedin.com/groups?gid=77764"
|
|
99
|
-
# },
|
|
100
|
-
# ]
|
|
101
|
-
|
|
102
|
-
profile.education
|
|
103
|
-
#Array of hashes for eduction
|
|
104
|
-
# [
|
|
105
|
-
# [0] {
|
|
106
|
-
# :name => "Vishwakarma Institute of Technology",
|
|
107
|
-
# :description => "B.Tech, Computer Engineering",
|
|
108
|
-
# :period => "2007 – 2011"
|
|
109
|
-
# },
|
|
110
|
-
# [1] {
|
|
111
|
-
# :name => "St Ursula's High School",
|
|
112
|
-
# :description => "Secondary School Education",
|
|
113
|
-
# :period => nil
|
|
114
|
-
# }
|
|
115
|
-
# ]
|
|
116
|
-
|
|
117
|
-
profile.recommended_visitors
|
|
118
|
-
#Its the list of visitors "Viewers of this profile also viewed..."
|
|
119
|
-
#attr_accessor :recommended_visitors = [
|
|
120
|
-
# [0] {
|
|
121
|
-
# :link => href="http://in.linkedin.com/in/nileshavhad?trk=pub-pbmap",
|
|
122
|
-
# :name => "Nilesh Avhad",
|
|
123
|
-
# :title => "Engineering Manager",
|
|
124
|
-
# :company => "Better Labs"
|
|
125
|
-
# },
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
= ZOMG Fork! Thank you!
|
|
129
|
-
|
|
130
|
-
You're welcome to fork this project and send pull requests. I want to thank specially:
|
|
131
|
-
|
|
132
|
-
= To Do
|
|
133
|
-
*
|
|
134
|
-
Copyright (c) 2009-2012 Yatish Mehta, released under the MIT license
|