linkedin-scraper 0.0.6 → 0.0.7

Sign up to get free protection for your applications and to get access to all the features.
@@ -32,34 +32,87 @@ Then you can see the scraped data like this:
32
32
  profile.industry #the domain for which the contact belongs
33
33
 
34
34
  profile.past_companies
35
- #Array of hash containing its past job companies and job profile
36
- #Example
37
- # [
38
- # [0] {
39
- # :past_title => "Intern",
40
- # :past_company => "Sungard"
41
- # },
42
- # [1] {
43
- # :past_title => "Software Developer",
44
- # :past_company => "Microsoft"
45
- # }
46
- # ]
35
+ #Array of hash containing its past job companies and job profile
36
+ #Example
37
+ # [
38
+ # [0] {
39
+ # :past_company => "Consumyze Software",
40
+ # :past_title => "Trainee",
41
+ # :past_company_website => "http://www.consumyze.com",
42
+ # :description => "Responsible for design and development"
43
+ # },
44
+ # [1] {
45
+ # :past_company => "SunGard Global Services",
46
+ # :past_title => "Project Intern",
47
+ # :past_company_website => "http://www.sungard.com/globalservices/learnmore",
48
+ # :description => "Fame PassPoint. Developed an entirely Ajax based online control panel for user management and Data access for Fame"
49
+ # }
50
+ # ]
47
51
  profile.current_companies
48
52
  #Array of hash containing its current job companies and job profile
49
- #Example
50
- # [
51
- # [0] {
52
- # :current_title => "Intern",
53
- # :current_company => "Sungard"
54
- # },
55
- # [1] {
56
- # :current_title => "Software Developer",
57
- # :current_company => "Microsoft"
58
- # }
59
- # ]
53
+ #Example
54
+ # [
55
+ # [0] {
56
+ # :current_title => "Intern",
57
+ # :current_company => "Sungard",
58
+ # :current_company_url=>"http://www.betterlabs.net",
59
+ # :description=>"Responsible for design and development of projects on Ruby on Rails."
60
+ # },
61
+ # [1] {
62
+ # :current_title => "Software Developer",
63
+ # :current_company => "Microsoft",
64
+ # :current_company_url =>"http://www.microsoft.net",
65
+ # :description =>"Development and design"
66
+
67
+ # }
68
+ # ]
60
69
 
61
70
 
62
71
  profile.linkedin_url #url of the profile
72
+
73
+ profile.websites
74
+ #Array of websites
75
+ #[
76
+ # [0] "http://www.yatishmehta.in"
77
+ #]
78
+
79
+ profile.groups
80
+ #array of hashes containing group name and link
81
+ # [
82
+ # [ 0] {
83
+ # :name => "Business on Rails",
84
+ # :link => "http://www.linkedin.com/groups/Business-on-Rails-27822"
85
+ # },
86
+ # [ 1] {
87
+ # :name => "HTML5 Technologies",
88
+ # :link => "http://www.linkedin.com/groups/HTML5-Technologies-2868882"
89
+ # },
90
+ # [ 2] {
91
+ # :name => "India on Rails",
92
+ # :link => "http://www.linkedin.com/groups/India-on-Rails-149940"
93
+ # :name => "Open Source",
94
+ # :link => "http://www.linkedin.com/groups?gid=43875"
95
+ # },
96
+ # [ 4] {
97
+ # :name => "Rails Developers",
98
+ # :link => "http://www.linkedin.com/groups?gid=77764"
99
+ # },
100
+ # ]
101
+
102
+ profile.education
103
+ #Array of hashes for education
104
+ # [
105
+ # [0] {
106
+ # :name => "Vishwakarma Institute of Technology",
107
+ # :description => "B.Tech, Computer Engineering",
108
+ # :period => "2007 – 2011"
109
+ # },
110
+ # [1] {
111
+ # :name => "St Ursula's High School",
112
+ # :description => "Secondary School Education",
113
+ # :period => nil
114
+ # }
115
+ # ]
63
116
 
64
117
  profile.recommended_visitors
65
118
  #It's the list of visitors "Viewers of this profile also viewed..."
@@ -1,6 +1,7 @@
1
- require "linkedin-scraper/version"
2
1
  require "rubygems"
3
2
  require "mechanize"
3
+ require "cgi"
4
+ require "net/http"
4
5
  Dir["#{File.expand_path(File.dirname(__FILE__))}/linkedin-scraper/*.rb"].each {|file| require file }
5
6
 
6
7
 
@@ -3,22 +3,68 @@ module Linkedin
3
3
  class Profile
4
4
  #the First name of the contact
5
5
  attr_accessor :first_name,:last_name,:title,:location,:country,
6
- :industry, :linkedin_url,:recommended_visitors,:profile,
7
- :page
6
+ :industry, :linkedin_url,:recommended_visitors,:page
7
+ #Array of hashes for education
8
+ # [
9
+ # [0] {
10
+ # :name => "Vishwakarma Institute of Technology",
11
+ # :description => "B.Tech, Computer Engineering",
12
+ # :period => "2007 – 2011"
13
+ # },
14
+ # [1] {
15
+ # :name => "St Ursula's High School",
16
+ # :description => "Secondary School Education",
17
+ # :period => nil
18
+ # }
19
+ # ]
20
+ attr_accessor :education
21
+
22
+ #Array of websites
23
+ #[
24
+ #[0] "http://www.yatishmehta.in"
25
+ #]
26
+ attr_accessor :websites
27
+ #array of hashes containing group name and link
28
+ # [
29
+ # [ 0] {
30
+ # :name => "Business on Rails",
31
+ # :link => "http://www.linkedin.com/groups/Business-on-Rails-27822"
32
+ # },
33
+ # [ 1] {
34
+ # :name => "HTML5 Technologies",
35
+ # :link => "http://www.linkedin.com/groups/HTML5-Technologies-2868882"
36
+ # },
37
+ # [ 2] {
38
+ # :name => "India on Rails",
39
+ # :link => "http://www.linkedin.com/groups/India-on-Rails-149940"
40
+ # :name => "Open Source",
41
+ # :link => "http://www.linkedin.com/groups?gid=43875"
42
+ # },
43
+ # [ 4] {
44
+ # :name => "Rails Developers",
45
+ # :link => "http://www.linkedin.com/groups?gid=77764"
46
+ # },
47
+ # ]
48
+ attr_accessor:groups
8
49
 
9
50
 
10
51
  #Array of hash containing its past job companies and job profile
11
52
  #Example
12
53
  # [
13
- # [0] {
14
- # :past_title => "Intern",
15
- # :past_company => "Sungard"
16
- # },
17
- # [1] {
18
- # :past_title => "Software Developer",
19
- # :past_company => "Microsoft"
20
- # }
21
- # ]
54
+ # [0] {
55
+ # :past_company => "Consumyze Software",
56
+ # :past_title => "Trainee",
57
+ # :past_company_website => "http://www.consumyze.com",
58
+ # :description => "Responsible for design and development"
59
+ # },
60
+ # [1] {
61
+ # :past_company => "SunGard Global Services",
62
+ # :past_title => "Project Intern",
63
+ # :past_company_website => "http://www.sungard.com/globalservices/learnmore",
64
+ # :description => "Fame PassPoint. Developed an entirely Ajax based online control panel for user management and Data access for Fame"
65
+ # }
66
+ # ]
67
+
22
68
 
23
69
  attr_accessor :past_companies
24
70
  #Array of hash containing its current job companies and job profile
@@ -27,10 +73,15 @@ module Linkedin
27
73
  # [0] {
28
74
  # :current_title => "Intern",
29
75
  # :current_company => "Sungard",
76
+ # :current_company_url=>"http://www.betterlabs.net",
77
+ # :description=>"Responsible for design and development of projects on Ruby on Rails."
30
78
  # },
31
79
  # [1] {
32
- # :current_title => "Software Developer",
33
- # :current_company => "Microsoft"
80
+ # :current_title => "Software Developer",
81
+ # :current_company => "Microsoft",
82
+ # :current_company_url =>"http://www.microsoft.net",
83
+ # :description =>"Development and design"
84
+
34
85
  # }
35
86
  # ]
36
87
  attr_accessor :current_companies
@@ -47,10 +98,14 @@ module Linkedin
47
98
  @current_companies=get_current_companies page
48
99
  @past_companies=get_past_companies page
49
100
  @recommended_visitors=get_recommended_visitors page
101
+ @education=get_education page
50
102
  @linkedin_url=url
103
+ @websites=get_websites page
104
+ @groups=get_groups page
51
105
  @page=page
52
106
  end
53
107
  #returns nil if the request results in a 404 response
108
+
54
109
  def self.get_profile url
55
110
  begin
56
111
  @agent=Mechanize.new
@@ -63,6 +118,17 @@ module Linkedin
63
118
  end
64
119
  end
65
120
 
121
+ def get_company_url node
122
+ if node.at("h4/strong/a")
123
+ link=node.at("h4/strong/a")["href"]
124
+ @agent=Mechanize.new
125
+ @agent.user_agent_alias = USER_AGENTS.sample
126
+ @agent.max_history = 0
127
+ page=@agent.get("http://www.linkedin.com"+link)
128
+ url=page.at(".basic-info/div/dl/dd/a").text if page.at(".basic-info/div/dl/dd/a")
129
+ end
130
+ end
131
+
66
132
  private
67
133
 
68
134
  def get_first_name page
@@ -91,12 +157,13 @@ module Linkedin
91
157
 
92
158
  def get_past_companies page
93
159
  past_cs=[]
94
- if page.search(".past").first
95
- page.search(".past").search("li").each do |past_company|
96
- title,company=past_company.text.strip.split(" at ")
97
- company=company.gsub(/\s+/, " ").strip if company
98
- title=title.gsub(/\s+/, " ").strip if title
99
- past_company={:past_company=>company,:past_title=> title}
160
+ if page.search(".position.experience.vevent.vcard.summary-past").first
161
+ page.search(".position.experience.vevent.vcard.summary-past").each do |past_company|
162
+ url=get_company_url past_company
163
+ title=past_company.at("h3").text.gsub(/\s+|\n/, " ").strip if past_company.at("h3")
164
+ company=past_company.at("h4").text.gsub(/\s+|\n/, " ").strip if past_company.at("h4")
165
+ description=past_company.at(".description.past-position").text.gsub(/\s+|\n/, " ").strip if past_company.at(".description.past-position")
166
+ past_company={:past_company=>company,:past_title=> title,:past_company_website=>url,:description=>description}
100
167
  past_cs<<past_company
101
168
  end
102
169
  return past_cs
@@ -105,18 +172,62 @@ module Linkedin
105
172
 
106
173
  def get_current_companies page
107
174
  current_cs=[]
108
- if page.search(".current").first
109
- page.search(".current").search("li").each do |past_company|
110
- title,company=past_company.text.strip.split(" at ")
111
- company=company.gsub(/\s+/, " ").strip if company
112
- title=title.gsub(/\s+/, " ").strip if title
113
- current_company={:current_company=>company,:current_title=> title}
175
+ if page.search(".position.experience.vevent.vcard.summary-current").first
176
+ page.search(".position.experience.vevent.vcard.summary-current").each do |current_company|
177
+ url=get_company_url current_company
178
+ title=current_company.at("h3").text.gsub(/\s+|\n/, " ").strip if current_company.at("h3")
179
+ company=current_company.at("h4").text.gsub(/\s+|\n/, " ").strip if current_company.at("h4")
180
+ description=current_company.at(".description.current-position").text.gsub(/\s+|\n/, " ").strip if current_company.at(".description.current-position")
181
+ current_company={:current_company=>company,:current_title=> title,:current_company_url=>url,:description=>description}
114
182
  current_cs<<current_company
115
183
  end
116
184
  return current_cs
117
185
  end
118
186
  end
119
187
 
188
+ def get_education page
189
+ education=[]
190
+ if page.search(".position.education.vevent.vcard").first
191
+ page.search(".position.education.vevent.vcard").each do |item|
192
+ name=item.at("h3").text.gsub(/\s+|\n/, " ").strip if item.at("h3")
193
+ desc=item.at("h4").text.gsub(/\s+|\n/, " ").strip if item.at("h4")
194
+ period=item.at(".period").text.gsub(/\s+|\n/, " ").strip if item.at(".period")
195
+ edu={:name=>name,:description=>desc,:period=>period}
196
+ education<<edu
197
+ end
198
+ return education
199
+ end
200
+ end
201
+
202
+ def get_websites page
203
+ websites=[]
204
+ if page.search(".website").first
205
+ page.search(".website").each do |site|
206
+ url=site.at("a")["href"]
207
+ url="http://www.linkedin.com"+url
208
+ url=CGI.parse(URI.parse(url).query)["url"]
209
+ websites<<url
210
+ end
211
+ return websites.flatten!
212
+ end
213
+ end
214
+
215
+ def get_groups page
216
+ groups=[]
217
+ if page.search(".group-data").first
218
+ page.search(".group-data").each do |item|
219
+ name=item.text.gsub(/\s+|\n/, " ").strip
220
+ link="http://www.linkedin.com"+item.at("a")["href"]
221
+ groups<<{:name=>name,:link=>link}
222
+ end
223
+ return groups
224
+ end
225
+
226
+ end
227
+
228
+
229
+
230
+
120
231
  def get_recommended_visitors page
121
232
  recommended_vs=[]
122
233
  if page.search(".browsemap").first
@@ -130,7 +241,6 @@ module Linkedin
130
241
  end
131
242
  return recommended_vs
132
243
  end
133
-
134
244
  end
135
245
  end
136
246
  end
@@ -1,5 +1,5 @@
1
1
  module Linkedin
2
2
  module Scraper
3
- VERSION = "0.0.6"
3
+ VERSION = "0.0.7"
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: linkedin-scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.6
4
+ version: 0.0.7
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-23 00:00:00.000000000 Z
12
+ date: 2012-08-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize