linkedin-scraper 0.0.3 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/README.rdoc +0 -36
- data/lib/linkedin-scraper.rb +1 -4
- data/lib/linkedin-scraper/profile.rb +9 -22
- data/lib/linkedin-scraper/version.rb +1 -1
- data/linkedin-scraper.gemspec +3 -3
- metadata +34 -80
- data/lib/linkedin-scraper/client.rb +0 -125
- data/lib/linkedin-scraper/contact.rb +0 -134
data/README.rdoc
CHANGED
@@ -71,42 +71,6 @@ Then you can see the scraped data like this:
|
|
71
71
|
# :company => "Better Labs"
|
72
72
|
# },
|
73
73
|
|
74
|
-
= Examples
|
75
|
-
|
76
|
-
When a link is given, it scrapes the profile and gets the data
|
77
|
-
|
78
|
-
attr_accessor :country = "India",
|
79
|
-
attr_accessor :current_companies = [
|
80
|
-
[0] {
|
81
|
-
:current_company => "Better Labs",
|
82
|
-
:current_title => "Software Engineer Core Platform"
|
83
|
-
}
|
84
|
-
],
|
85
|
-
attr_accessor :first_name = "Yatish",
|
86
|
-
attr_accessor :industry = "Information Technology and Services",
|
87
|
-
attr_accessor :last_name = "Mehta",
|
88
|
-
attr_accessor :linkedin_url = "http://in.linkedin.com/pub/yatish-mehta/22/460/a86",
|
89
|
-
attr_accessor :location = "Pune",
|
90
|
-
attr_accessor :past_companies = [
|
91
|
-
[0] {
|
92
|
-
:past_company => "Consumyze Software",
|
93
|
-
:past_title => "Trainee"
|
94
|
-
},
|
95
|
-
[1] {
|
96
|
-
:past_company => "SunGard Global Services",
|
97
|
-
:past_title => "Project Intern"
|
98
|
-
}
|
99
|
-
],
|
100
|
-
attr_accessor :recommended_visitors = [
|
101
|
-
[0] {
|
102
|
-
:link => href="http://in.linkedin.com/in/nileshavhad?trk=pub-pbmap",
|
103
|
-
:name => "Nilesh Avhad",
|
104
|
-
:title => "Engineering Manager",
|
105
|
-
:company => "Better Labs"
|
106
|
-
},
|
107
|
-
],
|
108
|
-
attr_accessor :title = "Software Engineer Core Platform at BetterLabs"
|
109
|
-
|
110
74
|
|
111
75
|
= ZOMG Fork! Thank you!
|
112
76
|
|
data/lib/linkedin-scraper.rb
CHANGED
@@ -1,10 +1,7 @@
|
|
1
1
|
require "linkedin-scraper/version"
|
2
2
|
require "rubygems"
|
3
3
|
require "mechanize"
|
4
|
-
require
|
4
|
+
Dir["#{File.expand_path(File.dirname(__FILE__))}/linkedin-scraper/*.rb"].each {|file| require file }
|
5
5
|
|
6
|
-
%w(client contact profile).each do |file|
|
7
|
-
require File.join(File.dirname(__FILE__), 'linkedin-scraper', file)
|
8
|
-
end
|
9
6
|
|
10
7
|
|
data/lib/linkedin-scraper/profile.rb
CHANGED
@@ -1,22 +1,11 @@
|
|
1
|
-
|
2
|
-
# and open the template in the editor.
|
1
|
+
USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"]
|
3
2
|
module Linkedin
|
4
|
-
class Profile
|
5
|
-
USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"]
|
3
|
+
class Profile
|
6
4
|
#the First name of the contact
|
7
|
-
attr_accessor :first_name
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
attr_accessor :title
|
12
|
-
#the location of the contact
|
13
|
-
attr_accessor :location
|
14
|
-
#the country of the contact
|
15
|
-
attr_accessor :country
|
16
|
-
#the domain for which the contact belongs
|
17
|
-
attr_accessor :industry
|
18
|
-
#the entire profile of the contact
|
19
|
-
attr_accessor :profile
|
5
|
+
attr_accessor :first_name,:last_name,:title,:location,:country,
|
6
|
+
:industry, :linkedin_url,:recommended_visitors,:profile,
|
7
|
+
:page
|
8
|
+
|
20
9
|
|
21
10
|
#Array of hashes containing the contact's past companies and job titles
|
22
11
|
#Example
|
@@ -46,9 +35,7 @@ module Linkedin
|
|
46
35
|
# ]
|
47
36
|
attr_accessor :current_companies
|
48
37
|
#url of the profile
|
49
|
-
|
50
|
-
#Array of hash containing its recommended visitors which come on the
|
51
|
-
attr_accessor :recommended_visitors
|
38
|
+
|
52
39
|
|
53
40
|
def initialize(page,url)
|
54
41
|
@first_name=get_first_name(page)
|
@@ -68,8 +55,8 @@ module Linkedin
|
|
68
55
|
@agent=Mechanize.new
|
69
56
|
@agent.user_agent_alias = USER_AGENTS.sample
|
70
57
|
@agent.max_history = 0
|
71
|
-
page=@agent.get url
|
72
|
-
return Linkedin::Profile.new(page, url)
|
58
|
+
@page=@agent.get url
|
59
|
+
return Linkedin::Profile.new(@page, url)
|
73
60
|
rescue=>e
|
74
61
|
puts e
|
75
62
|
end
|
data/linkedin-scraper.gemspec
CHANGED
@@ -7,9 +7,9 @@ Gem::Specification.new do |gem|
|
|
7
7
|
gem.description = %q{Scrapes the linkedin profile when a url is given }
|
8
8
|
gem.summary = %q{when a url of public linkedin profile page is given it scrapes the entire page and converts into a accessible object}
|
9
9
|
gem.homepage = "https://github.com/yatishmehta27/linkedin-scraper"
|
10
|
-
|
11
|
-
gem.add_dependency(%q<mechanize>, [">= 0"])
|
12
|
-
|
10
|
+
|
11
|
+
gem.add_dependency(%q<mechanize>, [">= 0"])
|
12
|
+
|
13
13
|
gem.files = `git ls-files`.split($\)
|
14
14
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
15
15
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
metadata
CHANGED
@@ -1,117 +1,71 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedin-scraper
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.5
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 0
|
9
|
-
- 3
|
10
|
-
version: 0.0.3
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Yatish Mehta
|
14
9
|
autorequire:
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
- !ruby/object:Gem::Dependency
|
21
|
-
name: httparty
|
22
|
-
prerelease: false
|
23
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
-
none: false
|
25
|
-
requirements:
|
26
|
-
- - ">="
|
27
|
-
- !ruby/object:Gem::Version
|
28
|
-
hash: 3
|
29
|
-
segments:
|
30
|
-
- 0
|
31
|
-
version: "0"
|
32
|
-
type: :runtime
|
33
|
-
version_requirements: *id001
|
34
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2012-07-23 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
35
15
|
name: mechanize
|
36
|
-
|
37
|
-
requirement: &id002 !ruby/object:Gem::Requirement
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
38
17
|
none: false
|
39
|
-
requirements:
|
40
|
-
- -
|
41
|
-
- !ruby/object:Gem::Version
|
42
|
-
|
43
|
-
segments:
|
44
|
-
- 0
|
45
|
-
version: "0"
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
46
22
|
type: :runtime
|
47
|
-
version_requirements: *id002
|
48
|
-
- !ruby/object:Gem::Dependency
|
49
|
-
name: awesome_print
|
50
23
|
prerelease: false
|
51
|
-
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
52
25
|
none: false
|
53
|
-
requirements:
|
54
|
-
- -
|
55
|
-
- !ruby/object:Gem::Version
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
version: "0"
|
60
|
-
type: :runtime
|
61
|
-
version_requirements: *id003
|
62
|
-
description: "Scrapes the linkedin profile when a url is given "
|
63
|
-
email:
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
description: ! 'Scrapes the linkedin profile when a url is given '
|
31
|
+
email:
|
64
32
|
- yatishmehta27@gmail.com
|
65
33
|
executables: []
|
66
|
-
|
67
34
|
extensions: []
|
68
|
-
|
69
35
|
extra_rdoc_files: []
|
70
|
-
|
71
|
-
files:
|
36
|
+
files:
|
72
37
|
- .gitignore
|
73
38
|
- Gemfile
|
74
39
|
- LICENSE
|
75
40
|
- README.rdoc
|
76
41
|
- Rakefile
|
77
42
|
- lib/linkedin-scraper.rb
|
78
|
-
- lib/linkedin-scraper/client.rb
|
79
|
-
- lib/linkedin-scraper/contact.rb
|
80
43
|
- lib/linkedin-scraper/profile.rb
|
81
44
|
- lib/linkedin-scraper/version.rb
|
82
45
|
- linkedin-scraper.gemspec
|
83
46
|
homepage: https://github.com/yatishmehta27/linkedin-scraper
|
84
47
|
licenses: []
|
85
|
-
|
86
48
|
post_install_message:
|
87
49
|
rdoc_options: []
|
88
|
-
|
89
|
-
require_paths:
|
50
|
+
require_paths:
|
90
51
|
- lib
|
91
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
52
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
92
53
|
none: false
|
93
|
-
requirements:
|
94
|
-
- -
|
95
|
-
- !ruby/object:Gem::Version
|
96
|
-
|
97
|
-
|
98
|
-
- 0
|
99
|
-
version: "0"
|
100
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
54
|
+
requirements:
|
55
|
+
- - ! '>='
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '0'
|
58
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
101
59
|
none: false
|
102
|
-
requirements:
|
103
|
-
- -
|
104
|
-
- !ruby/object:Gem::Version
|
105
|
-
|
106
|
-
segments:
|
107
|
-
- 0
|
108
|
-
version: "0"
|
60
|
+
requirements:
|
61
|
+
- - ! '>='
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
109
64
|
requirements: []
|
110
|
-
|
111
65
|
rubyforge_project:
|
112
|
-
rubygems_version: 1.8.
|
66
|
+
rubygems_version: 1.8.24
|
113
67
|
signing_key:
|
114
68
|
specification_version: 3
|
115
|
-
summary: when a url of public linkedin profile page is given it scrapes the entire
|
69
|
+
summary: when a url of public linkedin profile page is given it scrapes the entire
|
70
|
+
page and converts into a accessible object
|
116
71
|
test_files: []
|
117
|
-
|
@@ -1,125 +0,0 @@
|
|
1
|
-
# To change this template, choose Tools | Templates
|
2
|
-
# and open the template in the editor.
|
3
|
-
|
4
|
-
|
5
|
-
module Linkedin
|
6
|
-
class Client
|
7
|
-
USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"]
|
8
|
-
attr_accessor :contacts ,:matched_tag,:probability
|
9
|
-
|
10
|
-
def initialize(first_name,last_name ,company,options={})
|
11
|
-
@first_name=first_name.downcase
|
12
|
-
@last_name=last_name.downcase
|
13
|
-
@company=company
|
14
|
-
@country=options[:country] || "us"
|
15
|
-
@search_linkedin_url="http://#{@country}.linkedin.com/pub/dir/#{@first_name}/#{@last_name}"
|
16
|
-
@contacts=[]
|
17
|
-
@links=[]
|
18
|
-
get_agent
|
19
|
-
end
|
20
|
-
|
21
|
-
def get_agent
|
22
|
-
@agent=Mechanize.new
|
23
|
-
@agent.user_agent_alias = USER_AGENTS.sample
|
24
|
-
@agent.max_history = 0
|
25
|
-
@agent
|
26
|
-
end
|
27
|
-
|
28
|
-
def get_contacts
|
29
|
-
begin
|
30
|
-
sleep(2+rand(4))
|
31
|
-
puts "===>Father:Scrapping linkedin url "+ @search_linkedin_url
|
32
|
-
@page=@agent.get @search_linkedin_url
|
33
|
-
@page.search(".vcard").each do |node|
|
34
|
-
@contacts<<Linkedin::Contact.new(node)
|
35
|
-
end
|
36
|
-
rescue Mechanize::ResponseCodeError=>e
|
37
|
-
puts "RESCUE"
|
38
|
-
end
|
39
|
-
return @contacts
|
40
|
-
end
|
41
|
-
|
42
|
-
|
43
|
-
#TODO: refactor this function; each case needs a separate function
|
44
|
-
def get_verified_contact
|
45
|
-
get_contacts
|
46
|
-
@contacts.each do |contact|
|
47
|
-
#check current company
|
48
|
-
contact.current_companies.each do |company|
|
49
|
-
if company[:current_company]
|
50
|
-
if company[:current_company].match(/#{@company}/i)
|
51
|
-
@matched_tag="CURRENT"
|
52
|
-
return contact
|
53
|
-
end
|
54
|
-
end
|
55
|
-
end if contact.current_companies
|
56
|
-
|
57
|
-
#title of profile
|
58
|
-
if contact.title.match(/#{@company}/i)
|
59
|
-
@matched_tag="CURRENT"
|
60
|
-
return contact
|
61
|
-
end
|
62
|
-
|
63
|
-
#check past companies
|
64
|
-
contact.past_companies.each do |company|
|
65
|
-
if company[:past_company]
|
66
|
-
if company[:past_company].match(/#{@company}/i)
|
67
|
-
@matched_tag="PAST"
|
68
|
-
return contact
|
69
|
-
end
|
70
|
-
end
|
71
|
-
end if contact.past_companies
|
72
|
-
#
|
73
|
-
#Going in to profile homepage and then checking
|
74
|
-
#
|
75
|
-
sleep(2+rand(4))
|
76
|
-
puts "===>Child:Scrapping linkedin url: "+ contact.linkedin_url
|
77
|
-
profile=contact.get_profile(get_agent.get(contact.linkedin_url),contact.linkedin_url)
|
78
|
-
#check current company
|
79
|
-
profile.current_companies.each do |company|
|
80
|
-
if company[:current_company]
|
81
|
-
if company[:current_company].match(/#{@company}/i)
|
82
|
-
@matched_tag="CURRENT"
|
83
|
-
return profile
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end if profile.current_companies
|
87
|
-
|
88
|
-
#title of profile
|
89
|
-
if profile.title
|
90
|
-
if profile.title.match(/#{@company}/i)
|
91
|
-
@matched_tag="CURRENT"
|
92
|
-
return profile
|
93
|
-
end
|
94
|
-
end
|
95
|
-
#check past companies
|
96
|
-
profile.past_companies.each do |company|
|
97
|
-
if company[:past_company]
|
98
|
-
if company[:past_company].match(/#{@company}/i)
|
99
|
-
@matched_tag="PAST"
|
100
|
-
return profile
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end if profile.past_companies
|
104
|
-
#check recommended visitors
|
105
|
-
if profile.recommended_visitors
|
106
|
-
cnt=0
|
107
|
-
profile.recommended_visitors.each do |visitor|
|
108
|
-
if visitor[:company]
|
109
|
-
if visitor[:company].match(/#{@company}/i)
|
110
|
-
cnt+=1
|
111
|
-
end
|
112
|
-
end
|
113
|
-
end
|
114
|
-
@probability=cnt/profile.recommended_visitors.length.to_f
|
115
|
-
@matched_tag="RECOMMENDED"
|
116
|
-
return profile if @probability>=0.5
|
117
|
-
end
|
118
|
-
|
119
|
-
end unless @contacts.empty?
|
120
|
-
return nil
|
121
|
-
end
|
122
|
-
|
123
|
-
|
124
|
-
end
|
125
|
-
end
|
@@ -1,134 +0,0 @@
|
|
1
|
-
# To change this template, choose Tools | Templates
|
2
|
-
# and open the template in the editor.
|
3
|
-
module Linkedin
|
4
|
-
|
5
|
-
class Contact
|
6
|
-
#the First name of the contact
|
7
|
-
attr_accessor :first_name
|
8
|
-
#the last name of the contact
|
9
|
-
attr_accessor :last_name
|
10
|
-
#the linkedin job title
|
11
|
-
attr_accessor :title
|
12
|
-
#the location of the contact
|
13
|
-
attr_accessor :location
|
14
|
-
#the country of the contact
|
15
|
-
attr_accessor :country
|
16
|
-
#the domain for which the contact belongs
|
17
|
-
attr_accessor :industry
|
18
|
-
#the entire profile of the contact
|
19
|
-
attr_accessor :profile
|
20
|
-
|
21
|
-
#Array of hashes containing the contact's past companies and job titles
|
22
|
-
#Example
|
23
|
-
# [
|
24
|
-
# [0] {
|
25
|
-
# :past_title => "Intern",
|
26
|
-
# :past_company => "Sungard"
|
27
|
-
# },
|
28
|
-
# [1] {
|
29
|
-
# :past_title => "Software Developer",
|
30
|
-
# :past_company => "Microsoft"
|
31
|
-
# }
|
32
|
-
# ]
|
33
|
-
|
34
|
-
attr_accessor :past_companies
|
35
|
-
#Array of hashes containing the contact's current companies and job titles
|
36
|
-
#Example
|
37
|
-
# [
|
38
|
-
# [0] {
|
39
|
-
# :current_title => "Intern",
|
40
|
-
# :current_company => "Sungard"
|
41
|
-
# },
|
42
|
-
# [1] {
|
43
|
-
# :current_title => "Software Developer",
|
44
|
-
# :current_company => "Microsoft"
|
45
|
-
# }
|
46
|
-
# ]
|
47
|
-
attr_accessor :current_companies
|
48
|
-
|
49
|
-
attr_accessor :linkedin_url
|
50
|
-
|
51
|
-
attr_accessor :profile
|
52
|
-
|
53
|
-
def initialize(node=[])
|
54
|
-
unless node.class==Array
|
55
|
-
@first_name=get_first_name(node)
|
56
|
-
@last_name=get_last_name(node)
|
57
|
-
@title=get_title(node)
|
58
|
-
@location=get_location(node)
|
59
|
-
@country=get_country(node)
|
60
|
-
@industry=get_industry(node)
|
61
|
-
@current_companies=get_current_companies node
|
62
|
-
@past_companies=get_past_companies node
|
63
|
-
@linkedin_url=get_linkedin_url node
|
64
|
-
end
|
65
|
-
end
|
66
|
-
#page is a Nokogiri::XML node of the profile page
|
67
|
-
#returns object of Linkedin::Profile
|
68
|
-
def get_profile page,url
|
69
|
-
@profile=Linkedin::Profile.new(page,url)
|
70
|
-
end
|
71
|
-
|
72
|
-
private
|
73
|
-
|
74
|
-
def get_first_name node
|
75
|
-
return node.at(".given-name").text.strip if node.search(".given-name").first
|
76
|
-
end
|
77
|
-
|
78
|
-
def get_last_name node
|
79
|
-
return node.at(".family-name").text.strip if node.search(".family-name").first
|
80
|
-
end
|
81
|
-
|
82
|
-
def get_title node
|
83
|
-
return node.at(".title").text.gsub(/\s+/, " ").strip if node.search(".title").first
|
84
|
-
end
|
85
|
-
|
86
|
-
def get_location node
|
87
|
-
return node.at(".location").text.split(",").first.strip if node.search(".location").first
|
88
|
-
|
89
|
-
end
|
90
|
-
|
91
|
-
def get_country node
|
92
|
-
return node.at(".location").text.split(",").last.strip if node.search(".location").first
|
93
|
-
|
94
|
-
end
|
95
|
-
|
96
|
-
def get_industry node
|
97
|
-
return node.at(".industry").text.strip if node.search(".industry").first
|
98
|
-
end
|
99
|
-
|
100
|
-
def get_linkedin_url node
|
101
|
-
node.at("h2/strong/a").attributes["href"]
|
102
|
-
end
|
103
|
-
|
104
|
-
def get_current_companies node
|
105
|
-
current_cs=[]
|
106
|
-
if node.search(".current-content").first
|
107
|
-
node.at(".current-content").text.split(",").each do |content|
|
108
|
-
title,company=content.split(" at ")
|
109
|
-
company=company.gsub(/\s+/, " ").strip if company
|
110
|
-
title=title.gsub(/\s+/, " ").strip if title
|
111
|
-
current_company={:current_company=>company,:current_title=> title}
|
112
|
-
current_cs<<current_company
|
113
|
-
end
|
114
|
-
return current_cs
|
115
|
-
end
|
116
|
-
end
|
117
|
-
|
118
|
-
def get_past_companies node
|
119
|
-
past_cs=[]
|
120
|
-
if node.search(".past-content").first
|
121
|
-
node.at(".past-content").text.split(",").each do |content|
|
122
|
-
title,company=content.split(" at ")
|
123
|
-
company=company.gsub(/\s+/, " ").strip if company
|
124
|
-
title=title.gsub(/\s+/, " ").strip if title
|
125
|
-
past_company={:past_company=>company,:past_title=> title }
|
126
|
-
past_cs<<past_company
|
127
|
-
end
|
128
|
-
return past_cs
|
129
|
-
end
|
130
|
-
end
|
131
|
-
|
132
|
-
end
|
133
|
-
|
134
|
-
end
|