linkedin-scraper 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.rdoc +0 -36
- data/lib/linkedin-scraper.rb +1 -4
- data/lib/linkedin-scraper/profile.rb +9 -22
- data/lib/linkedin-scraper/version.rb +1 -1
- data/linkedin-scraper.gemspec +3 -3
- metadata +34 -80
- data/lib/linkedin-scraper/client.rb +0 -125
- data/lib/linkedin-scraper/contact.rb +0 -134
data/README.rdoc
CHANGED
@@ -71,42 +71,6 @@ Then you can see the scraped data like this:
|
|
71
71
|
# :company => "Better Labs"
|
72
72
|
# },
|
73
73
|
|
74
|
-
= Examples
|
75
|
-
|
76
|
-
When a link is given, it scrapes the profile and gets the data
|
77
|
-
|
78
|
-
attr_accessor :country = "India",
|
79
|
-
attr_accessor :current_companies = [
|
80
|
-
[0] {
|
81
|
-
:current_company => "Better Labs",
|
82
|
-
:current_title => "Software Engineer Core Platform"
|
83
|
-
}
|
84
|
-
],
|
85
|
-
attr_accessor :first_name = "Yatish",
|
86
|
-
attr_accessor :industry = "Information Technology and Services",
|
87
|
-
attr_accessor :last_name = "Mehta",
|
88
|
-
attr_accessor :linkedin_url = "http://in.linkedin.com/pub/yatish-mehta/22/460/a86",
|
89
|
-
attr_accessor :location = "Pune",
|
90
|
-
attr_accessor :past_companies = [
|
91
|
-
[0] {
|
92
|
-
:past_company => "Consumyze Software",
|
93
|
-
:past_title => "Trainee"
|
94
|
-
},
|
95
|
-
[1] {
|
96
|
-
:past_company => "SunGard Global Services",
|
97
|
-
:past_title => "Project Intern"
|
98
|
-
}
|
99
|
-
],
|
100
|
-
attr_accessor :recommended_visitors = [
|
101
|
-
[0] {
|
102
|
-
:link => href="http://in.linkedin.com/in/nileshavhad?trk=pub-pbmap",
|
103
|
-
:name => "Nilesh Avhad",
|
104
|
-
:title => "Engineering Manager",
|
105
|
-
:company => "Better Labs"
|
106
|
-
},
|
107
|
-
],
|
108
|
-
attr_accessor :title = "Software Engineer Core Platform at BetterLabs"
|
109
|
-
|
110
74
|
|
111
75
|
= ZOMG Fork! Thank you!
|
112
76
|
|
data/lib/linkedin-scraper.rb
CHANGED
@@ -1,10 +1,7 @@
|
|
1
1
|
require "linkedin-scraper/version"
|
2
2
|
require "rubygems"
|
3
3
|
require "mechanize"
|
4
|
-
require
|
4
|
+
Dir["#{File.expand_path(File.dirname(__FILE__))}/linkedin-scraper/*.rb"].each {|file| require file }
|
5
5
|
|
6
|
-
%w(client contact profile).each do |file|
|
7
|
-
require File.join(File.dirname(__FILE__), 'linkedin-scraper', file)
|
8
|
-
end
|
9
6
|
|
10
7
|
|
@@ -1,22 +1,11 @@
|
|
1
|
-
|
2
|
-
# and open the template in the editor.
|
1
|
+
USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"]
|
3
2
|
module Linkedin
|
4
|
-
class Profile
|
5
|
-
USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"]
|
3
|
+
class Profile
|
6
4
|
#the First name of the contact
|
7
|
-
attr_accessor :first_name
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
attr_accessor :title
|
12
|
-
#the location of the contact
|
13
|
-
attr_accessor :location
|
14
|
-
#the country of the contact
|
15
|
-
attr_accessor :country
|
16
|
-
#the domain for which the contact belongs
|
17
|
-
attr_accessor :industry
|
18
|
-
#the entire profile of the contact
|
19
|
-
attr_accessor :profile
|
5
|
+
attr_accessor :first_name,:last_name,:title,:location,:country,
|
6
|
+
:industry, :linkedin_url,:recommended_visitors,:profile,
|
7
|
+
:page
|
8
|
+
|
20
9
|
|
21
10
|
#Array of hash containing its past job companies and job profile
|
22
11
|
#Example
|
@@ -46,9 +35,7 @@ module Linkedin
|
|
46
35
|
# ]
|
47
36
|
attr_accessor :current_companies
|
48
37
|
#url of the profile
|
49
|
-
|
50
|
-
#Array of hash containing its recommended visitors which come on the
|
51
|
-
attr_accessor :recommended_visitors
|
38
|
+
|
52
39
|
|
53
40
|
def initialize(page,url)
|
54
41
|
@first_name=get_first_name(page)
|
@@ -68,8 +55,8 @@ module Linkedin
|
|
68
55
|
@agent=Mechanize.new
|
69
56
|
@agent.user_agent_alias = USER_AGENTS.sample
|
70
57
|
@agent.max_history = 0
|
71
|
-
page=@agent.get url
|
72
|
-
return Linkedin::Profile.new(page, url)
|
58
|
+
@page=@agent.get url
|
59
|
+
return Linkedin::Profile.new(@page, url)
|
73
60
|
rescue=>e
|
74
61
|
puts e
|
75
62
|
end
|
data/linkedin-scraper.gemspec
CHANGED
@@ -7,9 +7,9 @@ Gem::Specification.new do |gem|
|
|
7
7
|
gem.description = %q{Scrapes the linkedin profile when a url is given }
|
8
8
|
gem.summary = %q{when a url of public linkedin profile page is given it scrapes the entire page and converts into a accessible object}
|
9
9
|
gem.homepage = "https://github.com/yatishmehta27/linkedin-scraper"
|
10
|
-
|
11
|
-
gem.add_dependency(%q<mechanize>, [">= 0"])
|
12
|
-
|
10
|
+
|
11
|
+
gem.add_dependency(%q<mechanize>, [">= 0"])
|
12
|
+
|
13
13
|
gem.files = `git ls-files`.split($\)
|
14
14
|
gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
|
15
15
|
gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
|
metadata
CHANGED
@@ -1,117 +1,71 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: linkedin-scraper
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.5
|
5
5
|
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 0
|
9
|
-
- 3
|
10
|
-
version: 0.0.3
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Yatish Mehta
|
14
9
|
autorequire:
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
- !ruby/object:Gem::Dependency
|
21
|
-
name: httparty
|
22
|
-
prerelease: false
|
23
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
24
|
-
none: false
|
25
|
-
requirements:
|
26
|
-
- - ">="
|
27
|
-
- !ruby/object:Gem::Version
|
28
|
-
hash: 3
|
29
|
-
segments:
|
30
|
-
- 0
|
31
|
-
version: "0"
|
32
|
-
type: :runtime
|
33
|
-
version_requirements: *id001
|
34
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2012-07-23 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
35
15
|
name: mechanize
|
36
|
-
|
37
|
-
requirement: &id002 !ruby/object:Gem::Requirement
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
38
17
|
none: false
|
39
|
-
requirements:
|
40
|
-
- -
|
41
|
-
- !ruby/object:Gem::Version
|
42
|
-
|
43
|
-
segments:
|
44
|
-
- 0
|
45
|
-
version: "0"
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
46
22
|
type: :runtime
|
47
|
-
version_requirements: *id002
|
48
|
-
- !ruby/object:Gem::Dependency
|
49
|
-
name: awesome_print
|
50
23
|
prerelease: false
|
51
|
-
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
52
25
|
none: false
|
53
|
-
requirements:
|
54
|
-
- -
|
55
|
-
- !ruby/object:Gem::Version
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
version: "0"
|
60
|
-
type: :runtime
|
61
|
-
version_requirements: *id003
|
62
|
-
description: "Scrapes the linkedin profile when a url is given "
|
63
|
-
email:
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
description: ! 'Scrapes the linkedin profile when a url is given '
|
31
|
+
email:
|
64
32
|
- yatishmehta27@gmail.com
|
65
33
|
executables: []
|
66
|
-
|
67
34
|
extensions: []
|
68
|
-
|
69
35
|
extra_rdoc_files: []
|
70
|
-
|
71
|
-
files:
|
36
|
+
files:
|
72
37
|
- .gitignore
|
73
38
|
- Gemfile
|
74
39
|
- LICENSE
|
75
40
|
- README.rdoc
|
76
41
|
- Rakefile
|
77
42
|
- lib/linkedin-scraper.rb
|
78
|
-
- lib/linkedin-scraper/client.rb
|
79
|
-
- lib/linkedin-scraper/contact.rb
|
80
43
|
- lib/linkedin-scraper/profile.rb
|
81
44
|
- lib/linkedin-scraper/version.rb
|
82
45
|
- linkedin-scraper.gemspec
|
83
46
|
homepage: https://github.com/yatishmehta27/linkedin-scraper
|
84
47
|
licenses: []
|
85
|
-
|
86
48
|
post_install_message:
|
87
49
|
rdoc_options: []
|
88
|
-
|
89
|
-
require_paths:
|
50
|
+
require_paths:
|
90
51
|
- lib
|
91
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
52
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
92
53
|
none: false
|
93
|
-
requirements:
|
94
|
-
- -
|
95
|
-
- !ruby/object:Gem::Version
|
96
|
-
|
97
|
-
|
98
|
-
- 0
|
99
|
-
version: "0"
|
100
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
54
|
+
requirements:
|
55
|
+
- - ! '>='
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '0'
|
58
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
101
59
|
none: false
|
102
|
-
requirements:
|
103
|
-
- -
|
104
|
-
- !ruby/object:Gem::Version
|
105
|
-
|
106
|
-
segments:
|
107
|
-
- 0
|
108
|
-
version: "0"
|
60
|
+
requirements:
|
61
|
+
- - ! '>='
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
109
64
|
requirements: []
|
110
|
-
|
111
65
|
rubyforge_project:
|
112
|
-
rubygems_version: 1.8.
|
66
|
+
rubygems_version: 1.8.24
|
113
67
|
signing_key:
|
114
68
|
specification_version: 3
|
115
|
-
summary: when a url of public linkedin profile page is given it scrapes the entire
|
69
|
+
summary: when a url of public linkedin profile page is given it scrapes the entire
|
70
|
+
page and converts into a accessible object
|
116
71
|
test_files: []
|
117
|
-
|
@@ -1,125 +0,0 @@
|
|
1
|
-
# To change this template, choose Tools | Templates
|
2
|
-
# and open the template in the editor.
|
3
|
-
|
4
|
-
|
5
|
-
module Linkedin
|
6
|
-
class Client
|
7
|
-
USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"]
|
8
|
-
attr_accessor :contacts ,:matched_tag,:probability
|
9
|
-
|
10
|
-
def initialize(first_name,last_name ,company,options={})
|
11
|
-
@first_name=first_name.downcase
|
12
|
-
@last_name=last_name.downcase
|
13
|
-
@company=company
|
14
|
-
@country=options[:country] || "us"
|
15
|
-
@search_linkedin_url="http://#{@country}.linkedin.com/pub/dir/#{@first_name}/#{@last_name}"
|
16
|
-
@contacts=[]
|
17
|
-
@links=[]
|
18
|
-
get_agent
|
19
|
-
end
|
20
|
-
|
21
|
-
def get_agent
|
22
|
-
@agent=Mechanize.new
|
23
|
-
@agent.user_agent_alias = USER_AGENTS.sample
|
24
|
-
@agent.max_history = 0
|
25
|
-
@agent
|
26
|
-
end
|
27
|
-
|
28
|
-
def get_contacts
|
29
|
-
begin
|
30
|
-
sleep(2+rand(4))
|
31
|
-
puts "===>Father:Scrapping linkedin url "+ @search_linkedin_url
|
32
|
-
@page=@agent.get @search_linkedin_url
|
33
|
-
@page.search(".vcard").each do |node|
|
34
|
-
@contacts<<Linkedin::Contact.new(node)
|
35
|
-
end
|
36
|
-
rescue Mechanize::ResponseCodeError=>e
|
37
|
-
puts "RESCUE"
|
38
|
-
end
|
39
|
-
return @contacts
|
40
|
-
end
|
41
|
-
|
42
|
-
|
43
|
-
#TODO need to refactor this function need seperate function of each case
|
44
|
-
def get_verified_contact
|
45
|
-
get_contacts
|
46
|
-
@contacts.each do |contact|
|
47
|
-
#check current company
|
48
|
-
contact.current_companies.each do |company|
|
49
|
-
if company[:current_company]
|
50
|
-
if company[:current_company].match(/#{@company}/i)
|
51
|
-
@matched_tag="CURRENT"
|
52
|
-
return contact
|
53
|
-
end
|
54
|
-
end
|
55
|
-
end if contact.current_companies
|
56
|
-
|
57
|
-
#title of profile
|
58
|
-
if contact.title.match(/#{@company}/i)
|
59
|
-
@matched_tag="CURRENT"
|
60
|
-
return contact
|
61
|
-
end
|
62
|
-
|
63
|
-
#check past companies
|
64
|
-
contact.past_companies.each do |company|
|
65
|
-
if company[:past_company]
|
66
|
-
if company[:past_company].match(/#{@company}/i)
|
67
|
-
@matched_tag="PAST"
|
68
|
-
return contact
|
69
|
-
end
|
70
|
-
end
|
71
|
-
end if contact.past_companies
|
72
|
-
#
|
73
|
-
#Going in to profile homepage and then checking
|
74
|
-
#
|
75
|
-
sleep(2+rand(4))
|
76
|
-
puts "===>Child:Scrapping linkedin url: "+ contact.linkedin_url
|
77
|
-
profile=contact.get_profile(get_agent.get(contact.linkedin_url),contact.linkedin_url)
|
78
|
-
#check current company
|
79
|
-
profile.current_companies.each do |company|
|
80
|
-
if company[:current_company]
|
81
|
-
if company[:current_company].match(/#{@company}/i)
|
82
|
-
@matched_tag="CURRENT"
|
83
|
-
return profile
|
84
|
-
end
|
85
|
-
end
|
86
|
-
end if profile.current_companies
|
87
|
-
|
88
|
-
#title of profile
|
89
|
-
if profile.title
|
90
|
-
if profile.title.match(/#{@company}/i)
|
91
|
-
@matched_tag="CURRENT"
|
92
|
-
return profile
|
93
|
-
end
|
94
|
-
end
|
95
|
-
#check past companies
|
96
|
-
profile.past_companies.each do |company|
|
97
|
-
if company[:past_company]
|
98
|
-
if company[:past_company].match(/#{@company}/i)
|
99
|
-
@matched_tag="PAST"
|
100
|
-
return profile
|
101
|
-
end
|
102
|
-
end
|
103
|
-
end if profile.past_companies
|
104
|
-
#check recommended visitors
|
105
|
-
if profile.recommended_visitors
|
106
|
-
cnt=0
|
107
|
-
profile.recommended_visitors.each do |visitor|
|
108
|
-
if visitor[:company]
|
109
|
-
if visitor[:company].match(/#{@company}/i)
|
110
|
-
cnt+=1
|
111
|
-
end
|
112
|
-
end
|
113
|
-
end
|
114
|
-
@probability=cnt/profile.recommended_visitors.length.to_f
|
115
|
-
@matched_tag="RECOMMENDED"
|
116
|
-
return profile if @probability>=0.5
|
117
|
-
end
|
118
|
-
|
119
|
-
end unless @contacts.empty?
|
120
|
-
return nil
|
121
|
-
end
|
122
|
-
|
123
|
-
|
124
|
-
end
|
125
|
-
end
|
@@ -1,134 +0,0 @@
|
|
1
|
-
# To change this template, choose Tools | Templates
|
2
|
-
# and open the template in the editor.
|
3
|
-
module Linkedin
|
4
|
-
|
5
|
-
class Contact
|
6
|
-
#the First name of the contact
|
7
|
-
attr_accessor :first_name
|
8
|
-
#the last name of the contact
|
9
|
-
attr_accessor :last_name
|
10
|
-
#the linkedin job title
|
11
|
-
attr_accessor :title
|
12
|
-
#the location of the contact
|
13
|
-
attr_accessor :location
|
14
|
-
#the country of the contact
|
15
|
-
attr_accessor :country
|
16
|
-
#the domain for which the contact belongs
|
17
|
-
attr_accessor :industry
|
18
|
-
#the entire profile of the contact
|
19
|
-
attr_accessor :profile
|
20
|
-
|
21
|
-
#Array of hash containing its past job companies and job profile
|
22
|
-
#Example
|
23
|
-
# [
|
24
|
-
# [0] {
|
25
|
-
# :past_title => "Intern",
|
26
|
-
# :past_company => "Sungard"
|
27
|
-
# },
|
28
|
-
# [1] {
|
29
|
-
# :past_title => "Software Developer",
|
30
|
-
# :past_company => "Microsoft"
|
31
|
-
# }
|
32
|
-
# ]
|
33
|
-
|
34
|
-
attr_accessor :past_companies
|
35
|
-
#Array of hash containing its current job companies and job profile
|
36
|
-
#Example
|
37
|
-
# [
|
38
|
-
# [0] {
|
39
|
-
# :current_title => "Intern",
|
40
|
-
# :current_company => "Sungard"
|
41
|
-
# },
|
42
|
-
# [1] {
|
43
|
-
# :current_title => "Software Developer",
|
44
|
-
# :current_company => "Microsoft"
|
45
|
-
# }
|
46
|
-
# ]
|
47
|
-
attr_accessor :current_companies
|
48
|
-
|
49
|
-
attr_accessor :linkedin_url
|
50
|
-
|
51
|
-
attr_accessor :profile
|
52
|
-
|
53
|
-
def initialize(node=[])
|
54
|
-
unless node.class==Array
|
55
|
-
@first_name=get_first_name(node)
|
56
|
-
@last_name=get_last_name(node)
|
57
|
-
@title=get_title(node)
|
58
|
-
@location=get_location(node)
|
59
|
-
@country=get_country(node)
|
60
|
-
@industry=get_industry(node)
|
61
|
-
@current_companies=get_current_companies node
|
62
|
-
@past_companies=get_past_companies node
|
63
|
-
@linkedin_url=get_linkedin_url node
|
64
|
-
end
|
65
|
-
end
|
66
|
-
#page is a Nokogiri::XML node of the profile page
|
67
|
-
#returns object of Linkedin::Profile
|
68
|
-
def get_profile page,url
|
69
|
-
@profile=Linkedin::Profile.new(page,url)
|
70
|
-
end
|
71
|
-
|
72
|
-
private
|
73
|
-
|
74
|
-
def get_first_name node
|
75
|
-
return node.at(".given-name").text.strip if node.search(".given-name").first
|
76
|
-
end
|
77
|
-
|
78
|
-
def get_last_name node
|
79
|
-
return node.at(".family-name").text.strip if node.search(".family-name").first
|
80
|
-
end
|
81
|
-
|
82
|
-
def get_title node
|
83
|
-
return node.at(".title").text.gsub(/\s+/, " ").strip if node.search(".title").first
|
84
|
-
end
|
85
|
-
|
86
|
-
def get_location node
|
87
|
-
return node.at(".location").text.split(",").first.strip if node.search(".location").first
|
88
|
-
|
89
|
-
end
|
90
|
-
|
91
|
-
def get_country node
|
92
|
-
return node.at(".location").text.split(",").last.strip if node.search(".location").first
|
93
|
-
|
94
|
-
end
|
95
|
-
|
96
|
-
def get_industry node
|
97
|
-
return node.at(".industry").text.strip if node.search(".industry").first
|
98
|
-
end
|
99
|
-
|
100
|
-
def get_linkedin_url node
|
101
|
-
node.at("h2/strong/a").attributes["href"]
|
102
|
-
end
|
103
|
-
|
104
|
-
def get_current_companies node
|
105
|
-
current_cs=[]
|
106
|
-
if node.search(".current-content").first
|
107
|
-
node.at(".current-content").text.split(",").each do |content|
|
108
|
-
title,company=content.split(" at ")
|
109
|
-
company=company.gsub(/\s+/, " ").strip if company
|
110
|
-
title=title.gsub(/\s+/, " ").strip if title
|
111
|
-
current_company={:current_company=>company,:current_title=> title}
|
112
|
-
current_cs<<current_company
|
113
|
-
end
|
114
|
-
return current_cs
|
115
|
-
end
|
116
|
-
end
|
117
|
-
|
118
|
-
def get_past_companies node
|
119
|
-
past_cs=[]
|
120
|
-
if node.search(".past-content").first
|
121
|
-
node.at(".past-content").text.split(",").each do |content|
|
122
|
-
title,company=content.split(" at ")
|
123
|
-
company=company.gsub(/\s+/, " ").strip if company
|
124
|
-
title=title.gsub(/\s+/, " ").strip if title
|
125
|
-
past_company={:past_company=>company,:past_title=> title }
|
126
|
-
past_cs<<past_company
|
127
|
-
end
|
128
|
-
return past_cs
|
129
|
-
end
|
130
|
-
end
|
131
|
-
|
132
|
-
end
|
133
|
-
|
134
|
-
end
|