linkedin-scraper 0.0.3 → 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
data/README.rdoc CHANGED
@@ -71,42 +71,6 @@ Then you can see the scraped data like this:
71
71
  # :company => "Better Labs"
72
72
  # },
73
73
 
74
- = Examples
75
-
76
- When a link is given, it scrapes the profile and gets the data
77
-
78
- attr_accessor :country = "India",
79
- attr_accessor :current_companies = [
80
- [0] {
81
- :current_company => "Better Labs",
82
- :current_title => "Software Engineer Core Platform"
83
- }
84
- ],
85
- attr_accessor :first_name = "Yatish",
86
- attr_accessor :industry = "Information Technology and Services",
87
- attr_accessor :last_name = "Mehta",
88
- attr_accessor :linkedin_url = "http://in.linkedin.com/pub/yatish-mehta/22/460/a86",
89
- attr_accessor :location = "Pune",
90
- attr_accessor :past_companies = [
91
- [0] {
92
- :past_company => "Consumyze Software",
93
- :past_title => "Trainee"
94
- },
95
- [1] {
96
- :past_company => "SunGard Global Services",
97
- :past_title => "Project Intern"
98
- }
99
- ],
100
- attr_accessor :recommended_visitors = [
101
- [0] {
102
- :link => href="http://in.linkedin.com/in/nileshavhad?trk=pub-pbmap",
103
- :name => "Nilesh Avhad",
104
- :title => "Engineering Manager",
105
- :company => "Better Labs"
106
- },
107
- ],
108
- attr_accessor :title = "Software Engineer Core Platform at BetterLabs"
109
-
110
74
 
111
75
  = ZOMG Fork! Thank you!
112
76
 
@@ -1,10 +1,7 @@
1
1
  require "linkedin-scraper/version"
2
2
  require "rubygems"
3
3
  require "mechanize"
4
- require "awesome_print"
4
+ Dir["#{File.expand_path(File.dirname(__FILE__))}/linkedin-scraper/*.rb"].each {|file| require file }
5
5
 
6
- %w(client contact profile).each do |file|
7
- require File.join(File.dirname(__FILE__), 'linkedin-scraper', file)
8
- end
9
6
 
10
7
 
@@ -1,22 +1,11 @@
1
- # To change this template, choose Tools | Templates
2
- # and open the template in the editor.
1
+ USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"]
3
2
  module Linkedin
4
- class Profile
5
- USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"]
3
+ class Profile
6
4
  #the First name of the contact
7
- attr_accessor :first_name
8
- #the last name of the contact
9
- attr_accessor :last_name
10
- #the linkedin job title
11
- attr_accessor :title
12
- #the location of the contact
13
- attr_accessor :location
14
- #the country of the contact
15
- attr_accessor :country
16
- #the industry to which the contact belongs
17
- attr_accessor :industry
18
- #the entire profile of the contact
19
- attr_accessor :profile
5
+ attr_accessor :first_name,:last_name,:title,:location,:country,
6
+ :industry, :linkedin_url,:recommended_visitors,:profile,
7
+ :page
8
+
20
9
 
21
10
  #Array of hashes, one per past position, giving the company and the job title
22
11
  #Example
@@ -46,9 +35,7 @@ module Linkedin
46
35
  # ]
47
36
  attr_accessor :current_companies
48
37
  #url of the profile
49
- attr_accessor :linkedin_url
50
- #Array of hashes containing its recommended visitors (link, name, title, company)
51
- attr_accessor :recommended_visitors
38
+
52
39
 
53
40
  def initialize(page,url)
54
41
  @first_name=get_first_name(page)
@@ -68,8 +55,8 @@ module Linkedin
68
55
  @agent=Mechanize.new
69
56
  @agent.user_agent_alias = USER_AGENTS.sample
70
57
  @agent.max_history = 0
71
- page=@agent.get url
72
- return Linkedin::Profile.new(page, url)
58
+ @page=@agent.get url
59
+ return Linkedin::Profile.new(@page, url)
73
60
  rescue=>e
74
61
  puts e
75
62
  end
@@ -1,5 +1,5 @@
1
1
  module Linkedin
2
2
  module Scraper
3
- VERSION = "0.0.3"
3
+ VERSION = "0.0.5"
4
4
  end
5
5
  end
@@ -7,9 +7,9 @@ Gem::Specification.new do |gem|
7
7
  gem.description = %q{Scrapes the linkedin profile when a url is given }
8
8
  gem.summary = %q{when a url of public linkedin profile page is given it scrapes the entire page and converts into a accessible object}
9
9
  gem.homepage = "https://github.com/yatishmehta27/linkedin-scraper"
10
- gem.add_dependency(%q<httparty>, [">= 0"])
11
- gem.add_dependency(%q<mechanize>, [">= 0"])
12
- gem.add_dependency(%q<awesome_print>, [">= 0"])
10
+
11
+ gem.add_dependency(%q<mechanize>, [">= 0"])
12
+
13
13
  gem.files = `git ls-files`.split($\)
14
14
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
15
15
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
metadata CHANGED
@@ -1,117 +1,71 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: linkedin-scraper
3
- version: !ruby/object:Gem::Version
4
- hash: 25
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.5
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 0
9
- - 3
10
- version: 0.0.3
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Yatish Mehta
14
9
  autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
-
18
- date: 2012-04-12 00:00:00 Z
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
21
- name: httparty
22
- prerelease: false
23
- requirement: &id001 !ruby/object:Gem::Requirement
24
- none: false
25
- requirements:
26
- - - ">="
27
- - !ruby/object:Gem::Version
28
- hash: 3
29
- segments:
30
- - 0
31
- version: "0"
32
- type: :runtime
33
- version_requirements: *id001
34
- - !ruby/object:Gem::Dependency
12
+ date: 2012-07-23 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
35
15
  name: mechanize
36
- prerelease: false
37
- requirement: &id002 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
38
17
  none: false
39
- requirements:
40
- - - ">="
41
- - !ruby/object:Gem::Version
42
- hash: 3
43
- segments:
44
- - 0
45
- version: "0"
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
46
22
  type: :runtime
47
- version_requirements: *id002
48
- - !ruby/object:Gem::Dependency
49
- name: awesome_print
50
23
  prerelease: false
51
- requirement: &id003 !ruby/object:Gem::Requirement
24
+ version_requirements: !ruby/object:Gem::Requirement
52
25
  none: false
53
- requirements:
54
- - - ">="
55
- - !ruby/object:Gem::Version
56
- hash: 3
57
- segments:
58
- - 0
59
- version: "0"
60
- type: :runtime
61
- version_requirements: *id003
62
- description: "Scrapes the linkedin profile when a url is given "
63
- email:
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ description: ! 'Scrapes the linkedin profile when a url is given '
31
+ email:
64
32
  - yatishmehta27@gmail.com
65
33
  executables: []
66
-
67
34
  extensions: []
68
-
69
35
  extra_rdoc_files: []
70
-
71
- files:
36
+ files:
72
37
  - .gitignore
73
38
  - Gemfile
74
39
  - LICENSE
75
40
  - README.rdoc
76
41
  - Rakefile
77
42
  - lib/linkedin-scraper.rb
78
- - lib/linkedin-scraper/client.rb
79
- - lib/linkedin-scraper/contact.rb
80
43
  - lib/linkedin-scraper/profile.rb
81
44
  - lib/linkedin-scraper/version.rb
82
45
  - linkedin-scraper.gemspec
83
46
  homepage: https://github.com/yatishmehta27/linkedin-scraper
84
47
  licenses: []
85
-
86
48
  post_install_message:
87
49
  rdoc_options: []
88
-
89
- require_paths:
50
+ require_paths:
90
51
  - lib
91
- required_ruby_version: !ruby/object:Gem::Requirement
52
+ required_ruby_version: !ruby/object:Gem::Requirement
92
53
  none: false
93
- requirements:
94
- - - ">="
95
- - !ruby/object:Gem::Version
96
- hash: 3
97
- segments:
98
- - 0
99
- version: "0"
100
- required_rubygems_version: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - ! '>='
56
+ - !ruby/object:Gem::Version
57
+ version: '0'
58
+ required_rubygems_version: !ruby/object:Gem::Requirement
101
59
  none: false
102
- requirements:
103
- - - ">="
104
- - !ruby/object:Gem::Version
105
- hash: 3
106
- segments:
107
- - 0
108
- version: "0"
60
+ requirements:
61
+ - - ! '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
109
64
  requirements: []
110
-
111
65
  rubyforge_project:
112
- rubygems_version: 1.8.10
66
+ rubygems_version: 1.8.24
113
67
  signing_key:
114
68
  specification_version: 3
115
- summary: when a url of public linkedin profile page is given it scrapes the entire page and converts into a accessible object
69
+ summary: when a url of public linkedin profile page is given it scrapes the entire
70
+ page and converts into a accessible object
116
71
  test_files: []
117
-
@@ -1,125 +0,0 @@
1
- # To change this template, choose Tools | Templates
2
- # and open the template in the editor.
3
-
4
-
5
- module Linkedin
6
- class Client
7
- USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"]
8
- attr_accessor :contacts ,:matched_tag,:probability
9
-
10
- def initialize(first_name,last_name ,company,options={})
11
- @first_name=first_name.downcase
12
- @last_name=last_name.downcase
13
- @company=company
14
- @country=options[:country] || "us"
15
- @search_linkedin_url="http://#{@country}.linkedin.com/pub/dir/#{@first_name}/#{@last_name}"
16
- @contacts=[]
17
- @links=[]
18
- get_agent
19
- end
20
-
21
- def get_agent
22
- @agent=Mechanize.new
23
- @agent.user_agent_alias = USER_AGENTS.sample
24
- @agent.max_history = 0
25
- @agent
26
- end
27
-
28
- def get_contacts
29
- begin
30
- sleep(2+rand(4))
31
- puts "===>Father:Scrapping linkedin url "+ @search_linkedin_url
32
- @page=@agent.get @search_linkedin_url
33
- @page.search(".vcard").each do |node|
34
- @contacts<<Linkedin::Contact.new(node)
35
- end
36
- rescue Mechanize::ResponseCodeError=>e
37
- puts "RESCUE"
38
- end
39
- return @contacts
40
- end
41
-
42
-
43
- #TODO: refactor this function; each case needs a separate function
44
- def get_verified_contact
45
- get_contacts
46
- @contacts.each do |contact|
47
- #check current company
48
- contact.current_companies.each do |company|
49
- if company[:current_company]
50
- if company[:current_company].match(/#{@company}/i)
51
- @matched_tag="CURRENT"
52
- return contact
53
- end
54
- end
55
- end if contact.current_companies
56
-
57
- #title of profile
58
- if contact.title.match(/#{@company}/i)
59
- @matched_tag="CURRENT"
60
- return contact
61
- end
62
-
63
- #check past companies
64
- contact.past_companies.each do |company|
65
- if company[:past_company]
66
- if company[:past_company].match(/#{@company}/i)
67
- @matched_tag="PAST"
68
- return contact
69
- end
70
- end
71
- end if contact.past_companies
72
- #
73
- #Going in to profile homepage and then checking
74
- #
75
- sleep(2+rand(4))
76
- puts "===>Child:Scrapping linkedin url: "+ contact.linkedin_url
77
- profile=contact.get_profile(get_agent.get(contact.linkedin_url),contact.linkedin_url)
78
- #check current company
79
- profile.current_companies.each do |company|
80
- if company[:current_company]
81
- if company[:current_company].match(/#{@company}/i)
82
- @matched_tag="CURRENT"
83
- return profile
84
- end
85
- end
86
- end if profile.current_companies
87
-
88
- #title of profile
89
- if profile.title
90
- if profile.title.match(/#{@company}/i)
91
- @matched_tag="CURRENT"
92
- return profile
93
- end
94
- end
95
- #check past companies
96
- profile.past_companies.each do |company|
97
- if company[:past_company]
98
- if company[:past_company].match(/#{@company}/i)
99
- @matched_tag="PAST"
100
- return profile
101
- end
102
- end
103
- end if profile.past_companies
104
- #check recommended visitors
105
- if profile.recommended_visitors
106
- cnt=0
107
- profile.recommended_visitors.each do |visitor|
108
- if visitor[:company]
109
- if visitor[:company].match(/#{@company}/i)
110
- cnt+=1
111
- end
112
- end
113
- end
114
- @probability=cnt/profile.recommended_visitors.length.to_f
115
- @matched_tag="RECOMMENDED"
116
- return profile if @probability>=0.5
117
- end
118
-
119
- end unless @contacts.empty?
120
- return nil
121
- end
122
-
123
-
124
- end
125
- end
@@ -1,134 +0,0 @@
1
- # To change this template, choose Tools | Templates
2
- # and open the template in the editor.
3
- module Linkedin
4
-
5
- class Contact
6
- #the First name of the contact
7
- attr_accessor :first_name
8
- #the last name of the contact
9
- attr_accessor :last_name
10
- #the linkedin job title
11
- attr_accessor :title
12
- #the location of the contact
13
- attr_accessor :location
14
- #the country of the contact
15
- attr_accessor :country
16
- #the industry to which the contact belongs
17
- attr_accessor :industry
18
- #the entire profile of the contact
19
- attr_accessor :profile
20
-
21
- #Array of hash containing its past job companies and job profile
22
- #Example
23
- # [
24
- # [0] {
25
- # :past_title => "Intern",
26
- # :past_company => "Sungard"
27
- # },
28
- # [1] {
29
- # :past_title => "Software Developer",
30
- # :past_company => "Microsoft"
31
- # }
32
- # ]
33
-
34
- attr_accessor :past_companies
35
- #Array of hash containing its current job companies and job profile
36
- #Example
37
- # [
38
- # [0] {
39
- # :current_title => "Intern",
40
- # :current_company => "Sungard"
41
- # },
42
- # [1] {
43
- # :current_title => "Software Developer",
44
- # :current_company => "Microsoft"
45
- # }
46
- # ]
47
- attr_accessor :current_companies
48
-
49
- attr_accessor :linkedin_url
50
-
51
- attr_accessor :profile
52
-
53
- def initialize(node=[])
54
- unless node.class==Array
55
- @first_name=get_first_name(node)
56
- @last_name=get_last_name(node)
57
- @title=get_title(node)
58
- @location=get_location(node)
59
- @country=get_country(node)
60
- @industry=get_industry(node)
61
- @current_companies=get_current_companies node
62
- @past_companies=get_past_companies node
63
- @linkedin_url=get_linkedin_url node
64
- end
65
- end
66
- #page is a Nokogiri::XML node of the profile page
67
- #returns object of Linkedin::Profile
68
- def get_profile page,url
69
- @profile=Linkedin::Profile.new(page,url)
70
- end
71
-
72
- private
73
-
74
- def get_first_name node
75
- return node.at(".given-name").text.strip if node.search(".given-name").first
76
- end
77
-
78
- def get_last_name node
79
- return node.at(".family-name").text.strip if node.search(".family-name").first
80
- end
81
-
82
- def get_title node
83
- return node.at(".title").text.gsub(/\s+/, " ").strip if node.search(".title").first
84
- end
85
-
86
- def get_location node
87
- return node.at(".location").text.split(",").first.strip if node.search(".location").first
88
-
89
- end
90
-
91
- def get_country node
92
- return node.at(".location").text.split(",").last.strip if node.search(".location").first
93
-
94
- end
95
-
96
- def get_industry node
97
- return node.at(".industry").text.strip if node.search(".industry").first
98
- end
99
-
100
- def get_linkedin_url node
101
- node.at("h2/strong/a").attributes["href"]
102
- end
103
-
104
- def get_current_companies node
105
- current_cs=[]
106
- if node.search(".current-content").first
107
- node.at(".current-content").text.split(",").each do |content|
108
- title,company=content.split(" at ")
109
- company=company.gsub(/\s+/, " ").strip if company
110
- title=title.gsub(/\s+/, " ").strip if title
111
- current_company={:current_company=>company,:current_title=> title}
112
- current_cs<<current_company
113
- end
114
- return current_cs
115
- end
116
- end
117
-
118
- def get_past_companies node
119
- past_cs=[]
120
- if node.search(".past-content").first
121
- node.at(".past-content").text.split(",").each do |content|
122
- title,company=content.split(" at ")
123
- company=company.gsub(/\s+/, " ").strip if company
124
- title=title.gsub(/\s+/, " ").strip if title
125
- past_company={:past_company=>company,:past_title=> title }
126
- past_cs<<past_company
127
- end
128
- return past_cs
129
- end
130
- end
131
-
132
- end
133
-
134
- end