linkedin-scraper 0.0.3 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.rdoc CHANGED
@@ -71,42 +71,6 @@ Then you can see the scraped data like this:
71
71
  # :company => "Better Labs"
72
72
  # },
73
73
 
74
- = Examples
75
-
76
- When a link is given, it scrapes the profile and gets the data
77
-
78
- attr_accessor :country = "India",
79
- attr_accessor :current_companies = [
80
- [0] {
81
- :current_company => "Better Labs",
82
- :current_title => "Software Engineer Core Platform"
83
- }
84
- ],
85
- attr_accessor :first_name = "Yatish",
86
- attr_accessor :industry = "Information Technology and Services",
87
- attr_accessor :last_name = "Mehta",
88
- attr_accessor :linkedin_url = "http://in.linkedin.com/pub/yatish-mehta/22/460/a86",
89
- attr_accessor :location = "Pune",
90
- attr_accessor :past_companies = [
91
- [0] {
92
- :past_company => "Consumyze Software",
93
- :past_title => "Trainee"
94
- },
95
- [1] {
96
- :past_company => "SunGard Global Services",
97
- :past_title => "Project Intern"
98
- }
99
- ],
100
- attr_accessor :recommended_visitors = [
101
- [0] {
102
- :link => href="http://in.linkedin.com/in/nileshavhad?trk=pub-pbmap",
103
- :name => "Nilesh Avhad",
104
- :title => "Engineering Manager",
105
- :company => "Better Labs"
106
- },
107
- ],
108
- attr_accessor :title = "Software Engineer Core Platform at BetterLabs"
109
-
110
74
 
111
75
  = ZOMG Fork! Thank you!
112
76
 
@@ -1,10 +1,7 @@
1
1
  require "linkedin-scraper/version"
2
2
  require "rubygems"
3
3
  require "mechanize"
4
- require "awesome_print"
4
+ Dir["#{File.expand_path(File.dirname(__FILE__))}/linkedin-scraper/*.rb"].each {|file| require file }
5
5
 
6
- %w(client contact profile).each do |file|
7
- require File.join(File.dirname(__FILE__), 'linkedin-scraper', file)
8
- end
9
6
 
10
7
 
@@ -1,22 +1,11 @@
1
- # To change this template, choose Tools | Templates
2
- # and open the template in the editor.
1
+ USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"]
3
2
  module Linkedin
4
- class Profile
5
- USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"]
3
+ class Profile
6
4
  #the First name of the contact
7
- attr_accessor :first_name
8
- #the last name of the contact
9
- attr_accessor :last_name
10
- #the linkedin job title
11
- attr_accessor :title
12
- #the location of the contact
13
- attr_accessor :location
14
- #the country of the contact
15
- attr_accessor :country
16
- #the domain for which the contact belongs
17
- attr_accessor :industry
18
- #the entire profile of the contact
19
- attr_accessor :profile
5
+ attr_accessor :first_name,:last_name,:title,:location,:country,
6
+ :industry, :linkedin_url,:recommended_visitors,:profile,
7
+ :page
8
+
20
9
 
21
10
  #Array of hash containing its past job companies and job profile
22
11
  #Example
@@ -46,9 +35,7 @@ module Linkedin
46
35
  # ]
47
36
  attr_accessor :current_companies
48
37
  #url of the profile
49
- attr_accessor :linkedin_url
50
- #Array of hash containing its recommended visitors which come on the
51
- attr_accessor :recommended_visitors
38
+
52
39
 
53
40
  def initialize(page,url)
54
41
  @first_name=get_first_name(page)
@@ -68,8 +55,8 @@ module Linkedin
68
55
  @agent=Mechanize.new
69
56
  @agent.user_agent_alias = USER_AGENTS.sample
70
57
  @agent.max_history = 0
71
- page=@agent.get url
72
- return Linkedin::Profile.new(page, url)
58
+ @page=@agent.get url
59
+ return Linkedin::Profile.new(@page, url)
73
60
  rescue=>e
74
61
  puts e
75
62
  end
@@ -1,5 +1,5 @@
1
1
  module Linkedin
2
2
  module Scraper
3
- VERSION = "0.0.3"
3
+ VERSION = "0.0.5"
4
4
  end
5
5
  end
@@ -7,9 +7,9 @@ Gem::Specification.new do |gem|
7
7
  gem.description = %q{Scrapes the linkedin profile when a url is given }
8
8
  gem.summary = %q{when a url of public linkedin profile page is given it scrapes the entire page and converts into a accessible object}
9
9
  gem.homepage = "https://github.com/yatishmehta27/linkedin-scraper"
10
- gem.add_dependency(%q<httparty>, [">= 0"])
11
- gem.add_dependency(%q<mechanize>, [">= 0"])
12
- gem.add_dependency(%q<awesome_print>, [">= 0"])
10
+
11
+ gem.add_dependency(%q<mechanize>, [">= 0"])
12
+
13
13
  gem.files = `git ls-files`.split($\)
14
14
  gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
15
15
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
metadata CHANGED
@@ -1,117 +1,71 @@
1
- --- !ruby/object:Gem::Specification
1
+ --- !ruby/object:Gem::Specification
2
2
  name: linkedin-scraper
3
- version: !ruby/object:Gem::Version
4
- hash: 25
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.5
5
5
  prerelease:
6
- segments:
7
- - 0
8
- - 0
9
- - 3
10
- version: 0.0.3
11
6
  platform: ruby
12
- authors:
7
+ authors:
13
8
  - Yatish Mehta
14
9
  autorequire:
15
10
  bindir: bin
16
11
  cert_chain: []
17
-
18
- date: 2012-04-12 00:00:00 Z
19
- dependencies:
20
- - !ruby/object:Gem::Dependency
21
- name: httparty
22
- prerelease: false
23
- requirement: &id001 !ruby/object:Gem::Requirement
24
- none: false
25
- requirements:
26
- - - ">="
27
- - !ruby/object:Gem::Version
28
- hash: 3
29
- segments:
30
- - 0
31
- version: "0"
32
- type: :runtime
33
- version_requirements: *id001
34
- - !ruby/object:Gem::Dependency
12
+ date: 2012-07-23 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
35
15
  name: mechanize
36
- prerelease: false
37
- requirement: &id002 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
38
17
  none: false
39
- requirements:
40
- - - ">="
41
- - !ruby/object:Gem::Version
42
- hash: 3
43
- segments:
44
- - 0
45
- version: "0"
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
46
22
  type: :runtime
47
- version_requirements: *id002
48
- - !ruby/object:Gem::Dependency
49
- name: awesome_print
50
23
  prerelease: false
51
- requirement: &id003 !ruby/object:Gem::Requirement
24
+ version_requirements: !ruby/object:Gem::Requirement
52
25
  none: false
53
- requirements:
54
- - - ">="
55
- - !ruby/object:Gem::Version
56
- hash: 3
57
- segments:
58
- - 0
59
- version: "0"
60
- type: :runtime
61
- version_requirements: *id003
62
- description: "Scrapes the linkedin profile when a url is given "
63
- email:
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ description: ! 'Scrapes the linkedin profile when a url is given '
31
+ email:
64
32
  - yatishmehta27@gmail.com
65
33
  executables: []
66
-
67
34
  extensions: []
68
-
69
35
  extra_rdoc_files: []
70
-
71
- files:
36
+ files:
72
37
  - .gitignore
73
38
  - Gemfile
74
39
  - LICENSE
75
40
  - README.rdoc
76
41
  - Rakefile
77
42
  - lib/linkedin-scraper.rb
78
- - lib/linkedin-scraper/client.rb
79
- - lib/linkedin-scraper/contact.rb
80
43
  - lib/linkedin-scraper/profile.rb
81
44
  - lib/linkedin-scraper/version.rb
82
45
  - linkedin-scraper.gemspec
83
46
  homepage: https://github.com/yatishmehta27/linkedin-scraper
84
47
  licenses: []
85
-
86
48
  post_install_message:
87
49
  rdoc_options: []
88
-
89
- require_paths:
50
+ require_paths:
90
51
  - lib
91
- required_ruby_version: !ruby/object:Gem::Requirement
52
+ required_ruby_version: !ruby/object:Gem::Requirement
92
53
  none: false
93
- requirements:
94
- - - ">="
95
- - !ruby/object:Gem::Version
96
- hash: 3
97
- segments:
98
- - 0
99
- version: "0"
100
- required_rubygems_version: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - ! '>='
56
+ - !ruby/object:Gem::Version
57
+ version: '0'
58
+ required_rubygems_version: !ruby/object:Gem::Requirement
101
59
  none: false
102
- requirements:
103
- - - ">="
104
- - !ruby/object:Gem::Version
105
- hash: 3
106
- segments:
107
- - 0
108
- version: "0"
60
+ requirements:
61
+ - - ! '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
109
64
  requirements: []
110
-
111
65
  rubyforge_project:
112
- rubygems_version: 1.8.10
66
+ rubygems_version: 1.8.24
113
67
  signing_key:
114
68
  specification_version: 3
115
- summary: when a url of public linkedin profile page is given it scrapes the entire page and converts into a accessible object
69
+ summary: when a url of public linkedin profile page is given it scrapes the entire
70
+ page and converts into a accessible object
116
71
  test_files: []
117
-
@@ -1,125 +0,0 @@
1
- # To change this template, choose Tools | Templates
2
- # and open the template in the editor.
3
-
4
-
5
- module Linkedin
6
- class Client
7
- USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"]
8
- attr_accessor :contacts ,:matched_tag,:probability
9
-
10
- def initialize(first_name,last_name ,company,options={})
11
- @first_name=first_name.downcase
12
- @last_name=last_name.downcase
13
- @company=company
14
- @country=options[:country] || "us"
15
- @search_linkedin_url="http://#{@country}.linkedin.com/pub/dir/#{@first_name}/#{@last_name}"
16
- @contacts=[]
17
- @links=[]
18
- get_agent
19
- end
20
-
21
- def get_agent
22
- @agent=Mechanize.new
23
- @agent.user_agent_alias = USER_AGENTS.sample
24
- @agent.max_history = 0
25
- @agent
26
- end
27
-
28
- def get_contacts
29
- begin
30
- sleep(2+rand(4))
31
- puts "===>Father:Scrapping linkedin url "+ @search_linkedin_url
32
- @page=@agent.get @search_linkedin_url
33
- @page.search(".vcard").each do |node|
34
- @contacts<<Linkedin::Contact.new(node)
35
- end
36
- rescue Mechanize::ResponseCodeError=>e
37
- puts "RESCUE"
38
- end
39
- return @contacts
40
- end
41
-
42
-
43
- #TODO need to refactor this function need seperate function of each case
44
- def get_verified_contact
45
- get_contacts
46
- @contacts.each do |contact|
47
- #check current company
48
- contact.current_companies.each do |company|
49
- if company[:current_company]
50
- if company[:current_company].match(/#{@company}/i)
51
- @matched_tag="CURRENT"
52
- return contact
53
- end
54
- end
55
- end if contact.current_companies
56
-
57
- #title of profile
58
- if contact.title.match(/#{@company}/i)
59
- @matched_tag="CURRENT"
60
- return contact
61
- end
62
-
63
- #check past companies
64
- contact.past_companies.each do |company|
65
- if company[:past_company]
66
- if company[:past_company].match(/#{@company}/i)
67
- @matched_tag="PAST"
68
- return contact
69
- end
70
- end
71
- end if contact.past_companies
72
- #
73
- #Going in to profile homepage and then checking
74
- #
75
- sleep(2+rand(4))
76
- puts "===>Child:Scrapping linkedin url: "+ contact.linkedin_url
77
- profile=contact.get_profile(get_agent.get(contact.linkedin_url),contact.linkedin_url)
78
- #check current company
79
- profile.current_companies.each do |company|
80
- if company[:current_company]
81
- if company[:current_company].match(/#{@company}/i)
82
- @matched_tag="CURRENT"
83
- return profile
84
- end
85
- end
86
- end if profile.current_companies
87
-
88
- #title of profile
89
- if profile.title
90
- if profile.title.match(/#{@company}/i)
91
- @matched_tag="CURRENT"
92
- return profile
93
- end
94
- end
95
- #check past companies
96
- profile.past_companies.each do |company|
97
- if company[:past_company]
98
- if company[:past_company].match(/#{@company}/i)
99
- @matched_tag="PAST"
100
- return profile
101
- end
102
- end
103
- end if profile.past_companies
104
- #check recommended visitors
105
- if profile.recommended_visitors
106
- cnt=0
107
- profile.recommended_visitors.each do |visitor|
108
- if visitor[:company]
109
- if visitor[:company].match(/#{@company}/i)
110
- cnt+=1
111
- end
112
- end
113
- end
114
- @probability=cnt/profile.recommended_visitors.length.to_f
115
- @matched_tag="RECOMMENDED"
116
- return profile if @probability>=0.5
117
- end
118
-
119
- end unless @contacts.empty?
120
- return nil
121
- end
122
-
123
-
124
- end
125
- end
@@ -1,134 +0,0 @@
1
- # To change this template, choose Tools | Templates
2
- # and open the template in the editor.
3
- module Linkedin
4
-
5
- class Contact
6
- #the First name of the contact
7
- attr_accessor :first_name
8
- #the last name of the contact
9
- attr_accessor :last_name
10
- #the linkedin job title
11
- attr_accessor :title
12
- #the location of the contact
13
- attr_accessor :location
14
- #the country of the contact
15
- attr_accessor :country
16
- #the domain for which the contact belongs
17
- attr_accessor :industry
18
- #the entire profile of the contact
19
- attr_accessor :profile
20
-
21
- #Array of hash containing its past job companies and job profile
22
- #Example
23
- # [
24
- # [0] {
25
- # :past_title => "Intern",
26
- # :past_company => "Sungard"
27
- # },
28
- # [1] {
29
- # :past_title => "Software Developer",
30
- # :past_company => "Microsoft"
31
- # }
32
- # ]
33
-
34
- attr_accessor :past_companies
35
- #Array of hash containing its current job companies and job profile
36
- #Example
37
- # [
38
- # [0] {
39
- # :current_title => "Intern",
40
- # :current_company => "Sungard"
41
- # },
42
- # [1] {
43
- # :current_title => "Software Developer",
44
- # :current_company => "Microsoft"
45
- # }
46
- # ]
47
- attr_accessor :current_companies
48
-
49
- attr_accessor :linkedin_url
50
-
51
- attr_accessor :profile
52
-
53
- def initialize(node=[])
54
- unless node.class==Array
55
- @first_name=get_first_name(node)
56
- @last_name=get_last_name(node)
57
- @title=get_title(node)
58
- @location=get_location(node)
59
- @country=get_country(node)
60
- @industry=get_industry(node)
61
- @current_companies=get_current_companies node
62
- @past_companies=get_past_companies node
63
- @linkedin_url=get_linkedin_url node
64
- end
65
- end
66
- #page is a Nokogiri::XML node of the profile page
67
- #returns object of Linkedin::Profile
68
- def get_profile page,url
69
- @profile=Linkedin::Profile.new(page,url)
70
- end
71
-
72
- private
73
-
74
- def get_first_name node
75
- return node.at(".given-name").text.strip if node.search(".given-name").first
76
- end
77
-
78
- def get_last_name node
79
- return node.at(".family-name").text.strip if node.search(".family-name").first
80
- end
81
-
82
- def get_title node
83
- return node.at(".title").text.gsub(/\s+/, " ").strip if node.search(".title").first
84
- end
85
-
86
- def get_location node
87
- return node.at(".location").text.split(",").first.strip if node.search(".location").first
88
-
89
- end
90
-
91
- def get_country node
92
- return node.at(".location").text.split(",").last.strip if node.search(".location").first
93
-
94
- end
95
-
96
- def get_industry node
97
- return node.at(".industry").text.strip if node.search(".industry").first
98
- end
99
-
100
- def get_linkedin_url node
101
- node.at("h2/strong/a").attributes["href"]
102
- end
103
-
104
- def get_current_companies node
105
- current_cs=[]
106
- if node.search(".current-content").first
107
- node.at(".current-content").text.split(",").each do |content|
108
- title,company=content.split(" at ")
109
- company=company.gsub(/\s+/, " ").strip if company
110
- title=title.gsub(/\s+/, " ").strip if title
111
- current_company={:current_company=>company,:current_title=> title}
112
- current_cs<<current_company
113
- end
114
- return current_cs
115
- end
116
- end
117
-
118
- def get_past_companies node
119
- past_cs=[]
120
- if node.search(".past-content").first
121
- node.at(".past-content").text.split(",").each do |content|
122
- title,company=content.split(" at ")
123
- company=company.gsub(/\s+/, " ").strip if company
124
- title=title.gsub(/\s+/, " ").strip if title
125
- past_company={:past_company=>company,:past_title=> title }
126
- past_cs<<past_company
127
- end
128
- return past_cs
129
- end
130
- end
131
-
132
- end
133
-
134
- end