RubyGems - linkedin_scraper - Versions diffs - 0.1.0 - Mend

linkedin_scraper 0.1.0

Files changed (6) hide show

data/README.rdoc ADDED

@@ -0,0 +1,46 @@
+= Linkedin-Scraper
+Linkedin Scraper is a gem for finding public LinkedIn profiles. You give it the name and company of the person you are looking for; it finds the matching LinkedIn profile and scrapes its name, title, location, connections, etc.
+= Installation
+Install the gem from RubyGems:
+  gem install linkedin_scraper
+This gem is tested on Ruby versions 1.8.7, 1.9.2 and 1.9.3.
+= Usage
+First, initialize an instance of the Linkedin class like this:
+  require 'linkedin_scraper'
+  linkedin = LinkedinScraper::Linkedin.new
+This sets up the Mechanize agent used for scraping. Now you can pass in a name and a company:
+  profile = linkedin.get_profile_data(name: some_name, company: some_company)
+A third, optional key is country: some_country (see the list of supported countries below).
+# List of supported countries
+1) United States - country: "us"
+2) United Kingdom - country: "uk"
+3) Canada - country: "ca"
+4) India - country: "in"
+Then you can see the scraped data like this:
+  profile.full_name           # the full name of the profile
+  profile.current_title       # the current LinkedIn job title
+  profile.location            # the location of the profile
+  profile.connection          # number of connections of the profile
+  profile.linkedin_url        #url of the profile
+Copyright (c) 2012 Bhushan Lodha, released under the MIT license.

data/lib/linkedin/duck.rb ADDED

@@ -0,0 +1,42 @@
+module LinkedinScraper
+   USER_AGENTS = ["Linux Firefox", "Linux Konqueror", "Linux Mozilla", "Mac FireFox", "Mac Mozilla", "Mac Safari", "Windows Mozilla"]
+  class DuckDuckGo
+    def initialize options
+      raise "TypeError", "Invalid Arguments" unless options.is_a? Hash
+      @query = options[:query]
+      @agent = Mechanize.new { |agent| agent.user_agent_alias = USER_AGENTS.sample }
+    end
+    def search
+      output_arr = []
+      query_to_url
+      previous_content = ""
+      page = @agent.get("http:\/\/duckduckgo.com\/d.js?q=#{@query}&l=us-en&p=1&s=0")
+      content = page.content.match(/\[.*\]/).to_s
+      raise "No Profile, Profile not found or does not exist" if (content.nil? || content.empty?)
+      output_arr = JSON.parse(content).map {|f| f['c'] }
+      output_arr = output_arr.flatten.uniq
+      output_arr.each do |f|
+        next unless f
+        output_arr.delete(f) if f.include? "https://encrypted.google.com"
+        output_arr.delete(f) if f == nil
+      end
+      output_arr
+    end
+    private
+    def query_to_url
+      char_hash = {' ' => '%20','$' => '%24','&' => '%26','`' => '60%',':' => '%3A', '<' => '%3C', '>' => '%3E','[' => '%5B', ']' => '%5D', '{' => '%7B', '}' => '%7D', '"' => '%22','+' => '%2B', '@' => '%40', '/' => '%2F', ';' => '%3B', '=' => '%3D','?' => '%3F', '\\' => '%5C', '^' => '%5E', '|' => '%7C', '~' => '%7E', '\'' => '%27',',' => '%2C'}
+      @query = @query.gsub(/%/,'%25')
+      char_hash.each {|k,v|  @query = @query.gsub(k,v) }
+    end
+  end
+end

data/lib/linkedin/linkedin.rb ADDED

@@ -0,0 +1,249 @@
+module LinkedinScraper
+  class Linkedin
+    # Interface for scraping linkedin public profile
+    # Full name of profile e.g "John Smith"
+    attr_accessor :full_name
+    # Linkedin url of profile e.g "http://www.linkedin.com/pub/in/john+smith"
+    attr_accessor :linkedin_url
+    # Current title of profile e.g "Ceo"
+    attr_accessor :current_title
+    # Past title of profile e.g "VP Business"
+    attr_accessor :past_title
+    # Current company of profile e.g "Pajama Labs"
+    attr_accessor :current_company
+    # Current job description as available on linkedin
+    attr_accessor :current_job_description
+    # Profile's summary as available on linkedin
+    attr_accessor :summary
+    # Total number of profile's connection e.g "44"
+    attr_accessor :connection
+    # Current location of profile "San Fransisco"
+    attr_accessor :location
+    # Total number of profile's recommendation e.g "44"
+    attr_accessor :recommendation
+    # All companies including current and past
+    attr_accessor :industry
+    # Experience description if mentioned
+    attr_accessor :experience
+    # Groups a profile is associated with
+    attr_accessor :groups
+    # Past companies of profile e.g "Google, Inc"
+    attr_accessor :past_company
+    # Schools profile has attended
+    attr_accessor :education
+    # Purposes for which profile can be contacted to (if provided)
+    attr_accessor :contact_for
+    # Websites associated with Profile
+    attr_accessor :websites
+    # Initializes mechnaize
+    def initialize
+      @agent = Mechanize.new { |agent| agent.user_agent_alias = USER_AGENTS.sample }
+    end
+    # Get data of the required profile
+    #
+    # _options_ contains the key :name *Required
+    # :name - name of profile/contact e.g "John Smith"
+    # _options_ contains the key :company *Required
+    # :company - current or past company of profile/contact e.g "Google"
+    # _options_ contains the key :country *Optional (see list of supported countries in Readme)
+    # :country - preferred country of profile to be found in
+    #
+    # Raises error if profile is not found or does not exist in linkedin or is not public profile
+    # Raises error if :name or :company or both are not defined
+    # Returns _self_
+    def get_profile_data options
+      @options = options
+      raise "TypeError", "Invalid Arguments" unless options.is_a?(Hash)
+      argument_error("name") unless options.has_key?(:name)
+      argument_error("company") unless options.has_key?(:company)
+      query = build_query
+      duck = LinkedinScraper::DuckDuckGo.new(query: query)
+      results = duck.search
+      raise "No Profile, Profile not found or does not exist" if results.empty?
+      page = get_li_page(results[0])
+      raise "No Profile, Profile not found or does not exist" unless page
+      if verify_profile(@options[:name], @options[:company], page)
+        self.data page
+        return self
+      else
+        raise "No Profile, Profile not found or does not exist"
+      end
+    end
+    def data page
+      self.full_name = li_full_name(page)
+      self.linkedin_url = page.uri.to_s
+      self.current_title = li_current_title(page)
+      self.past_title = li_past_title(page)
+      self.current_company = li_current_companies(page)
+      self.current_job_description = li_current_job_description(page)
+      self.summary = li_summary(page)
+      self.connection = li_connection(page)
+      self.location = li_location(page)
+      self.recommendation = li_recommendation(page)
+      self.industry =  li_current_companies(page) + li_past_companies(page)
+      self.experience = li_experience(page)
+      self.groups = li_groups(page)
+      self.past_company = li_past_companies(page)
+      self.education = li_education(page)
+      self.contact_for = li_contact_for(page)
+      self.websites = li_websites(page)
+    end
+    private
+    def build_query
+      name = @options[:name] if @options.has_key? :name
+      #title = @options[:title] if @options.has_key? :title
+      company = @options[:company] if @options.has_key? :company
+      subdomain = @options[:country] || "www"
+      if name && company
+        return "site:#{subdomain}.linkedin.com \"#{name}\" + \"at #{company}\""
+      end
+    end
+    def get_li_page(url)
+      page = nil
+      begin
+        page = @agent.get(url)
+        if page
+          if page.parser.xpath('//*[(@id = "member-1")]//*[contains(concat( " ", @class, " " ), concat( " ", "title", " " ))]').text.empty?
+            unless page.parser.xpath('//*[(@id = "result-set")]//*[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//strong//a').text.empty?
+              return @agent.click(page.link_with(:text => page.parser.xpath('//*[(@id = "result-set")]//*[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//strong//a').text))
+            else
+              return nil
+            end
+          end
+        end
+      rescue Exception => e
+      end
+      return page
+    end
+    def verify_profile(name, company, page)
+      full_name = li_full_name(page)
+      companies = li_current_companies(page)+' '+li_past_companies(page)
+      return true if full_name=~/#{name}/i && companies=~/#{company}/i
+    end
+    def li_past_title(page)
+      stack = []
+      page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "past", " " ))]//li').each do |past|
+        stack << past.text.gsub(/\s+/, " ").downcase.split(' at ')[0]
+      end
+      stack.join(",")
+    end
+    def li_full_profile(page)
+      page.parser.xpath('//*[(@id = "content")]').text.gsub(/\s+/, " ").downcase
+    end
+    def li_current_job_description(page)
+      page.parser.xpath('//*[(@id = "profile-experience")]//*[contains(concat( " ", @class, " " ),
+      concat( " ", "first", " " ))]').text.gsub(/\s+/, " ").downcase
+    end
+    def li_full_name(page)
+      page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "full-name", " " ))]').text.downcase
+    end
+    def li_current_companies(page)
+      stack = []
+      page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "current", " " ))]//li').each do |com|
+        stack << com.text.gsub(/\s+/, " ").downcase.split(' at ')[-1]
+      end
+      stack.join(",")
+    end
+    def li_current_title(page)
+      stack = []
+      page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "current", " " ))]//li').each do |current|
+        stack << current.text.gsub(/\s+/, " ").downcase.split(' at ')[0]
+      end
+      stack.join(",")
+    end
+    def li_past_companies(page)
+      stack = []
+      page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "past", " " ))]//li').each do |com|
+        stack << com.text.gsub(/\s+/, " ").downcase.split(' at ')[-1]
+      end
+      stack.join(",")
+    end
+    def li_colleges(page)
+      stack = []
+      page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "summary-education", " " ))]//li').each do |com|
+        stack << com.text.gsub(/\s+/, " ").downcase
+      end
+      stack.join(",")
+    end
+    def li_recommendation(page)
+      page.parser.xpath('//dd[(((count(preceding-sibling::*) + 1) = 8) and parent::*)]//strong').text.strip
+    end
+    def li_websites(page)
+      websites = []
+      page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "websites", " " ))]//a').each do |web|
+        websites << web['href']
+      end
+      websites.join(",")
+    end
+    def li_summary(page)
+      page.parser.xpath('//*[(@id = "profile-summary")]').text.gsub(/\s+/, " ").downcase
+    end
+    def li_experience(page)
+      page.parser.xpath('//*[(@id = "profile-experience")]').text.gsub(/\s+/, " ").downcase
+    end
+    def li_education(page)
+      page.parser.xpath('//*[(@id = "profile-education")]').text.gsub(/\s+/, " ").downcase
+    end
+    def li_additional_info(page)
+      page.parser.xpath('//*[(@id = "profile-additional")]').text.gsub(/\s+/, " ").downcase
+    end
+    def li_contact_for(page)
+      page.parser.xpath('//*[(@id = "profile-contact")]').text.gsub(/\s+/, " ").downcase
+    end
+    def li_connection(page)
+      page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "overview-connections", " " ))]//p').text.gsub(/\s+/, " ").downcase
+    end
+    def li_location(page)
+      page.parser.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "locality", " " ))]').text.gsub(/\s+/, " ").downcase
+    end
+    def li_groups(page)
+      page.parser.xpath('//*[(@id = "pubgroups")]//*[contains(concat( " ", @class, " " ), concat( " ", "org", " " ))]').text.gsub(/\s+/, " ").downcase
+    end
+    def li_websites(page)
+      websites=[]
+      if page.search(".website").first
+        page.search(".website").each do |site|
+          url=site.at("a")["href"]
+          url="http://www.linkedin.com"+url
+          url=CGI.parse(URI.parse(url).query)["url"]
+          websites<<url
+        end
+        return websites.flatten!
+      end
+    end
+    def argument_error(argument)
+      raise "Argument Error, missing argument :#{argument}"
+    end
+  end
+end

data/lib/linkedin/version.rb ADDED

@@ -0,0 +1,3 @@
+module LinkedinScraper
+  VERSION = '0.1.0'
+end

data/lib/linkedin_scraper.rb ADDED

@@ -0,0 +1,8 @@
+require 'rubygems'
+require 'mechanize'
+require 'json'
+require 'cgi'
+require "#{File.dirname(File.expand_path(__FILE__))}/linkedin/duck.rb"
+require "#{File.dirname(File.expand_path(__FILE__))}/linkedin/linkedin.rb"
+require "#{File.dirname(File.expand_path(__FILE__))}/linkedin/version.rb"

metadata ADDED

@@ -0,0 +1,82 @@
+--- !ruby/object:Gem::Specification
+name: linkedin_scraper
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+  prerelease:
+platform: ruby
+authors:
+- Bhushan Lodha
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2012-08-31 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: json
+  requirement: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+description: Find linkedin profile based on name and company of profile and scrapes
+  data if profile is found
+email: bhushanlodha@gmail.com
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- README.rdoc
+- lib/linkedin/duck.rb
+- lib/linkedin/linkedin.rb
+- lib/linkedin/version.rb
+- lib/linkedin_scraper.rb
+homepage: https://github.com/bhushanlodha/linkedin_scraper
+licenses: []
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  none: false
+  requirements:
+  - - ! '>='
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 1.8.24
+signing_key:
+specification_version: 3
+summary: Find linkedin profiles and scrapes data
+test_files: []