RubyGems - linkedin-scraper-v2 - Versions diffs - 0.1.8 - Mend

linkedin-scraper-v2 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

checksums.yaml +7 -0
data/.gitignore +23 -0
data/.rubocop.yml +11 -0
data/.travis.yml +8 -0
data/Gemfile +4 -0
data/LICENSE +22 -0
data/README.md +272 -0
data/Rakefile +3 -0
data/bin/linkedin-scraper +5 -0
data/lib/linkedin_scraper.rb +5 -0
data/lib/linkedin_scraper/profile.rb +252 -0
data/lib/linkedin_scraper/version.rb +5 -0
data/linkedin-scraper.gemspec +22 -0
data/spec/fixtures/jeffweiner08.html +308 -0
data/spec/linkedin_scraper/profile_spec.rb +110 -0
data/spec/spec_helper.rb +17 -0
metadata +107 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: ea72cf17a2f1766bdb4eb7b320d3251392d21824
+  data.tar.gz: ebccbf3dcd9f3511b452e103986916067a502e6d
+SHA512:
+  metadata.gz: cd1a28b3081b4d1ab7f86423cc169af184822cdb0ae96b5077a59314b6012a8e3ed837cc03e032c0878df7bc32c672bcfb1c32e8f3cbf56ea989b075c5468d71
+  data.tar.gz: 7d5c7acbde250b8976ddb64d0b0b496131baf3784993e5f6df17756762a9cdd37461771fcc12146e6174b64444ec64c460c2a7e9dd78b0edcd58ddbcd9327d15

data/.gitignore ADDED Viewed

@@ -0,0 +1,23 @@
+*.gem
+*.rbc
+.bundle
+.config
+.rspec
+.yardoc
+Gemfile.lock
+InstalledFiles
+_yardoc
+coverage
+doc/
+lib/bundler/man
+pkg
+rdoc
+spec/reports
+test/tmp
+test/version_tmp
+tmp
+.ruby-version
+.ruby-gemset
+.projectile
+*.DS_Store
+.idea/*

data/.rubocop.yml ADDED Viewed

@@ -0,0 +1,11 @@
+Documentation:
+  Enabled: false
+DotPosition:
+  Enabled: false
+LineLength:
+  Enabled: false
+MethodLength:
+  Enabled: false

data/.travis.yml ADDED Viewed

@@ -0,0 +1,8 @@
+language: ruby
+rvm:
+  - 2.2.0
+  - 2.0.0
+  - 1.9.3
+  - jruby-19mode
+  - 2.1.1
+  - 2.2.3

data/Gemfile ADDED Viewed

@@ -0,0 +1,4 @@
+source 'https://rubygems.org'
+# Specify your gem's dependencies in linkedin-scraper.gemspec
+gemspec

data/LICENSE ADDED Viewed

@@ -0,0 +1,22 @@
+Copyright (c) 2012 Yatish Mehta
+MIT License
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,272 @@
+[![Build Status](https://secure.travis-ci.org/yatish27/linkedin-scraper.png)](http://travis-ci.org/yatish27/linkedin-scraper)
+[![Gem Version](https://badge.fury.io/rb/linkedin-scraper.png)](http://badge.fury.io/rb/linkedin-scraper)
+Linkedin Scraper
+================
+Linkedin-scraper is a gem for scraping linkedin public profiles.
+Given the URL of the profile, it gets the name, country, title, area, current companies, past companies,
+organizations, skills, groups, etc
+## Installation
+Install the gem from RubyGems:
+    gem install linkedin-scraper
+This gem is tested on 1.9.2, 1.9.3, 2.0.0, JRuby1.9, rbx1.9,
+## Usage
+Initialize a scraper instance
+    profile = Linkedin::Profile.get_profile("http://www.linkedin.com/in/jeffweiner08")
+The returning object responds to the following methods
+    profile.first_name          # The first name of the contact
+    profile.last_name           # The last name of the contact
+    profile.name                # The full name of the profile
+    profile.title               # The job title
+	profile.summary             # The summary of the profile
+    profile.location            # The location of the contact
+    profile.country             # The country of the contact
+    profile.industry            # The domain for which the contact belongs
+    profile.picture             # The profile picture link of profile
+    profile.skills              # Array of skills of the profile
+    profile.organizations       # Array organizations of the profile
+    profile.education           # Array of hashes for education
+    profile.websites            # Array of websites
+	profile.groups              # Array of groups
+	profile.languages           # Array of languages
+	profile.certifications      # Array of certifications
+	profile.number_of_connections # The number of connections as a string
+For current and past companies it also provides the details of the companies like company size, industry, address, etc
+    profile.current_companies
+    [
+    [0] {
+             :current_company => "LinkedIn",
+               :current_title => "CEO",
+         :current_company_url => "http://www.linkedin.com",
+                 :description => nil,
+        :linkedin_company_url => "http://www.linkedin.com/company/linkedin?trk=ppro_cprof",
+                         :url => "http://www.linkedin.com",
+                        :type => "Public Company",
+                :company_size => "1001-5000 employees",
+                     :website => "http://www.linkedin.com",
+                    :industry => "Internet",
+                     :founded => "2003",
+                     :address => "2029 Stierlin Court  Mountain View, CA 94043 United States"
+    },
+    [1] {
+             :current_company => "Intuit",
+               :current_title => "Member, Board of Directors",
+         :current_company_url => "http://network.intuit.com/",
+                 :description => nil,
+        :linkedin_company_url => "http://www.linkedin.com/company/intuit?trk=ppro_cprof",
+                         :url => "http://network.intuit.com/",
+                        :type => "Public Company",
+                :company_size => "5001-10,000 employees",
+                     :website => "http://network.intuit.com/",
+                    :industry => "Computer Software",
+                     :founded => "1983",
+                     :address => "2632 Marine Way  Mountain View, CA 94043 United States"
+    },
+    [2] {
+             :current_company => "DonorsChoose",
+               :current_title => "Member, Board of Directors",
+         :current_company_url => "http://www.donorschoose.org",
+                 :description => nil,
+        :linkedin_company_url => "http://www.linkedin.com/company/donorschoose.org?trk=ppro_cprof",
+                         :url => "http://www.donorschoose.org",
+                        :type => "Nonprofit",
+                :company_size => "51-200 employees",
+                     :website => "http://www.donorschoose.org",
+                    :industry => "Nonprofit Organization Management",
+                     :founded => "2000",
+                     :address => "213 West 35th Street 2nd Floor East New York, NY 10001 United States"
+    },
+    [3] {
+            :current_company => "Malaria No More",
+              :current_title => "Member, Board of Directors",
+        :current_company_url => nil,
+                :description => nil
+    },
+    [4] {
+             :current_company => "Venture For America",
+               :current_title => "Member, Advisory Board",
+         :current_company_url => "http://ventureforamerica.org/",
+                 :description => nil,
+        :linkedin_company_url => "http://www.linkedin.com/company/venture-for-america?trk=ppro_cprof",
+                         :url => "http://ventureforamerica.org/",
+                        :type => "Nonprofit",
+                :company_size => "1-10 employees",
+                     :website => "http://ventureforamerica.org/",
+                    :industry => "Nonprofit Organization Management",
+                     :founded => "2011"
+    }
+    ]
+    profile.past_companies
+    [
+    [0] {
+                :past_company => "Accel Partners",
+                  :past_title => "Executive in Residence",
+        :past_company_website => "http://www.facebook.com/accel",
+                 :description => nil,
+        :linkedin_company_url => "http://www.linkedin.com/company/accel-partners?trk=ppro_cprof",
+                         :url => "http://www.facebook.com/accel",
+                        :type => "Partnership",
+                :company_size => "51-200 employees",
+                     :website => "http://www.facebook.com/accel",
+                    :industry => "Venture Capital & Private Equity",
+                     :address => "428 University Palo Alto, CA 94301 United States"
+    },
+    [1] {
+                :past_company => "Greylock",
+                  :past_title => "Executive in Residence",
+        :past_company_website => "http://www.greylock.com",
+                 :description => nil,
+        :linkedin_company_url => "http://www.linkedin.com/company/greylock-partners?trk=ppro_cprof",
+                         :url => "http://www.greylock.com",
+                        :type => "Partnership",
+                :company_size => "51-200 employees",
+                     :website => "http://www.greylock.com",
+                    :industry => "Venture Capital & Private Equity",
+                     :address => "2550 Sand Hill Road  Menlo Park, CA 94025 United States"
+    },
+    [2] {
+                :past_company => "Yahoo!",
+                  :past_title => "Executive Vice President Network Division",
+        :past_company_website => "http://www.yahoo.com",
+                 :description => nil,
+        :linkedin_company_url => "http://www.linkedin.com/company/yahoo?trk=ppro_cprof",
+                         :url => "http://www.yahoo.com",
+                        :type => "Public Company",
+                :company_size => "10,001+ employees",
+                     :website => "http://www.yahoo.com",
+                    :industry => "Internet",
+                     :founded => "1994",
+                     :address => "701 First Avenue  Sunnyvale, CA 94089 United States"
+    },
+    [3] {
+                :past_company => "Windsor Media",
+                  :past_title => "Founding Partner",
+        :past_company_website => nil,
+                 :description => nil
+    },
+    [4] {
+                :past_company => "Warner Bros.",
+                  :past_title => "Vice President Online",
+        :past_company_website => "http://www.warnerbros.com/",
+                 :description => nil,
+        :linkedin_company_url => "http://www.linkedin.com/company/warner-bros.-entertainment-group-of-companies?trk=ppro_cprof",
+                         :url => "http://www.warnerbros.com/",
+                        :type => "Public Company",
+                :company_size => "10,001+ employees",
+                     :website => "http://www.warnerbros.com/",
+                    :industry => "Entertainment",
+                     :address => "4000 Warner Boulevard  Burbank, CA 91522 United States"
+    }
+    ]
+    profile.recommended_visitors
+    #It is the list of visitors "Viewers of this profile also viewed..."
+    [
+    [0] {
+           :link => "http://www.linkedin.com/in/barackobama?trk=pub-pbmap",
+           :name => "Barack Obama",
+          :title => "President of the United States of ",
+        :company => nil
+    },
+    [1] {
+           :link => "http://www.linkedin.com/in/marissamayer?trk=pub-pbmap",
+           :name => "Marissa Mayer",
+          :title => "Yahoo!, President & CEO",
+        :company => nil
+    },
+    [2] {
+           :link => "http://www.linkedin.com/pub/sean-parker/0/1/826?trk=pub-pbmap",
+           :name => "Sean Parker",
+          :title => nil,
+        :company => nil
+    },
+    [3] {
+           :link => "http://www.linkedin.com/pub/eduardo-saverin/0/70a/31b?trk=pub-pbmap",
+           :name => "Eduardo Saverin",
+          :title => nil,
+        :company => nil
+    },
+    [4] {
+           :link => "http://www.linkedin.com/in/rbranson?trk=pub-pbmap",
+           :name => "Richard Branson",
+          :title => "Founder",
+        :company => "Virgin Group"
+    },
+    [5] {
+           :link => "http://www.linkedin.com/in/reidhoffman?trk=pub-pbmap",
+           :name => "Reid Hoffman",
+          :title => "Entrepreneur. Product Strategist.  ",
+        :company => nil
+    },
+    [6] {
+           :link => "http://www.linkedin.com/in/mdell?trk=pub-pbmap",
+           :name => "Michael Dell",
+          :title => "Chairman and CEO",
+        :company => "Dell"
+    },
+    [7] {
+           :link => "http://www.linkedin.com/in/mittromney?trk=pub-pbmap",
+           :name => "Mitt Romney",
+          :title => "Believe in America",
+        :company => nil
+    },
+    [8] {
+           :link => "http://www.linkedin.com/pub/sheryl-sandberg/2/665/512?trk=pub-pbmap",
+           :name => "Sheryl Sandberg",
+          :title => nil,
+        :company => nil
+    }
+    ]
+The gem also comes with a binary and can be used from the command line to get a json response of the scraped data.
+It takes the url as the first argument.
+    linkedin-scraper http://www.linkedin.com/in/jeffweiner08
+## Contributing
+Bug reports and pull requests are welcome on GitHub at https://github.com/yatish27/linkedin-scraper.
+This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the
+[Contributor Covenant](contributor-covenant.org) code of conduct.
+## License
+The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).

data/Rakefile ADDED Viewed

@@ -0,0 +1,3 @@
+require 'rspec/core/rake_task'
+task :default => :spec
+RSpec::Core::RakeTask.new

data/bin/linkedin-scraper ADDED Viewed

@@ -0,0 +1,5 @@
+#!/usr/bin/env ruby
+require_relative '../lib/linkedin_scraper'
+profile = Linkedin::Profile.new(ARGV[0])
+puts JSON.pretty_generate JSON.parse(profile.to_json)

data/lib/linkedin_scraper.rb ADDED Viewed

@@ -0,0 +1,5 @@
+require "rubygems"
+require "mechanize"
+require "cgi"
+require "net/http"
+Dir["#{File.expand_path(File.dirname(__FILE__))}/linkedin_scraper/*.rb"].each { |file| require file }

data/lib/linkedin_scraper/profile.rb ADDED Viewed

@@ -0,0 +1,252 @@
+# -*- encoding: utf-8 -*-
+module Linkedin
+  class Profile
+    # User-agent aliases; #http_client assigns a random one to each client it builds.
+    # Frozen so the shared list cannot be modified at runtime.
+    USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"].freeze
+    # Names of the readers that #to_json calls, in the order they are emitted.
+    ATTRIBUTES = %w(
+    name
+    first_name
+    last_name
+    title
+    location
+    number_of_connections
+    country
+    industry
+    summary
+    picture
+    projects
+    linkedin_url
+    education
+    groups
+    websites
+    languages
+    skills
+    certifications
+    organizations
+    past_companies
+    current_companies
+    recommended_visitors).freeze
+    attr_reader :page, :linkedin_url
+    def self.get_profile(url, options = {})
+      Linkedin::Profile.new(url, options)
+    rescue => e
+      puts e
+    end
+    def initialize(url, options = {})
+      @linkedin_url = url
+      @options = options
+      @page = http_client.get(url)
+    end
+    def name
+      "#{first_name} #{last_name}"
+    end
+    def first_name
+      @first_name ||= (@page.at(".full-name").text.split(" ", 2)[0].strip if @page.at(".full-name"))
+    end
+    def last_name
+      @last_name ||= (@page.at(".full-name").text.split(" ", 2)[1].strip if @page.at(".full-name"))
+    end
+    def title
+      @title ||= (@page.at(".title").text.gsub(/\s+/, " ").strip if @page.at(".title"))
+    end
+    def location
+      @location ||= (@page.at(".locality").text.split(",").first.strip if @page.at(".locality"))
+    end
+    def number_of_connections
+      @connections ||= (@page.at(".member-connections").text.match(/[0-9]+[\+]{0,1}/)[0])
+    end
+    def country
+      @country ||= (@page.at(".locality").text.split(",").last.strip if @page.at(".locality"))
+    end
+    def industry
+      @industry ||= (@page.at(".industry").text.gsub(/\s+/, " ").strip if @page.at(".industry"))
+    end
+    def summary
+      @summary ||= (@page.at(".summary .description").text.gsub(/\s+/, " ").strip if @page.at(".summary .description"))
+    end
+    def picture
+      @picture ||= (@page.at(".profile-picture img").attributes["src"].value.strip if @page.at(".profile-picture img"))
+    end
+    def skills
+      @skills ||= (@page.search(".skill-pill .endorse-item-name-text").map { |skill| skill.text.strip if skill.text } rescue nil)
+    end
+    # Positions found under ".background-experience .past-position", memoised.
+    #
+    # @return [Array<Hash>] one hash per position, as built by get_companies("past")
+    def past_companies
+      @past_companies ||= get_companies("past")
+    end
+    # Positions found under ".background-experience .current-position", memoised.
+    #
+    # @return [Array<Hash>] one hash per position, as built by get_companies("current")
+    def current_companies
+      @current_companies ||= get_companies("current")
+    end
+    def education
+      @education ||= @page.search(".background-education .education").map do |item|
+        name = item.at("h4").text.gsub(/\s+|\n/, " ").strip if item.at("h4")
+        desc = item.search("h5").last.text.gsub(/\s+|\n/, " ").strip if item.search("h5").last
+        degree = item.search("h5").last.at(".degree").text.gsub(/\s+|\n/, " ").strip.gsub(/,$/, "") if item.search("h5").last.at(".degree")
+        major = item.search("h5").last.at(".major").text.gsub(/\s+|\n/, " ").strip      if item.search("h5").last.at(".major")
+        period = item.at(".education-date").text.gsub(/\s+|\n/, " ").strip if item.at(".education-date")
+        start_date, end_date = item.at(".education-date").text.gsub(/\s+|\n/, " ").strip.split(" – ") rescue nil
+        {:name => name, :description => desc, :degree => degree, :major => major, :period => period, :start_date => start_date, :end_date => end_date }
+      end
+    end
+    def websites
+      @websites ||= @page.search("#overview-summary-websites").flat_map do |site|
+        url = "http://www.linkedin.com#{site.at("a")["href"]}"
+        CGI.parse(URI.parse(url).query)["url"]
+      end
+    end
+    def groups
+      @groups ||= @page.search(".groups-name").map do |item|
+        name = item.text.gsub(/\s+|\n/, " ").strip
+        link = "http://www.linkedin.com#{item.at("a")["href"]}"
+        { :name => name, :link => link }
+      end
+    end
+    def organizations
+      @organizations ||= @page.search("#background-organizations .section-item").map do |item|
+        name = item.at(".summary").text.gsub(/\s+|\n/, " ").strip rescue nil
+        start_date, end_date = item.at(".organizations-date").text.gsub(/\s+|\n/, " ").strip.split(" – ") rescue nil
+        start_date = Date.parse(start_date) rescue nil
+        end_date = Date.parse(end_date)   rescue nil
+        { :name => name, :start_date => start_date, :end_date => end_date }
+      end
+    end
+    def languages
+      @languages ||= @page.search(".background-languages #languages ol li").map do |item|
+        language = item.at("h4").text rescue nil
+        proficiency = item.at("div.languages-proficiency").text.gsub(/\s+|\n/, " ").strip rescue nil
+        { :language => language, :proficiency => proficiency }
+      end
+    end
+    def certifications
+      @certifications ||= @page.search("background-certifications").map do |item|
+        name       = item.at("h4").text.gsub(/\s+|\n/, " ").strip rescue nil
+        authority  = item.at("h5").text.gsub(/\s+|\n/, " ").strip rescue nil
+        license    = item.at(".specifics/.licence-number").text.gsub(/\s+|\n/, " ").strip rescue nil
+        start_date = item.at(".certification-date").text.gsub(/\s+|\n/, " ").strip rescue nil
+        { :name => name, :authority => authority, :license => license, :start_date => start_date }
+      end
+    end
+    def recommended_visitors
+      @recommended_visitors ||= @page.search(".insights-browse-map/ul/li").map do |visitor|
+        v = {}
+        v[:link] = visitor.at("a")["href"]
+        v[:name] = visitor.at("h4/a").text
+        v[:title] = visitor.at(".browse-map-title").text.gsub("...", " ").split(" at ").first
+        v[:company] = visitor.at(".browse-map-title").text.gsub("...", " ").split(" at ")[1]
+        v
+      end
+    end
+    def projects
+      @projects ||= @page.search(".background-projects/div").map do |project|
+        project = project.at("div")
+        p = {}
+        start_date, end_date = project.at(".projects-date").text.gsub(/\s+|\n/, " ").strip.split(" – ") rescue nil
+        p[:title] = project.at("hgroup/h4 span:first-of-type").text rescue nil
+        p[:link] =  project.at("hgroup/h4 a:first-of-type")['href'] rescue nil
+        p[:start_date] = parse_date(start_date) rescue nil
+        p[:end_date] = parse_date(end_date)  rescue nil
+        p[:description] = project.at(".description").text rescue nil
+        p[:associates] = project.at(".associated-list ul").children.map{ |c| c.at("a").text } rescue nil
+        p
+      end
+    end
+    def to_json
+      require "json"
+      ATTRIBUTES.reduce({}){ |hash,attr| hash[attr.to_sym] = self.send(attr.to_sym);hash }.to_json
+    end
+    private
+    def get_companies(type)
+      companies = []
+      if @page.search(".background-experience .#{type}-position").first
+        @page.search(".background-experience .#{type}-position").each do |node|
+          company = {}
+          company[:title] = node.at("h4").text.gsub(/\s+|\n/, " ").strip if node.at("h4")
+          company[:company] = node.at("h4").next.text.gsub(/\s+|\n/, " ").strip if node.at("h4").next
+          company[:description] = node.at(".description").text.gsub(/\s+|\n/, " ").strip if node.at(".description")
+          start_date, end_date = node.at(".experience-date-locale").text.strip.split(" – ") rescue nil
+          company[:duration] = node.at(".experience-date-locale").text[/.*\((.*)\)/, 1]
+          company[:start_date] = parse_date(start_date) rescue nil
+          company[:end_date] = parse_date(end_date) rescue nil
+          company_link = node.at("h4").next.at("a")["href"] if node.at("h4").next.at("a")
+          result = get_company_details(company_link)
+          companies << company.merge!(result)
+        end
+      end
+      companies
+    end
+    def parse_date(date)
+      date = "#{date}-01-01" if date =~ /^(19|20)\d{2}$/
+      Date.parse(date)
+    end
+    def get_company_details(link)
+      result = { :linkedin_company_url => get_linkedin_company_url(link) }
+      page = http_client.get(result[:linkedin_company_url])
+      result[:url] = page.at(".basic-info-about/ul/li/p/a").text if page.at(".basic-info-about/ul/li/p/a")
+      node_2 = page.at(".basic-info-about/ul")
+      if node_2
+        node_2.search("p").zip(node_2.search("h4")).each do |value, title|
+          result[title.text.gsub(" ", "_").downcase.to_sym] = value.text.strip
+        end
+      end
+      result[:address] = page.at(".vcard.hq").at(".adr").text.gsub("\n", " ").strip if page.at(".vcard.hq")
+      result
+    end
+    def http_client
+      Mechanize.new do |agent|
+        agent.user_agent_alias = USER_AGENTS.sample
+        unless @options.empty?
+          agent.set_proxy(@options[:proxy_ip], @options[:proxy_port])
+        end
+        agent.max_history = 0
+      end
+    end
+    def get_linkedin_company_url(link)
+      http = %r{http://www.linkedin.com/}
+      https = %r{https://www.linkedin.com/}
+      if http.match(link) || https.match(link)
+        link
+      else
+        "http://www.linkedin.com/#{link}"
+      end
+    end
+  end
+end