RubyGems - linkedin-scraper - Versions diffs - 0.1.5 → 0.1.7 - Mend

linkedin-scraper 0.1.5 → 0.1.7

Files changed (16)

checksums.yaml +4 -4
data/.gitignore +3 -1
data/README.md +17 -7
data/bin/linkedin-scraper +1 -1
data/lib/linkedin_scraper/profile.rb +243 -0
data/lib/{linkedin-scraper → linkedin_scraper}/version.rb +1 -1
data/lib/linkedin_scraper.rb +5 -0
data/linkedin-scraper.gemspec +7 -6
data/spec/fixtures/jeffweiner08.html +308 -0
data/spec/linkedin_scraper/.DS_Store +0 -0
data/spec/linkedin_scraper/profile_spec.rb +104 -0
metadata +27 -24
data/lib/linkedin-scraper/profile.rb +0 -225
data/lib/linkedin-scraper.rb +0 -5
data/spec/fixtures/jgrevich.html +0 -9300
data/spec/linkedin-scraper/profile_spec.rb +0 -154

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: fc9d61aa2d048763671b81e1c5389761897c9a95
-  data.tar.gz: a092ceaaf5ab5e5a622bf400539f0d77697f4cb3
+  metadata.gz: 2030446ef750ed1a95c9818d63d0cf97a0cbd60a
+  data.tar.gz: 1639e466dadbee02704a853fe13f0ae10bb42f94
 SHA512:
-  metadata.gz: 732f37109c5194922ed01d7a2a0db4d69696a8a87ad0ffb0b181eff550cac8ebcbdfb62b4a4c7de30066d4610b1ee488872ce9da992dee633bb0a30ca887ccd4
-  data.tar.gz: 7b8d05df2f6533fb4428e1852390f6fb16065828a27367f0a3fe0a476a7596eb59d31d14b819190e4da71eb3f88b0767e336c6d6ab2fb78221886e035e51d225
+  metadata.gz: dd080bec613c77eb50a439ccd3628932ba0b9ed0ddf7b5e03781036d89722f423e1c439d8b9a08e49e10cf50744926cbae05cc5218ce71a6b923ff793b118b93
+  data.tar.gz: b796724e23fb34f49c3f1012c97c9bd2a0d38372d652699095a1e0592f918a9778caaf9ecacce51217289a6500a49d2319bdf1a8e5ab54fe4d0ccc5e9afc64b3

data/.gitignore CHANGED

@@ -18,4 +18,6 @@ test/version_tmp
 tmp
 .ruby-version
 .ruby-gemset
-.projectile
+.projectile
+*.DS_Store
+.idea/*

data/README.md CHANGED

@@ -5,11 +5,11 @@ Linkedin Scraper
 ================
 Linkedin-scraper is a gem for scraping linkedin public profiles.
-Given the URL of the profile, it gets the name, country, title, area, current companies, past comapnies,organizations, skills, groups, etc
+Given the URL of the profile, it gets the name, country, title, area, current companies, past companies,
+organizations, skills, groups, etc
-##Installation
+## Installation
 Install the gem from RubyGems:
@@ -17,7 +17,7 @@ Install the gem from RubyGems:
 This gem is tested on 1.9.2, 1.9.3, 2.0.0, JRuby1.9, rbx1.9,
-##Usage
+## Usage
 Initialize a scraper instance
@@ -59,7 +59,7 @@ The returning object responds to the following methods
 	profile.certifications      # Array of certifications
-For current and past comapnies it also provides the details of the companies like comapny size, industry, address, etc
+For current and past companies it also provides the details of the companies like company size, industry, address, etc
     profile.current_companies
@@ -252,8 +252,18 @@ For current and past comapnies it also provides the details of the companies lik
     ]
-The gem also comes with a binary and can be used from the command line to get a json response of the scraped data. It takes the url as the first argument.
+The gem also comes with a binary and can be used from the command line to get a json response of the scraped data.
+It takes the url as the first argument.
     linkedin-scraper http://www.linkedin.com/in/jeffweiner08
-You're welcome to fork this project and send pull requests
+## Contributing
+Bug reports and pull requests are welcome on GitHub at https://github.com/yatish27/linkedin-scraper.
+This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the
+[Contributor Covenant](contributor-covenant.org) code of conduct.
+## License
+The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).

data/bin/linkedin-scraper CHANGED

@@ -1,5 +1,5 @@
 #!/usr/bin/env ruby
-require_relative '../lib/linkedin-scraper'
+require_relative '../lib/linkedin_scraper'
 profile = Linkedin::Profile.new(ARGV[0])
 puts JSON.pretty_generate JSON.parse(profile.to_json)

data/lib/linkedin_scraper/profile.rb ADDED

@@ -0,0 +1,243 @@
+# -*- encoding: utf-8 -*-
+module Linkedin
+  class Profile
+    USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"]
+    ATTRIBUTES = %w(
+    name
+    first_name
+    last_name
+    title
+    location
+    country
+    industry
+    summary
+    picture
+    projects
+    linkedin_url
+    education
+    groups
+    websites
+    languages
+    skills
+    certifications
+    organizations
+    past_companies
+    current_companies
+    recommended_visitors)
+    attr_reader :page, :linkedin_url
+    def self.get_profile(url)
+      Linkedin::Profile.new(url)
+    rescue => e
+      puts e
+    end
+    def initialize(url)
+      @linkedin_url = url
+      @page = http_client.get(url)
+    end
+    def name
+      "#{first_name} #{last_name}"
+    end
+    def first_name
+      @first_name ||= (@page.at(".full-name").text.split(" ", 2)[0].strip if @page.at(".full-name"))
+    end
+    def last_name
+      @last_name ||= (@page.at(".full-name").text.split(" ", 2)[1].strip if @page.at(".full-name"))
+    end
+    def title
+      @title ||= (@page.at(".title").text.gsub(/\s+/, " ").strip if @page.at(".title"))
+    end
+    def location
+      @location ||= (@page.at(".locality").text.split(",").first.strip if @page.at(".locality"))
+    end
+    def country
+      @country ||= (@page.at(".locality").text.split(",").last.strip if @page.at(".locality"))
+    end
+    def industry
+      @industry ||= (@page.at(".industry").text.gsub(/\s+/, " ").strip if @page.at(".industry"))
+    end
+    def summary
+      @summary ||= (@page.at(".summary .description").text.gsub(/\s+/, " ").strip if @page.at(".summary .description"))
+    end
+    def picture
+      @picture ||= (@page.at(".profile-picture img").attributes["src"].value.strip if @page.at(".profile-picture img"))
+    end
+    def skills
+      @skills ||= (@page.search(".skill-pill .endorse-item-name-text").map { |skill| skill.text.strip if skill.text } rescue nil)
+    end
+    def past_companies
+      @past_companies ||= get_companies("past")
+    end
+    def current_companies
+      @current_companies ||= get_companies("current")
+    end
+    def education
+      @education ||= @page.search(".background-education .education").map do |item|
+        name = item.at("h4").text.gsub(/\s+|\n/, " ").strip if item.at("h4")
+        desc = item.search("h5").last.text.gsub(/\s+|\n/, " ").strip if item.search("h5").last
+        degree = item.search("h5").last.at(".degree").text.gsub(/\s+|\n/, " ").strip.gsub(/,$/, "") if item.search("h5").last.at(".degree")
+        major = item.search("h5").last.at(".major").text.gsub(/\s+|\n/, " ").strip      if item.search("h5").last.at(".major")
+        period = item.at(".education-date").text.gsub(/\s+|\n/, " ").strip if item.at(".education-date")
+        start_date, end_date = item.at(".education-date").text.gsub(/\s+|\n/, " ").strip.split(" – ") rescue nil
+        {:name => name, :description => desc, :degree => degree, :major => major, :period => period, :start_date => start_date, :end_date => end_date }
+      end
+    end
+    def websites
+      @websites ||= @page.search("#overview-summary-websites").flat_map do |site|
+        url = "http://www.linkedin.com#{site.at("a")["href"]}"
+        CGI.parse(URI.parse(url).query)["url"]
+      end
+    end
+    def groups
+      @groups ||= @page.search(".groups-name").map do |item|
+        name = item.text.gsub(/\s+|\n/, " ").strip
+        link = "http://www.linkedin.com#{item.at("a")["href"]}"
+        { :name => name, :link => link }
+      end
+    end
+    def organizations
+      @organizations ||= @page.search("#background-organizations .section-item").map do |item|
+        name = item.at(".summary").text.gsub(/\s+|\n/, " ").strip rescue nil
+        start_date, end_date = item.at(".organizations-date").text.gsub(/\s+|\n/, " ").strip.split(" – ") rescue nil
+        start_date = Date.parse(start_date) rescue nil
+        end_date = Date.parse(end_date)   rescue nil
+        { :name => name, :start_date => start_date, :end_date => end_date }
+      end
+    end
+    def languages
+      @languages ||= @page.search(".background-languages #languages ol li").map do |item|
+        language = item.at("h4").text rescue nil
+        proficiency = item.at("div.languages-proficiency").text.gsub(/\s+|\n/, " ").strip rescue nil
+        { :language => language, :proficiency => proficiency }
+      end
+    end
+    def certifications
+      @certifications ||= @page.search("background-certifications").map do |item|
+        name       = item.at("h4").text.gsub(/\s+|\n/, " ").strip rescue nil
+        authority  = item.at("h5").text.gsub(/\s+|\n/, " ").strip rescue nil
+        license    = item.at(".specifics/.licence-number").text.gsub(/\s+|\n/, " ").strip rescue nil
+        start_date = item.at(".certification-date").text.gsub(/\s+|\n/, " ").strip rescue nil
+        { :name => name, :authority => authority, :license => license, :start_date => start_date }
+      end
+    end
+    def recommended_visitors
+      @recommended_visitors ||= @page.search(".insights-browse-map/ul/li").map do |visitor|
+        v = {}
+        v[:link] = visitor.at("a")["href"]
+        v[:name] = visitor.at("h4/a").text
+        v[:title] = visitor.at(".browse-map-title").text.gsub("...", " ").split(" at ").first
+        v[:company] = visitor.at(".browse-map-title").text.gsub("...", " ").split(" at ")[1]
+        v
+      end
+    end
+    def projects
+      @projects ||= @page.search(".background-projects/div").map do |project|
+        project = project.at("div")
+        p = {}
+        start_date, end_date = project.at(".projects-date").text.gsub(/\s+|\n/, " ").strip.split(" – ") rescue nil
+        p[:title] = project.at("hgroup/h4 span:first-of-type").text rescue nil
+        p[:link] =  project.at("hgroup/h4 a:first-of-type")['href'] rescue nil
+        p[:start_date] = parse_date(start_date) rescue nil
+        p[:end_date] = parse_date(end_date)  rescue nil
+        p[:description] = project.at(".description").text rescue nil
+        p[:associates] = project.at(".associated-list ul").children.map{ |c| c.at("a").text } rescue nil
+        p
+      end
+    end
+    def to_json
+      require "json"
+      ATTRIBUTES.reduce({}){ |hash,attr| hash[attr.to_sym] = self.send(attr.to_sym);hash }.to_json
+    end
+    private
+    def get_companies(type)
+      companies = []
+      if @page.search(".background-experience .#{type}-position").first
+        @page.search(".background-experience .#{type}-position").each do |node|
+          company = {}
+          company[:title] = node.at("h4").text.gsub(/\s+|\n/, " ").strip if node.at("h4")
+          company[:company] = node.at("h4").next.text.gsub(/\s+|\n/, " ").strip if node.at("h4").next
+          company[:description] = node.at(".description").text.gsub(/\s+|\n/, " ").strip if node.at(".description")
+          start_date, end_date = node.at(".experience-date-locale").text.strip.split(" – ") rescue nil
+          company[:duration] = node.at(".experience-date-locale").text[/.*\((.*)\)/, 1]
+          company[:start_date] = parse_date(start_date) rescue nil
+          company[:end_date] = parse_date(end_date) rescue nil
+          company_link = node.at("h4").next.at("a")["href"] if node.at("h4").next.at("a")
+          result = get_company_details(company_link)
+          companies << company.merge!(result)
+        end
+      end
+      companies
+    end
+    def parse_date(date)
+      date = "#{date}-01-01" if date =~ /^(19|20)\d{2}$/
+      Date.parse(date)
+    end
+    def get_company_details(link)
+      result = { :linkedin_company_url => get_linkedin_company_url(link) }
+      page = http_client.get(result[:linkedin_company_url])
+      result[:url] = page.at(".basic-info-about/ul/li/p/a").text if page.at(".basic-info-about/ul/li/p/a")
+      node_2 = page.at(".basic-info-about/ul")
+      if node_2
+        node_2.search("p").zip(node_2.search("h4")).each do |value, title|
+          result[title.text.gsub(" ", "_").downcase.to_sym] = value.text.strip
+        end
+      end
+      result[:address] = page.at(".vcard.hq").at(".adr").text.gsub("\n", " ").strip if page.at(".vcard.hq")
+      result
+    end
+    def http_client
+      Mechanize.new do |agent|
+        agent.user_agent_alias = USER_AGENTS.sample
+        agent.max_history = 0
+      end
+    end
+    def get_linkedin_company_url(link)
+      http = %r{http://www.linkedin.com/}
+      https = %r{https://www.linkedin.com/}
+      if http.match(link) || https.match(link)
+        link
+      else
+        "http://www.linkedin.com/#{link}"
+      end
+    end
+  end
+end

data/lib/{linkedin-scraper → linkedin_scraper}/version.rb RENAMED

@@ -1,5 +1,5 @@
 module Linkedin
   module Scraper
-    VERSION = '0.1.5'
+    VERSION = '0.1.7'
   end
 end

data/lib/linkedin_scraper.rb ADDED

@@ -0,0 +1,5 @@
+require "rubygems"
+require "mechanize"
+require "cgi"
+require "net/http"
+Dir["#{File.expand_path(File.dirname(__FILE__))}/linkedin_scraper/*.rb"].each { |file| require file }

data/linkedin-scraper.gemspec CHANGED

@@ -1,9 +1,9 @@
 # -*- encoding: utf-8 -*-
-require File.expand_path('../lib/linkedin-scraper/version', __FILE__)
+require File.expand_path('../lib/linkedin_scraper/version', __FILE__)
 Gem::Specification.new do |gem|
   gem.authors       = ['Yatish Mehta']
-  gem.description   = %q{Scrapes the linkedin profile when a url is given }
+  gem.description   = %q{Scrapes the LinkedIn profile using the public url }
   gem.summary       = %q{when a url of  public linkedin profile page is given it scrapes the entire page and converts into a accessible object}
   gem.homepage      = 'https://github.com/yatishmehta27/linkedin-scraper'
   gem.files         = `git ls-files`.split($\)
@@ -13,9 +13,10 @@ Gem::Specification.new do |gem|
   gem.require_paths = ['lib']
   gem.version       = Linkedin::Scraper::VERSION
-  gem.add_dependency(%q<mechanize>, ['>= 0'])
-  gem.add_development_dependency 'rspec', '>=0'
-  gem.add_development_dependency 'rake'
+  gem.license       = "MIT"
+  gem.add_dependency 'mechanize', '~> 2'
+  gem.add_development_dependency 'rspec', '~> 3'
+  gem.add_development_dependency 'rake', '~> 10'
 end