RubyGems - linkedin-scraper - Versions diffs - 0.1.5 → 0.1.7 - Mend

linkedin-scraper 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

checksums.yaml +4 -4
data/.gitignore +3 -1
data/README.md +17 -7
data/bin/linkedin-scraper +1 -1
data/lib/linkedin_scraper/profile.rb +243 -0
data/lib/{linkedin-scraper → linkedin_scraper}/version.rb +1 -1
data/lib/linkedin_scraper.rb +5 -0
data/linkedin-scraper.gemspec +7 -6
data/spec/fixtures/jeffweiner08.html +308 -0
data/spec/linkedin_scraper/.DS_Store +0 -0
data/spec/linkedin_scraper/profile_spec.rb +104 -0
metadata +27 -24
data/lib/linkedin-scraper/profile.rb +0 -225
data/lib/linkedin-scraper.rb +0 -5
data/spec/fixtures/jgrevich.html +0 -9300
data/spec/linkedin-scraper/profile_spec.rb +0 -154

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: fc9d61aa2d048763671b81e1c5389761897c9a95
-  data.tar.gz: a092ceaaf5ab5e5a622bf400539f0d77697f4cb3
+  metadata.gz: 2030446ef750ed1a95c9818d63d0cf97a0cbd60a
+  data.tar.gz: 1639e466dadbee02704a853fe13f0ae10bb42f94
 SHA512:
-  metadata.gz: 732f37109c5194922ed01d7a2a0db4d69696a8a87ad0ffb0b181eff550cac8ebcbdfb62b4a4c7de30066d4610b1ee488872ce9da992dee633bb0a30ca887ccd4
-  data.tar.gz: 7b8d05df2f6533fb4428e1852390f6fb16065828a27367f0a3fe0a476a7596eb59d31d14b819190e4da71eb3f88b0767e336c6d6ab2fb78221886e035e51d225
+  metadata.gz: dd080bec613c77eb50a439ccd3628932ba0b9ed0ddf7b5e03781036d89722f423e1c439d8b9a08e49e10cf50744926cbae05cc5218ce71a6b923ff793b118b93
+  data.tar.gz: b796724e23fb34f49c3f1012c97c9bd2a0d38372d652699095a1e0592f918a9778caaf9ecacce51217289a6500a49d2319bdf1a8e5ab54fe4d0ccc5e9afc64b3

data/.gitignore CHANGED Viewed

@@ -18,4 +18,6 @@ test/version_tmp
 tmp
 .ruby-version
 .ruby-gemset
-.projectile
+.projectile
+*.DS_Store
+.idea/*

data/README.md CHANGED Viewed

@@ -5,11 +5,11 @@ Linkedin Scraper
 ================
 Linkedin-scraper is a gem for scraping linkedin public profiles.
-Given the URL of the profile, it gets the name, country, title, area, current companies, past comapnies,organizations, skills, groups, etc
+Given the URL of the profile, it gets the name, country, title, area, current companies, past companies,
+organizations, skills, groups, etc
-##Installation
+## Installation
 Install the gem from RubyGems:
@@ -17,7 +17,7 @@ Install the gem from RubyGems:
 This gem is tested on 1.9.2, 1.9.3, 2.0.0, JRuby1.9, rbx1.9,
-##Usage
+## Usage
 Initialize a scraper instance
@@ -59,7 +59,7 @@ The returning object responds to the following methods
 	profile.certifications      # Array of certifications
-For current and past comapnies it also provides the details of the companies like comapny size, industry, address, etc
+For current and past companies it also provides the details of the companies like company size, industry, address, etc
     profile.current_companies
@@ -252,8 +252,18 @@ For current and past comapnies it also provides the details of the companies lik
     ]
-The gem also comes with a binary and can be used from the command line to get a json response of the scraped data. It takes the url as the first argument.
+The gem also comes with a binary and can be used from the command line to get a json response of the scraped data.
+It takes the url as the first argument.
     linkedin-scraper http://www.linkedin.com/in/jeffweiner08
-You're welcome to fork this project and send pull requests
+## Contributing
+Bug reports and pull requests are welcome on GitHub at https://github.com/yatish27/linkedin-scraper.
+This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the
+[Contributor Covenant](contributor-covenant.org) code of conduct.
+## License
+The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).

data/bin/linkedin-scraper CHANGED Viewed

@@ -1,5 +1,5 @@
 #!/usr/bin/env ruby
-require_relative '../lib/linkedin-scraper'
+require_relative '../lib/linkedin_scraper'
 profile = Linkedin::Profile.new(ARGV[0])
 puts JSON.pretty_generate JSON.parse(profile.to_json)

data/lib/linkedin_scraper/profile.rb ADDED Viewed

@@ -0,0 +1,243 @@
+# -*- encoding: utf-8 -*-
+module Linkedin
+  class Profile
+    USER_AGENTS = ["Windows IE 6", "Windows IE 7", "Windows Mozilla", "Mac Safari", "Mac FireFox", "Mac Mozilla", "Linux Mozilla", "Linux Firefox", "Linux Konqueror"]
+    ATTRIBUTES = %w(
+    name
+    first_name
+    last_name
+    title
+    location
+    country
+    industry
+    summary
+    picture
+    projects
+    linkedin_url
+    education
+    groups
+    websites
+    languages
+    skills
+    certifications
+    organizations
+    past_companies
+    current_companies
+    recommended_visitors)
+    attr_reader :page, :linkedin_url
+    def self.get_profile(url)
+      Linkedin::Profile.new(url)
+    rescue => e
+      puts e
+    end
+    def initialize(url)
+      @linkedin_url = url
+      @page = http_client.get(url)
+    end
+    def name
+      "#{first_name} #{last_name}"
+    end
+    def first_name
+      @first_name ||= (@page.at(".full-name").text.split(" ", 2)[0].strip if @page.at(".full-name"))
+    end
+    def last_name
+      @last_name ||= (@page.at(".full-name").text.split(" ", 2)[1].strip if @page.at(".full-name"))
+    end
+    def title
+      @title ||= (@page.at(".title").text.gsub(/\s+/, " ").strip if @page.at(".title"))
+    end
+    def location
+      @location ||= (@page.at(".locality").text.split(",").first.strip if @page.at(".locality"))
+    end
+    def country
+      @country ||= (@page.at(".locality").text.split(",").last.strip if @page.at(".locality"))
+    end
+    def industry
+      @industry ||= (@page.at(".industry").text.gsub(/\s+/, " ").strip if @page.at(".industry"))
+    end
+    def summary
+      @summary ||= (@page.at(".summary .description").text.gsub(/\s+/, " ").strip if @page.at(".summary .description"))
+    end
+    def picture
+      @picture ||= (@page.at(".profile-picture img").attributes["src"].value.strip if @page.at(".profile-picture img"))
+    end
+    def skills
+      @skills ||= (@page.search(".skill-pill .endorse-item-name-text").map { |skill| skill.text.strip if skill.text } rescue nil)
+    end
+    def past_companies
+      @past_companies ||= get_companies("past")
+    end
+    def current_companies
+      @current_companies ||= get_companies("current")
+    end
+    def education
+      @education ||= @page.search(".background-education .education").map do |item|
+        name = item.at("h4").text.gsub(/\s+|\n/, " ").strip if item.at("h4")
+        desc = item.search("h5").last.text.gsub(/\s+|\n/, " ").strip if item.search("h5").last
+        degree = item.search("h5").last.at(".degree").text.gsub(/\s+|\n/, " ").strip.gsub(/,$/, "") if item.search("h5").last.at(".degree")
+        major = item.search("h5").last.at(".major").text.gsub(/\s+|\n/, " ").strip      if item.search("h5").last.at(".major")
+        period = item.at(".education-date").text.gsub(/\s+|\n/, " ").strip if item.at(".education-date")
+        start_date, end_date = item.at(".education-date").text.gsub(/\s+|\n/, " ").strip.split(" – ") rescue nil
+        {:name => name, :description => desc, :degree => degree, :major => major, :period => period, :start_date => start_date, :end_date => end_date }
+      end
+    end
+    def websites
+      @websites ||= @page.search("#overview-summary-websites").flat_map do |site|
+        url = "http://www.linkedin.com#{site.at("a")["href"]}"
+        CGI.parse(URI.parse(url).query)["url"]
+      end
+    end
+    def groups
+      @groups ||= @page.search(".groups-name").map do |item|
+        name = item.text.gsub(/\s+|\n/, " ").strip
+        link = "http://www.linkedin.com#{item.at("a")["href"]}"
+        { :name => name, :link => link }
+      end
+    end
+    def organizations
+      @organizations ||= @page.search("#background-organizations .section-item").map do |item|
+        name = item.at(".summary").text.gsub(/\s+|\n/, " ").strip rescue nil
+        start_date, end_date = item.at(".organizations-date").text.gsub(/\s+|\n/, " ").strip.split(" – ") rescue nil
+        start_date = Date.parse(start_date) rescue nil
+        end_date = Date.parse(end_date)   rescue nil
+        { :name => name, :start_date => start_date, :end_date => end_date }
+      end
+    end
+    def languages
+      @languages ||= @page.search(".background-languages #languages ol li").map do |item|
+        language = item.at("h4").text rescue nil
+        proficiency = item.at("div.languages-proficiency").text.gsub(/\s+|\n/, " ").strip rescue nil
+        { :language => language, :proficiency => proficiency }
+      end
+    end
+    def certifications
+      @certifications ||= @page.search("background-certifications").map do |item|
+        name       = item.at("h4").text.gsub(/\s+|\n/, " ").strip rescue nil
+        authority  = item.at("h5").text.gsub(/\s+|\n/, " ").strip rescue nil
+        license    = item.at(".specifics/.licence-number").text.gsub(/\s+|\n/, " ").strip rescue nil
+        start_date = item.at(".certification-date").text.gsub(/\s+|\n/, " ").strip rescue nil
+        { :name => name, :authority => authority, :license => license, :start_date => start_date }
+      end
+    end
+    def recommended_visitors
+      @recommended_visitors ||= @page.search(".insights-browse-map/ul/li").map do |visitor|
+        v = {}
+        v[:link] = visitor.at("a")["href"]
+        v[:name] = visitor.at("h4/a").text
+        v[:title] = visitor.at(".browse-map-title").text.gsub("...", " ").split(" at ").first
+        v[:company] = visitor.at(".browse-map-title").text.gsub("...", " ").split(" at ")[1]
+        v
+      end
+    end
+    def projects
+      @projects ||= @page.search(".background-projects/div").map do |project|
+        project = project.at("div")
+        p = {}
+        start_date, end_date = project.at(".projects-date").text.gsub(/\s+|\n/, " ").strip.split(" – ") rescue nil
+        p[:title] = project.at("hgroup/h4 span:first-of-type").text rescue nil
+        p[:link] =  project.at("hgroup/h4 a:first-of-type")['href'] rescue nil
+        p[:start_date] = parse_date(start_date) rescue nil
+        p[:end_date] = parse_date(end_date)  rescue nil
+        p[:description] = project.at(".description").text rescue nil
+        p[:associates] = project.at(".associated-list ul").children.map{ |c| c.at("a").text } rescue nil
+        p
+      end
+    end
+    def to_json
+      require "json"
+      ATTRIBUTES.reduce({}){ |hash,attr| hash[attr.to_sym] = self.send(attr.to_sym);hash }.to_json
+    end
+    private
+    def get_companies(type)
+      companies = []
+      if @page.search(".background-experience .#{type}-position").first
+        @page.search(".background-experience .#{type}-position").each do |node|
+          company = {}
+          company[:title] = node.at("h4").text.gsub(/\s+|\n/, " ").strip if node.at("h4")
+          company[:company] = node.at("h4").next.text.gsub(/\s+|\n/, " ").strip if node.at("h4").next
+          company[:description] = node.at(".description").text.gsub(/\s+|\n/, " ").strip if node.at(".description")
+          start_date, end_date = node.at(".experience-date-locale").text.strip.split(" – ") rescue nil
+          company[:duration] = node.at(".experience-date-locale").text[/.*\((.*)\)/, 1]
+          company[:start_date] = parse_date(start_date) rescue nil
+          company[:end_date] = parse_date(end_date) rescue nil
+          company_link = node.at("h4").next.at("a")["href"] if node.at("h4").next.at("a")
+          result = get_company_details(company_link)
+          companies << company.merge!(result)
+        end
+      end
+      companies
+    end
+    def parse_date(date)
+      date = "#{date}-01-01" if date =~ /^(19|20)\d{2}$/
+      Date.parse(date)
+    end
+    def get_company_details(link)
+      result = { :linkedin_company_url => get_linkedin_company_url(link) }
+      page = http_client.get(result[:linkedin_company_url])
+      result[:url] = page.at(".basic-info-about/ul/li/p/a").text if page.at(".basic-info-about/ul/li/p/a")
+      node_2 = page.at(".basic-info-about/ul")
+      if node_2
+        node_2.search("p").zip(node_2.search("h4")).each do |value, title|
+          result[title.text.gsub(" ", "_").downcase.to_sym] = value.text.strip
+        end
+      end
+      result[:address] = page.at(".vcard.hq").at(".adr").text.gsub("\n", " ").strip if page.at(".vcard.hq")
+      result
+    end
+    def http_client
+      Mechanize.new do |agent|
+        agent.user_agent_alias = USER_AGENTS.sample
+        agent.max_history = 0
+      end
+    end
+    def get_linkedin_company_url(link)
+      http = %r{http://www.linkedin.com/}
+      https = %r{https://www.linkedin.com/}
+      if http.match(link) || https.match(link)
+        link
+      else
+        "http://www.linkedin.com/#{link}"
+      end
+    end
+  end
+end

data/lib/{linkedin-scraper → linkedin_scraper}/version.rb RENAMED Viewed

@@ -1,5 +1,5 @@
 module Linkedin
   module Scraper
-    VERSION = '0.1.5'
+    VERSION = '0.1.7'
   end
 end

data/lib/linkedin_scraper.rb ADDED Viewed

@@ -0,0 +1,5 @@
+require "rubygems"
+require "mechanize"
+require "cgi"
+require "net/http"
+Dir["#{File.expand_path(File.dirname(__FILE__))}/linkedin_scraper/*.rb"].each { |file| require file }

data/linkedin-scraper.gemspec CHANGED Viewed

@@ -1,9 +1,9 @@
 # -*- encoding: utf-8 -*-
-require File.expand_path('../lib/linkedin-scraper/version', __FILE__)
+require File.expand_path('../lib/linkedin_scraper/version', __FILE__)
 Gem::Specification.new do |gem|
   gem.authors       = ['Yatish Mehta']
-  gem.description   = %q{Scrapes the linkedin profile when a url is given }
+  gem.description   = %q{Scrapes the LinkedIn profile using the public url }
   gem.summary       = %q{when a url of  public linkedin profile page is given it scrapes the entire page and converts into a accessible object}
   gem.homepage      = 'https://github.com/yatishmehta27/linkedin-scraper'
   gem.files         = `git ls-files`.split($\)
@@ -13,9 +13,10 @@ Gem::Specification.new do |gem|
   gem.require_paths = ['lib']
   gem.version       = Linkedin::Scraper::VERSION
-  gem.add_dependency(%q<mechanize>, ['>= 0'])
-  gem.add_development_dependency 'rspec', '>=0'
-  gem.add_development_dependency 'rake'
+  gem.license       = "MIT"
+  gem.add_dependency 'mechanize', '~> 2'
+  gem.add_development_dependency 'rspec', '~> 3'
+  gem.add_development_dependency 'rake', '~> 10'
 end