linkedindata 0.0.17 → 0.0.18

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: a5c91c293adffc48f543a68f568efbb3da7995ea
-   data.tar.gz: 61d5376780067945e8666e9aceb2485a58b4391b
+   metadata.gz: 62911808bef43a12c8723a47135534fd7ff330fb
+   data.tar.gz: 4012d7ef04d34401d79ee1c3b4150e3a353358fc
  SHA512:
-   metadata.gz: 03651bdc5fc45d1c4ca3d15818029f4f8a5b7e743996b9d58955d24354544fbb1f49e459d2b1ad22af3f117a3342bf8b792ae39eca9f84a9f77e35b67cdff303
-   data.tar.gz: a1fb3827faa3f640769cc87d1a84a8c9857e15f4c35151d3b6776ce557ed921e4bbae6a588791452e2b29f951d8525fd16709aa899dc4072bdf01a2a5c2ccdf6
+   metadata.gz: 1912abe3d5349f5cbbcd4c06ab699926859fb78f75c96aef49f932351791d325a7101b7e585ee81fc0c87869d0b5440d3a7c9c77ecf926a5d280da50a7a1e023
+   data.tar.gz: a4de7d6888cd3ef25edf8037a7624a570890882744305694cc9a45f564b0094b08ae2d4bf0789569915e6ab9e9d56731b873f5bf714f042fb9af5fcfacdc2539
data/lib/get_related.rb ADDED
@@ -0,0 +1,80 @@
+ module GetRelated
+   # Get the list of names of related people
+   def getList(html)
+     namelist = Array.new
+
+     # Save each person's name and url
+     html.css("div.insights-browse-map").each do |d|
+       if d.css("h3").text == "People Also Viewed"
+         d.css("li").each do |l|
+           namelist.push({name: l.css("h4").text,
+                          url: l.css("a")[0]['href']})
+         end
+       end
+     end
+
+     return namelist
+   end
+
+
+   # Get all profiles within numhops of original(s)
+   def getRelatedProfiles
+     @numhops.times do |hop_count|
+       @output.select { |profile| profile[:degree] == hop_count }.each do |item|
+         downloadRelated(item, hop_count) if item[:related_people]
+       end
+     end
+   end
+
+   # Scrapes the related profiles for one result item
+   def downloadRelated(item, hop_count)
+     item[:related_people].each do |related_person|
+       # Check if it has been scraped already
+       if @output.select { |person| related_person[:name] == person[:name] }.empty?
+         scrape(related_person[:url], hop_count+1)
+       end
+     end
+   end
+
+
+   # Make list of profiles for score tracking
+   def fullProfileList(data)
+     profiles = Hash.new
+     data.each do |d|
+       profiles[d[:profile_url]] = 0
+     end
+     return profiles
+   end
+
+   # Adds points to a profile for showing up in related people
+   def addPointsToProfile(profile_scores, data_item, person)
+     if profile_scores[person[:url]]
+       # Score contribution is 2/(degree*2); a degree of 0 counts as 1
+       degree_divide = data_item[:degree] == 0 ? 1 : data_item[:degree]*2
+       profile_scores[person[:url]] += (2.0/degree_divide)
+     end
+     return profile_scores
+   end
+
+   # Add a score to each profile based on the # of times it appears in "people also viewed"
+   def relScore(data)
+     profile_scores = fullProfileList(data)
+
+     # Get degree and calculate score for each profile
+     data.each do |data_item|
+       if data_item[:related_people]
+         data_item[:related_people].each do |person|
+           profile_scores = addPointsToProfile(profile_scores, data_item, person)
+         end
+       end
+     end
+
+     # Merge scores back into dataset
+     data.each do |m|
+       m.merge!(score: profile_scores[m[:profile_url]])
+     end
+
+     return data
+   end
+ end
+
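
A note on the scoring above: each time a profile shows up in another profile's "People Also Viewed" list, it earns 2.0/(degree*2) points, where degree is the hop count of the profile it appears on, and degree 0 is special-cased so a seed-profile appearance is worth 2.0, one hop 1.0, two hops 0.5, and so on. A minimal sketch of relScore in isolation, with fabricated URLs and assuming get_related.rb is loadable from the current directory:

# Minimal sketch; URLs are fabricated.
require_relative "get_related"
include GetRelated

data = [
  { profile_url: "http://linkedin.com/pub/a", degree: 0,
    related_people: [{ name: "B", url: "http://linkedin.com/pub/b" }] },
  { profile_url: "http://linkedin.com/pub/b", degree: 1,
    related_people: [{ name: "A", url: "http://linkedin.com/pub/a" }] }
]

relScore(data).each do |p|
  puts "#{p[:profile_url]} => #{p[:score]}"
end
# http://linkedin.com/pub/a => 1.0   (appears on a degree-1 profile)
# http://linkedin.com/pub/b => 2.0   (appears on a degree-0 profile)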
data/lib/linkedin.rb ADDED
@@ -0,0 +1,66 @@
+ # Someone already made a nice gem for parsing public profiles:
+ # https://github.com/yatish27/linkedin-scraper
+ # This class reopens that to add extra things I need
+ module Linkedin
+   class Profile
+     include ProxyManager
+     include GetRelated
+
+     def initialize(url, curhops, proxylist, usedproxies)
+       @linkedin_url = url
+       @curhops = curhops
+       @proxylist = proxylist
+       @usedproxies = usedproxies
+
+       # Add attributes to list
+       ATTRIBUTES.push(
+         "related_people",
+         "profile_url",
+         "timestamp",
+         "degree",
+         "pic_path")
+       @page = getPage(url) # Get page with proxies
+     end
+
+
+     def self.get_profile(url, curhops, proxylist, usedproxies)
+       Linkedin::Profile.new(url, curhops, proxylist, usedproxies)
+     rescue => e
+       puts e
+     end
+
+     # Gets "people also viewed" list from the profile sidebar
+     def related_people
+       @related_people ||= getList(Nokogiri::HTML(@page.body))
+     end
+
+     # Similar to linkedin_url
+     def profile_url
+       @profile_url ||= @linkedin_url
+     end
+
+     # Get the time the profile was scraped
+     def timestamp
+       @timestamp ||= Time.now
+     end
+
+     # Get the number of hops out where profile appears
+     def degree
+       @degree ||= @curhops
+     end
+
+     # Download the profile picture
+     def pic_path
+       if picture
+         # Get path
+         dir = "public/uploads/pictures/"
+         full_path = dir + picture.split("/").last.chomp.strip
+
+         # Get file
+         `wget -P #{dir} #{picture}` if !File.file?(full_path)
+         return full_path
+       end
+     end
+
+   end
+ end
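
A sketch of driving the reopened class directly, assuming the proxy list is an array of proxy strings (as IO.readlines produces in linkedindata.rb) and that the external ProxyManager mixin supplies getPage; the profile URL and proxies.txt filename are hypothetical:

# Hypothetical usage of the reopened class above.
proxies = IO.readlines("proxies.txt")   # assumed format: one proxy per line
profile = Linkedin::Profile.get_profile(
  "http://linkedin.com/pub/example/1/23/456", 0, proxies, Hash.new)

if profile
  p profile.related_people   # => [{name: "...", url: "..."}, ...]
  p profile.degree           # => 0 for a seed profile
end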
data/lib/linkedindata.rb CHANGED
@@ -1,81 +1,46 @@
- require 'mechanize'
  require 'linkedin-scraper'
+ require 'generalscraper'
  require 'json'
  require 'nokogiri'
- require 'open-uri'
- load 'parseprofile.rb'
- require 'pry'
- require 'urlarchiver'
  require 'set'

+ load 'parse_profile.rb'
+ load 'get_related.rb'
+ load 'linkedin.rb'
+
  class LinkedinData
-   def initialize(input, todegree)
-     @input = input
+   include GetRelated
+   include ParseProfile
+   include Linkedin
+
+   def initialize(todegree, proxylist)
+     @proxylist = IO.readlines(proxylist)
+     @proxy_list_path = proxylist
+     @usedproxies = Hash.new
      @output = Array.new
      @startindex = 10
      @numhops = todegree
    end

    # Searches for profiles on Google
-   def search
-     agent = Mechanize.new
-     agent.user_agent_alias = 'Linux Firefox'
-     gform = agent.get("http://google.com").form("f")
-     gform.q = "site:linkedin.com/pub " + @input
-     page = agent.submit(gform, gform.buttons.first)
-     examine(page)
-   end
-
-   # Examines a search page
-   def examine(page)
-     # Separate getting profile links and going to next page
-     # Method for getting links to all result pages
-     # Different method for getting all profile links on page and scraping (split to new thread for this)
-     # Has own output set, merge into full one at end (make sure threadsafe)
-
-     # Have own input and output
-     page.links.each do |link|
-       if (link.href.include? "linkedin.com") && (!link.href.include? "webcache") && (!link.href.include? "site:linkedin.com/pub+")
-         saveurl = link.href.split("?q=")
-
-         if saveurl[1]
-           url = saveurl[1].split("&")
-           begin
-             scrape(url[0], 0)
-           rescue
-           end
-         end
-       end
-
-       # Find the link to the next page and go to it
-       if (link.href.include? "&sa=N") && (link.href.include? "&start=")
-         url1 = link.href.split("&start=")
-         url2 = url1[1].split("&sa=N")
-
-         if url2[0].to_i == @startindex
-           sleep(rand(30..90))
-           @startindex += 10
-           agent = Mechanize.new
-           examine(agent.get("http://google.com" + link.href))
-         end
-       end
+   def search(search_terms)
+     g = GeneralScraper.new("site:linkedin.com/pub", search_terms, @proxy_list_path)
+     JSON.parse(g.getURLs).each do |profile|
+       scrape(profile, 0)
      end
    end

-   # Scrapes profile
+   # Scrapes and parses individual profile
    def scrape(url, curhops)
      # Download profile and rescue on error
      begin
        url.gsub!("https", "http")
-       profile = Linkedin::Profile.get_profile(url)
+       profile = Linkedin::Profile.get_profile(url, curhops, @proxylist, @usedproxies)
      rescue
      end

-     # Parse profile if returned
-     if profile
-       p = ParseProfile.new(profile, url, curhops)
-       @output.concat(p.parse)
-     end
+     # Parse profile if returned and add to output
+     @output.concat(parseResume(profile)) if profile
    end

    # Make sure all keys that occur occur in each item (even if nil)
@@ -101,59 +66,22 @@ class LinkedinData
      return datarr
    end

-   # Add a score to each profile based on the # of times it appears in "people also viewed"
-   def relScore(data)
-
-     # Make list of profiles
-     profiles = Hash.new
-     data.each do |d|
-       profiles[d["profile_url"]] = 0
-     end
-
-     # Get degree for each profile
-     data.each do |i|
-       if i["related_people"]
-         i["related_people"].each do |p|
-           if profiles[p["url"]]
-             # Calculate degree- (2/d*2) except when degree is 0
-             degree_divide = i["degree"] == 0 ? 1 : i["degree"]*2
-             profiles[p["url"]] += (2.0/degree_divide)
-           end
-         end
-       end
-     end
-
-     # Merge scores back into dataset
-     data.each do |m|
-       m.merge!(:score => profiles[m["profile_url"]])
-     end
-
-     return data
+   # Gets related profiles then adds relevance scores and any missing keys
+   def prepareResults
+     getRelatedProfiles
+     deleteDuplicatePics
+     return JSON.pretty_generate(relScore(showAllKeys(@output)))
    end

-   # Gets all data and returns in JSON
-   def getData
-     search
-
-     # Get related profiles
-     @numhops.times do
-       @output.each do |o|
-         if o[:degree] < @numhops
-
-           if o[:related_people]
-             o[:related_people].each do |i|
-               if @output.select { |obj| obj[:name] == i[:name]}.empty?
-                 scrape(i[:url], o[:degree]+1)
-               end
-             end
-           end
-
-         end
-       end
-     end
-
-     formatted_json = JSON.pretty_generate(relScore(showAllKeys(@output)))
-     return formatted_json
+   # Gets one profile and the related profiles
+   def getSingleProfile(url)
+     scrape(url, 0)
+     return prepareResults
+   end
+
+   # Gets all profiles in search results and returns in JSON
+   def getByKeywords(search_term)
+     search(search_term)
+     return prepareResults
    end
  end
-
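
With these changes the public entry points are getSingleProfile and getByKeywords rather than the old getData. A minimal usage sketch of the 0.0.18 API, assuming a proxy list file of one proxy per line (the filename and URL are hypothetical):

# Minimal sketch; values are hypothetical.
require 'linkedindata'

l = LinkedinData.new(2, "proxies.txt")   # follow "People Also Viewed" out to 2 hops
json = l.getByKeywords("transparency")   # scrape Google results for matching /pub profiles
# or, starting from one known profile:
# json = l.getSingleProfile("http://linkedin.com/pub/example/1/23/456")
puts json                                # pretty-printed JSON with relevance scores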
data/lib/parse_profile.rb ADDED
@@ -0,0 +1,50 @@
+ module ParseProfile
+   # Parse profile into items by company
+   def parseResume(profile)
+     output = Array.new
+
+     # Parse profiles for current companies
+     profile.current_companies.each do |c|
+       output.push(addPersonFields(c, "Yes", profile))
+     end
+
+     # Parse past position/company info
+     profile.past_companies.each do |c|
+       output.push(addPersonFields(c, "No", profile))
+     end
+
+     return output
+   end
+
+   # Deletes duplicate pictures
+   def deleteDuplicatePics
+     pics = Dir["public/uploads/pictures/*.jpg.*"]
+     pics.each do |p|
+       File.delete(p)
+     end
+   end
+
+   # Merge person data with role data
+   def addPersonFields(c, status, profile)
+     c.merge!(
+       skills: profile.skills,
+       certifications: profile.certifications,
+       languages: profile.languages,
+       name: profile.name,
+       location: profile.location,
+       area: profile.country,
+       industry: profile.industry,
+       picture: profile.picture,
+       organizations: profile.organizations,
+       groups: profile.groups,
+       education: profile.education,
+       websites: profile.websites,
+       profile_url: profile.profile_url,
+       current: status,
+       timestamp: profile.timestamp,
+       related_people: profile.related_people,
+       degree: profile.degree,
+       pic_path: profile.pic_path)
+     return c
+   end
+ end
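
Each element parseResume returns is one position: a company hash from the linkedin-scraper gem merged with the person-level fields above. A fabricated example of one finished item (company-level keys such as :title and :company come from linkedin-scraper and are assumptions here; :score is merged in later by relScore):

# Fabricated for illustration only; not real data.
{
  title: "Software Engineer",      # assumed linkedin-scraper company key
  company: "ExampleCorp",          # assumed linkedin-scraper company key
  name: "Jane Doe",
  current: "Yes",
  degree: 1,
  profile_url: "http://linkedin.com/pub/janedoe",
  related_people: [{ name: "John Doe", url: "http://linkedin.com/pub/johndoe" }],
  pic_path: "public/uploads/pictures/janedoe.jpg",
  score: 1.0
  # ...plus skills, certifications, languages, education, and the other merged fields
}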
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: linkedindata
  version: !ruby/object:Gem::Version
-   version: 0.0.17
+   version: 0.0.18
  platform: ruby
  authors:
  - M. C. McGrath
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2015-02-24 00:00:00.000000000 Z
+ date: 2015-04-11 00:00:00.000000000 Z
  dependencies: []
  description: Scrapes all LinkedIn profiles including terms you specify.
  email: shidash@shidash.com
@@ -16,9 +16,10 @@ executables: []
  extensions: []
  extra_rdoc_files: []
  files:
- - lib/getrelated.rb
+ - lib/get_related.rb
+ - lib/linkedin.rb
  - lib/linkedindata.rb
- - lib/parseprofile.rb
+ - lib/parse_profile.rb
  homepage: https://github.com/transparencytoolkit/linkedindata
  licenses:
  - GPL
data/lib/getrelated.rb DELETED
@@ -1,55 +0,0 @@
- require 'json'
- require 'nokogiri'
- require 'open-uri'
-
- class GetRelated
-   def initialize(url)
-     @url = url
-     @relatedlist = Array.new
-   end
-
-   # Get the list of names of related people
-   def getList
-     html = Nokogiri::HTML(open(@url.gsub("http", "https")))
-
-     if html
-       namelist = Array.new
-
-       # Go through each person
-       html.css("div.insights-browse-map").each do |d|
-         if d.css("h3").text == "People Also Viewed"
-           d.css("li").each do |l|
-             temphash = Hash.new
-             temphash[:name] = l.css("h4").text
-             temphash[:url] = l.css("a")[0]['href']
-             namelist.push(temphash)
-           end
-         end
-       end
-
-       return namelist
-     end
-   end
- end
-
- # This is just an outline for the next version of getrelated
-
- # Add degree back as field (0 by default)
- # Loop through all profiles
-   # Load n times (need to determine optimal num)
-   # Save list of related people (for profile- make list and append if seen listed as related or in related list)
-   # Save overall list of related people (with URLs and min degree)
-   # Track min degrees out
-
- # Go through overall list of related people
-   # Parse profile
-   # Make sure degree is correct when saved
-   # Maybe save in JSONs by degree
-
-
- # Info:
- # Profiles of related people
- # Degrees for all profiles
- # Related people list on each profile (complete)
-
- # Deduplicate
data/lib/parseprofile.rb DELETED
@@ -1,79 +0,0 @@
- require 'json'
- load 'getrelated.rb'
- require 'pry'
-
- class ParseProfile
-   def initialize(profile, url, curhops)
-     @profile = profile
-     @url = url
-     @output = Array.new
-     @related_people
-     @curhops = curhops
-   end
-
-   # Parse profile
-   def parse
-     begin
-       g = GetRelated.new(@url)
-       @related_people = g.getList
-     rescue
-     end
-
-     # Parse profiles for current companies
-     @profile.current_companies.each do |c|
-       @output.push(parseCompany(c, "Yes"))
-     end
-
-     # Parse past position/company info
-     @profile.past_companies.each do |c|
-       @output.push(parseCompany(c, "No"))
-     end
-
-     # Clean up directories
-     pics = Dir["public/uploads/*.jpg.*"]
-     pics.each do |p|
-       File.delete(p)
-     end
-
-     return @output
-   end
-
-   # Merge person data with role data
-   def parseCompany(c, status)
-     c.merge!(
-       :skills => @profile.skills,
-       :certifications => @profile.certifications,
-       :languages => @profile.languages,
-       :name => @profile.first_name + " " + @profile.last_name,
-       :location => @profile.location,
-       :area => @profile.country,
-       :industry => @profile.industry,
-       :picture => @profile.picture,
-       :organizations => @profile.organizations,
-       :groups => @profile.groups,
-       :education => @profile.education,
-       :websites => @profile.websites,
-       :profile_url => @url,
-       :current => status,
-       :timestamp => Time.now,
-       :related_people => @related_people,
-       :degree => @curhops)
-     c.merge!(:pic_path => getPic)
-     return c
-   end
-
-   # Download pictures
-   def getPic
-     if @profile.picture
-       path = @profile.picture.split("/")
-       if !File.file?("public/uploads/pictures/" + path[path.length-1].chomp.strip)
-         begin
-           `wget -P public/uploads/pictures #{@profile.picture}`
-         rescue
-         end
-       end
-
-       return "public/uploads/pictures/" + path[path.length-1].chomp.strip
-     end
-   end
- end