RubyGems - linsc - Versions diffs - 0.0.2 - Mend

linsc 0.0.2

Files changed (26) hide show

@@ -0,0 +1,70 @@
+require_relative 'csv_handlers'
+class Merger
+  include CSVHandlers
+  def initialize(input_dir, output_path, mapping = nil)
+    @input_dir, @output_path, @mapping = input_dir, output_path, mapping
+    recruiter_file = Pathname.new(File.dirname __dir__).realdirpath + '../data/recruiters.txt'
+    @recruiters = recruiter_file.read.split(",").collect{|r| r.strip}
+    @lin_files = @input_dir.children.select{|fn| fn.to_s.match(/LIN.+\.csv/)}
+    if mapping
+      @headers = mapping.values
+    else
+      @headers = get_headers(@lin_files.first)
+    end
+    if File.exist?(@output_path)
+      File.delete(@output_path)
+    end
+    create_file(@output_path)
+  end
+  def construct_emails_hash
+    emails = {}
+    @lin_files.each do |pn|
+      lin_file = pn.to_s
+      recruiter_name = lin_file.match(/LIN[^.]+/)[0]
+      puts "merging #{recruiter_name}"
+      clean_file = File.read(lin_file, encoding: 'windows-1252').strip
+      CSV.parse(clean_file, headers: true, encoding: 'windows-1252') do |row|
+        row["Recruiter"] = recruiter_name
+        email = row['E-mail Address']&.downcase
+        if emails.has_key?(email)
+          emails[email] << row
+        else
+          emails[email] = [row]
+        end
+      end
+    end
+    emails
+  end
+  def merge
+    emails = construct_emails_hash
+    i = 0
+    j = emails.length
+    emails.each do |ek, ev|
+      i += 1
+      puts "merging - row #{i}/#{j}"
+      correct_row = ev.find do |row|
+        row['Recruiter'] == @recruiters.find do |rec|
+           ev.collect {|row| row['Recruiter']}.include?(rec)
+        end
+      end
+      if @mapping
+        output_row = CSV::Row.new(@headers, [])
+        correct_row.each do |key, value|
+          if @mapping[key]
+            output_row[@mapping[key]] = value&.encode('utf-8')
+          end
+        end
+        output_row['Email'] = output_row['Email']&.downcase
+      else
+        output_row = create_row(correct_row, @headers, 'utf-8')
+      end
+      append_to_csv(@output_path, output_row)
+    end
+    @output_path
+  end
+end

data/lib/linsc/parsers.rb ADDED

@@ -0,0 +1,320 @@
+module Parsers
+  def scrape_contact(input_row, page, mode)
+    row = CSV::Row.new(@headers, [])
+    name = page.at_css("#name")&.text&.split
+    contact_id = input_row["Contact ID"]
+    lin_id = input_row["LIN ID"]
+    cv_tr = input_row["CV TR"]
+    acc_name = input_row["Account Name"]
+    import_status = input_row["Linkedin Import Status"]
+    email = input_row["Email"]
+    lin_profile = input_row["Linkedin Profile"]
+    cand_id = input_row["Candidate ID"]
+    cand_source = input_row["LIN 1st Degree"]
+    title = page.at_css(".headline.title")&.text
+    country = page.at_css("#demographics .locality")&.text
+    sector = page.at_css("#demographics .descriptor:not(.adr)")&.text
+    positions = page.css("#experience .positions .position")
+    if positions
+      e1_title = positions[0]&.at_css(".item-title")&.text
+      e1_org = positions[0]&.at_css(".item-subtitle")&.text
+      e1_start = positions[0]&.css(".date-range time")[0]&.text
+      e1_end = positions[0]&.css(".date-range time")[1]&.text
+      e1_loc = positions[0]&.at_css(".location")&.text
+      e1_desc = positions[0]&.at_css(".description")&.text
+      e2_title = positions[1]&.at_css(".item-title")&.text
+      e2_org = positions[1]&.at_css(".item-subtitle")&.text
+      e2_start = positions[1]&.css(".date-range time")[0]&.text
+      e2_end = positions[1]&.css(".date-range time")[1]&.text
+      e2_loc = positions[1]&.at_css(".location")&.text
+      e2_desc = positions[1]&.at_css(".description")&.text
+      e3_title = positions[2]&.at_css(".item-title")&.text
+      e3_org = positions[2]&.at_css(".item-subtitle")&.text
+      e3_start = positions[2]&.css(".date-range time")[0]&.text
+      e3_end = positions[2]&.css(".date-range time")[1]&.text
+      e3_loc = positions[2]&.at_css(".location")&.text
+      e3_desc = positions[2]&.at_css(".description")&.text
+    end
+    certs = page.css(".certifications .certification")
+    if certs
+      c1_name = certs[0]&.at_css(".item-title")&.text
+      c2_name = certs[1]&.at_css(".item-title")&.text
+      c_type  = certs[0]&.at_css(".item-subtitle")&.text
+    end
+    schools = page.css("#education .schools .school")
+    if schools
+      s1_name = schools[0]&.at_css(".item-title")&.text
+      s2_name = schools[1]&.at_css(".item-title")&.text
+      s1_start = schools[0]&.css(".date-range time")[0]&.text
+      s2_start = schools[1]&.css(".date-range time")[0]&.text
+      s1_end = schools[0]&.css(".date-range time")[1]&.text
+      s2_end = schools[1]&.css(".date-range time")[1]&.text
+      s1_degree = schools[0]&.at_css(".item-subtitle")&.text
+      s2_degree = schools[1]&.at_css(".item-subtitle")&.text
+    end
+    summary = page.at_css("#summary .description")
+    summary&.css('br').each{|br| br.replace "\n"} if summary
+    text_resume = "\n\n***IMPORTED FROM LINKEDIN***\n#{lin_profile}\n\n"
+    text_resume += name.join(" ")
+    text_resume += "\n#{email}"
+    text_resume += "\nTitle: #{title}" if title
+    text_resume += "\nLocation: #{country}" if country
+    text_resume += "\nSector: #{sector}" if sector
+    text_resume += "\n\nSUMMARY\n#{summary.text}" if summary
+    text_resume += "\n\nEXPERIENCE\n" if positions && positions.length > 0
+    positions.each do |position|
+      jtitle = position.at_css(".item-title")
+      jcompany = position.at_css(".item-subtitle")
+      jdates = position.at_css(".date-range")
+      jlocation = position.at_css(".location")
+      jdesc = position.at_css(".description")
+      jdesc.css('br').each{|br| br.replace "\n"} if jdesc
+      text_resume += "\n#{jtitle.text}\n" if jtitle
+      text_resume += " - #{jcompany.text}\n" if jcompany && jcompany.text.length > 0
+      text_resume += "#{jdates.text}\n" if jdates
+      text_resume += "#{jlocation.text}\n" if jlocation
+      text_resume += "#{jdesc.text}\n" if jdesc
+    end
+    text_resume += "\n\nEDUCATION\n" if schools && schools.length > 0
+    schools.each do |school|
+      stitle = school.at_css(".item-title")
+      sdegree = school.at_css(".item-subtitle")
+      sdates = school.at_css(".date-range")
+      sdesc = school.at_css(".description")
+      sdesc.css('br').each{|br| br.replace "\n"} if sdesc
+      text_resume += "\n#{stitle.text}\n" if stitle
+      text_resume += " - #{sdegree.text}\n" if sdegree && sdegree.text.length > 0
+      text_resume += "#{sdates.text}\n" if sdates
+      text_resume += "#{sdesc.text}\n" if sdesc
+    end
+    text_resume  += "\n\nCERTIFICATIONS\n" if certs && certs.length > 0
+    certs.each do |cert|
+      ctitle = cert.at_css(".item-title")
+      csub = cert.at_css(".item-subtitle")
+      cdates = cert.at_css(".date-range")
+      text_resume += "\n#{ctitle.text}\n" if ctitle
+      text_resume += "#{csub.text}\n" if csub
+      text_resume += "#{cdates.text}\n" if cdates
+    end
+    interests = page.css("#interests .pills .interest")
+    text_resume += "\nINTERESTS\n" if interests && interests.length > 0
+    ints = []
+    interests.each do |interest|
+      int = interest.at_css(".wrap")&.text
+      if int
+        ints << int unless (int == "See less") || (int.match(/See \d+\+/))
+      end
+    end
+    text_resume += "#{ints.join(", ")}\n\n"
+    skills = page.css("#skills .pills .skill")
+    text_resume += "\n\nSKILLS\n" if skills && skills.length > 0
+    sks = []
+    skills.each do |skill|
+      sk = skill.at_css(".wrap")&.text
+      if sk
+        sks << sk unless (sk == "See less") || (sk.match(/See \d+\+/))
+      end
+    end
+    text_resume += "#{sks.join(", ")}\n\n"
+    languages = page.css("#languages .language")
+    text_resume += "\n\nLANGUAGES\n" if languages.length > 0
+    langs = []
+    languages.each do |language|
+      lang = language.at_css(".name")&.text
+      prof = language.at_css(".proficiency")
+      lang += " (#{prof.text})" if prof && prof.text.length > 0
+      langs << lang if lang
+    end
+    text_resume += "#{langs.join(", ")}\n\n"
+    projects = page.css("#projects .project")
+    text_resume += "\n\nPROJECTS\n" if projects && projects.length > 0
+    projects.each do |project|
+      ptitle = project.at_css(".item-title")
+      pdates = project.at_css(".date-range")
+      pdesc = project.at_css(".description")
+      pdesc.css('br').each{|br| br.replace "\n"} if pdesc
+      pcont = project.at_css(".contributors")
+      text_resume += "\n#{ptitle.text}\n" if ptitle
+      text_resume += "#{pdates.text}\n" if pdates
+      text_resume += "#{pdesc.text}\n" if pdesc
+      text_resume += "#{pcont.text}\n " if pcont
+    end
+    pubs = page.css("#publications .publication")
+    text_resume += "\n\nPUBLICATIONS\n" if pubs && pubs.length > 0
+    pubs.each do |pub|
+      pubtitle = pub.at_css(".item-title")
+      pubsub = pub.at_css(".item-subtitle")
+      pubdates = pub.at_css(".date-range")
+      pubdesc = pub.at_css(".description")
+      pubdesc.css('br').each{|br| br.replace "\n"} if pubdesc
+      pubcont = pub.at_css(".contributors")
+      text_resume += "\n#{pubtitle.text}\n" if pubtitle
+      text_resume += "#{pubsub.text}\n" if pubsub
+      text_resume += "#{pubdates.text}\n" if pubdates
+      text_resume += "#{pubdesc.text}\n" if pubdesc
+      text_resume += "#{pubcont.text}\n" if pubcont
+    end
+    vols = page.css("#volunteering .position")
+    text_resume += "\n\nVOLUNTEERING\n" if vols && vols.length > 0
+    vols.each do |vol|
+      voltitle = vol.at_css(".item-title")
+      volsub = vol.at_css(".item-subtitle")
+      voldates = vol.at_css(".date-range")
+      voldesc = vol.at_css(".description")
+      voldesc.css('br').each{|br| br.replace "\n"} if voldesc
+      volcause = vol.at_css(".cause")
+      text_resume += "\n#{voltitle.text}\n" if voltitle
+      text_resume += "#{volsub.text}\n" if volsub
+      text_resume += "#{voldates.text}\n" if voldates
+      text_resume += "Cause: #{volcause.text}\n" if volcause
+      text_resume += "#{voldesc.text}\n" if voldesc
+    end
+    orgs = page.css("#organizations li")
+    text_resume += "\n\nORGANIZATIONS\n" if orgs && orgs.length > 0
+    orgs.each do |org|
+      orgtitle = org.at_css(".item-title")
+      orgsub = org.at_css(".item-subtitle")
+      orgdates = org.at_css(".date-range")
+      orgdesc = org.at_css(".description")
+      orgdesc.css('br').each{|br| br.replace "\n"} if orgdesc
+      text_resume += "\n#{orgtitle.text}\n" if orgtitle
+      text_resume += "#{orgsub.text}\n" if orgsub
+      text_resume += "#{orgdates.text}\n" if orgdates
+      text_resume += "#{orgdesc.text}\n" if orgdesc
+    end
+    pats = page.css("#patents .patent")
+    text_resume += "\n\nPATENTS\n" if pats && pats.length > 0
+    pats.each do |pat|
+      pattitle = pat.at_css(".item-title")
+      patsub = pat.at_css(".item-subtitle")
+      patdates = pat.at_css(".date-range")
+      patdesc = pat.at_css(".description")
+      patdesc.css('br').each{|br| br.replace "\n"} if patdesc
+      patcont = pat.at_css(".contributors")
+      text_resume += "\n#{pattitle.text}\n" if pattitle
+      text_resume += "#{patsub.text}\n" if patsub
+      text_resume += "#{patdates.text}\n" if patdates
+      text_resume += "#{patdesc.text}\n" if patdesc
+      text_resume += "#{patcont.text}\n" if patcont
+    end
+    awards = page.css("#awards .award")
+    text_resume += "\n\nAWARDS\n" if awards && awards.length > 0
+    awards.each do |award|
+      atitle = award.at_css(".item-title")
+      asub = award.at_css(".item-subtitle")
+      adates = award.at_css(".date-range")
+      adesc = award.at_css(".description")
+      adesc.css('br').each{|br| br.replace "\n"} if adesc
+      text_resume += "\n#{atitle.text}\n" if atitle
+      text_resume += "#{asub.text}\n" if asub
+      text_resume += "#{adates.text}\n" if adates
+      text_resume += "#{adesc.text}\n" if adesc
+    end
+    courses = page.css("#courses li")
+    text_resume += "\n\nCOURSES\n" if courses && courses.length > 0
+    courses.each do |course|
+      coutitle = course.at_css(".item-title")
+      coulist = course.at_css(".courses-list")
+      text_resume += "\n#{coutitle.text}\n" if coutitle
+      text_resume += "#{coulist.text}\n" if coulist
+    end
+    row["Contact ID"] = contact_id
+    row["LIN ID"] = lin_id
+    row["CV TR"] = "1"
+    row["Account Name"] = acc_name
+    row["Linkedin Import Status"] = import_status
+    row["First Name"] = name[0]&.slice(0, 39)
+    row["Last Name"] = name[1..-1]&.join(" ")&.slice(0, 79)
+    row["Email"] = email
+    row["Candidate ID"] = cand_id
+    row["LIN 1st Degree"] = cand_source
+    row["Title"] = title&.slice(0, 127)
+    row["Contact Country"] = country
+    row["Contact LIN Sector"] = sector&.slice(0, 99)
+    row["Employer 1 Title"] = e1_title&.slice(0, 31999)
+    row["Employer Organization Name 1"] = e1_org&.slice(0, 254)
+    row["Employer 1 Start Date"] = format_date(e1_start) #format
+    row["Employer 1 End Date"] = format_date(e1_end) #format
+    row["Employer 1 Location"] = e1_loc&.slice(0, 254)
+    row["Employer 1 Description"] = e1_desc&.slice(0, 31999)
+    row["Employer 2 Title"] = e2_title&.slice(0, 31999)
+    row["Employer Organization Name 2"] = e2_org&.slice(0, 254)
+    row["Employer 2 Start Date"] = format_date(e2_start) #format
+    row["Employer 2 End Date"] = format_date(e2_end) #format
+    row["Employer 2 Location"] = e2_loc&.slice(0, 254)
+    row["Employer 2 Description"] = e2_desc&.slice(0, 31999)
+    row["Employer 3 Title"] = e3_title&.slice(0, 31999)
+    row["Employer Organization Name 3"] = e3_org&.slice(0, 254)
+    row["Employer 3 Start Date"] = format_date(e3_start) #format
+    row["Employer 3 End Date"] = format_date(e3_end) #format
+    row["Employer 3 Location"] = e3_loc&.slice(0, 254)
+    row["Employer 3 Description"] = e3_desc&.slice(0, 31999)
+    row["License or Certification Name 1"] = c1_name&.slice(0, 254)
+    row["License or Certification Name 2"] = c2_name&.slice(0, 254)
+    row["License or Certification Credential Type"] = c_type&.slice(0, 254)
+    row["Education School 1"] = s1_name&.slice(0, 124)
+    row["Education Degree Name 1"] = s1_degree&.slice(0, 254)
+    row["Education Degree Date 1"] = format_date(s1_end)
+    row["Education School 2"] = s2_name&.slice(0, 124)
+    row["Education Degree Name 2"] = s2_degree&.slice(0, 254)
+    row["Education Degree Date 2"] = format_date(s2_end)
+    row["Text Resume"] = text_resume&.slice(0, 31999)
+    row["LinkedIn Profile"] = lin_profile&.slice(0, 254)
+    row["Resume Last Updated"] = Time.now.strftime('%Y-%m-%d %H:%M:%S')
+    row["LIN Import Date"] = Time.now.strftime('%Y-%m-%d')
+    row["CV Uploaded"] = "1"
+    row
+  end
+  def scrape_education(input_row, page)
+    rows = []
+    schools = page.css("#education .schools .school")
+    schools.each do |school|
+      row = CSV::Row.new(@education_headers, [])
+      row["Contact"] = input_row["Contact ID"]
+      row["LIN ID"] = input_row["LIN ID"]
+      row["School Name"] = school.at_css(".item-title").text.slice(0, 149)
+      row["Major"] = school.at_css(".item-subtitle").text.slice(0, 254)
+      dstart = school.css(".date-range time")[0]
+      dend = school.css(".date-range time")[1]
+      if dend
+        row["Graduation Year"] = dend.text.gsub(/\D/, '').slice(0, 74)
+      else
+        row["Graduation Year"] = dstart.text.gsub(/\D/, '').slice(0, 74)
+      end
+      rows << row
+    end
+    rows
+  end
+  def scrape_employment(input_row, page)
+    rows = []
+    positions = page.css("#experience .positions .position")
+    positions.each do |position|
+      row = CSV::Row.new(@employment_headers, [])
+      row["Contact"] = input_row["Contact ID"]
+      row["LIN ID"] = input_row["LIN ID"]
+      row["Job Title"] = position.at_css(".item-title").text.slice(0, 74)
+      row["Employer Name"] = position.at_css(".item-subtitle").text.slice(0, 149)
+      jstart = position.css(".date-range time")[0]
+      jend = position.css(".date-range time")[1]
+      row["Start Date"] = format_date(jstart.text)
+      row["End Date"] = format_date(jend.text)
+      row["Location"] = position.at_css(".location").text.slice(0, 254)
+      rows << row
+    end
+    rows
+  end
+end

data/lib/linsc/proxy.rb ADDED

@@ -0,0 +1,30 @@
+class Proxy
+  attr_accessor :ip, :port, :username, :password, :status, :last_used, :user_agent
+  def initialize(ip:, port: 80, username: nil, password: nil, status: nil, last_used: nil, user_agent: nil)
+    @ip, @port, @username, @password, @status, @last_used =
+                  ip, port, username, password, status, last_used
+  end
+  def dead
+    @status = 'dead'
+    @last_used = Time.now
+  end
+  def good
+    @status = 'good'
+    @last_used = Time.now
+  end
+  def good?
+    @status == 'good' ? true : false
+  end
+  def dead?
+    @status == 'dead' ? true : false
+  end
+  def used
+    @last_used = Time.now
+  end
+end

data/lib/linsc/proxy_handler.rb ADDED

@@ -0,0 +1,42 @@
+require_relative 'proxy'
+class ProxyHandler
+  def initialize(cooldown_time = 5)
+    @cooldown_time = cooldown_time
+    @proxy_list = File.read('./../data/proxies.txt').split("\n")
+    .collect{|proxy| proxy.split(':')}
+    @proxies = []
+    @ua_list = File.read('./../data/agents.txt').split("\n")
+    @proxy_list.each do |proxy_details|
+      proxy = Proxy.new(ip: proxy_details[0], port: proxy_details[1],
+      username: proxy_details[2], password: proxy_details[3], status: 'good',
+      last_used: Time.now - @cooldown_time, user_agent: @ua_list.shift)
+      @proxies << proxy
+    end
+    if @proxies.length == 0
+      puts "proxies.txt is empty! if you don't want to use any proxies, use the -n flag. see docs for more."
+      exit
+    end
+  end
+  def get_proxy
+    @good_proxies = @proxies.select { |proxy| proxy.good? }
+    if @good_proxies.length > 0
+      @good_proxies.sort!{|a, b| a.last_used <=> b.last_used}
+      best_proxy = @good_proxies.first
+      duration = Time.now - best_proxy.last_used
+      sleep(@cooldown_time - duration) if duration < @cooldown_time
+      best_proxy
+    else
+      puts "All proxies are dead. Wait a few hours before resuming."
+      exit
+    end
+  end
+  def length
+    @proxies.length
+  end
+end