RubyGems - indeed_scraper2022 - Versions diffs - 0.2.0 → 0.4.0 - Mend

indeed_scraper2022 0.2.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 07f5323381b5751c470454f6f4c3ba6dced6f1424e054b85360a49d814d662ba
-  data.tar.gz: 3d25353b9f8a0543944cac82ef6dc91adf7d3e83444f3c6ef469f15cbba8a3d8
+  metadata.gz: 7f98a83b7ed582d1b2973882701833688aec2d6d2bd132241a26c01a32915f93
+  data.tar.gz: dc5c34a5af19cdffbd244e15416914e91c8a06f365f0fff28bcd537a30ec468e
 SHA512:
-  metadata.gz: 5e13ae04b46bfa3eb15aab8d0aff388d8caec591c413493db591c37da099d2bcd5ba340a72137d4aa7d374652b68bc1d037b86fe4cc2ed2ae5b0a56c5202f00b
-  data.tar.gz: ca14ae99251aabbcaee08a3bb6f240742ed1fab0f438496dc742ef39a10abb13e310b2d6a93bc472f5e1b3e45cfd8956d6a62f803b1d3a152054cf4e1ae35402
+  metadata.gz: 4ce40e021339f6b1c24faed495ebbd4b257200f62d1466ffa54cd05e654ccef29b23dbfcf4af64b9426f4d2dbde6bb778f6b920a7656af8289e3f81a269ba54a
+  data.tar.gz: 634325fed61c7888b08fd72bfc47f4d64f98f5514110169152389e49940f15a080c02f62811aae2b5f34c6a604c17d2512d2219dba0f4473dc1034900bdb7ec6

checksums.yaml.gz.sig CHANGED Viewed

Binary file

data/lib/indeed_scraper2022.rb CHANGED Viewed

@@ -4,23 +4,36 @@
 require 'ferrumwizard'
 require 'nokorexi'
+require 'yaml'
 # Given the nature of changes to jobsearch websites,
 # don't rely upon this gem working in the near future.
+# this gem consists of 3 main classes:
+#
+# *  IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
+# *  IS22Plus - Archives the scraped vacancies to local file
+# *  IS22Archive - Allows viewing of archived vacancies offline
+#
 class IndeedScraper2022Err < Exception
 end
 class IndeedScraper2022
+  attr_reader :browser
   def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '',
                  headless: true, cookies: nil, debug: false)
     @debug = debug
     @url_base, @q, @location = url, q, location
     @headless, @cookies = headless, cookies
-    @results = search(q: @q, location: @location)
+    fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
+    @browser = fw.browser
   end
@@ -32,35 +45,49 @@ class IndeedScraper2022
   def search(q: @q, location: @location, start: nil)
-    fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
     url = @url_base
     url += 'start=' + start if start
-    browser = fw.browser
-    browser.goto(url)
+    @browser.goto(url)
+    #@browser.network.wait_for_idle
+    puts 'sleeping for 4 seconds' if @debug
+    sleep 4
     if q.length > 1 then
-      input = browser.at_xpath("//input[@name='q']")
-      input.focus.type(q)
+      input = @browser.at_xpath("//input[@name='q']")
+      # select any existing text and overwrite it
+      input.focus.type(:home); sleep 0.2
+      input.focus.type(:shift, :end); sleep 0.2
+      input.focus.type(q); sleep 0.2
     end
     if location.length > 1 then
-      input2 = browser.at_xpath("//input[@name='l']")
-      input2.focus.type(location)
+      input2 = @browser.at_xpath("//input[@name='l']")
+      # select any existing text and overwrite it
+      input2.focus.type(:home); sleep 0.2
+      input2.focus.type(:shift, :end); sleep 0.2
+      input2.focus.type(location); sleep 0.2
     end
-    button = browser.at_xpath("//button[@type='submit']")
+    button = @browser.at_xpath("//button[@type='submit']")
     button.click
+    #@browser.network.wait_for_idle
+    puts 'sleeping for 2 seconds' if @debug
+    sleep 2
-    doc2 = Nokogiri::XML(browser.body)
+    doc2 = Nokogiri::XML(@browser.body)
     a2 = doc2.xpath  "//a[div/div/div/div/table/tbody/tr/td/div]"
     puts 'a2: ' + a2.length.inspect if @debug
     @a2 = a2.map {|x| Rexle.new x.to_s }
-    @a2.map do |doc|
+    @results = @a2.map do |doc|
       div = doc.element("a[@class='desktop']/div[@class='slider"  \
           "_container']/div[@class='slider_list']/div[@class='sl"  \
@@ -126,52 +153,65 @@ class IndeedScraper2022
   def fetchjob(url)
     doc = Nokorexi.new(url).to_doc
+    puts 'before e0' if @debug
     e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
     #div = e0.element("//div[@class='jobsearch-JobComponent']")
+    puts 'before div1' if @debug
     div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
+    puts 'before div2' if @debug
     div2 = div1.element("div")
     # jobsearch (e.g. Full Stack Website Developer (Wordpress))
+    puts 'before jobtitle' if @debug
     jobtitle = div2.element("div[@class='jobsearch-JobInfoHead"  \
         "er-title-container']/h1[@class='jobsearch-JobInfoHead"  \
         "er-title']")&.text
+    puts 'before div3' if @debug
     div3 = div2.element("div[@class='jobsearch-CompanyInfoCon"  \
         "tainer']/div[@class='jobsearch-CompanyInfoWithoutHead"  \
         "erImage']/div/div[@class='jobsearch-DesktopStickyCont"  \
         "ainer-subtitle']")
     # icl (e.g. Lyles Sutherland)
+    puts 'before cname' if @debug
     cname = div3.xpath("div[@class='jobsearch-DesktopSt"  \
         "ickyContainer-companyrating']/div/div[@class='icl-u-x"  \
         "s-mr--xs']")[1]
+    puts 'before clink' if @debug
     clink = div3.element('//a')
     company = cname.text ? cname.text : clink.text
     companylink = clink.attributes[:href] if clink
+    puts 'before salary' if @debug
     salary = div1.element("//span[@class='attribute_snippet']")&.text
+    puts 'before type' if @debug
     type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
     div5 = div3.xpath("div/div")
     location, worklocation = div5.map(&:text).compact
     # icl (e.g. Full-time, Permanent)
+    puts 'before jobtype' if @debug
     jobtype = div1.element("div/div/div[@class='jobsearch-J"  \
         "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
     jobtype = jobtype&.texts.join if jobtype
     # jobsearch (e.g. Urgently needed)
+    puts 'before jobnote1' if @debug
     jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag"  \
         "']/div[@class='urgently-hiring']/div[@class='jobsearc"  \
         "h-DesktopTag-text']")&.text
     # jobsearch (e.g. 10 days ago)
+    puts 'before days' if @debug
     days = e0.element("//div[@class='jobsearch-JobTab-con"  \
         "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
     d = Date.today - days.to_i
     datepost = d.strftime("%Y-%m-%d")
+    puts 'before jobdesc' if @debug
     jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl"  \
         "ass='jobsearch-jobDescriptionText']").xml
@@ -200,12 +240,70 @@ class IS22Plus < IndeedScraper2022
           debug: debug)
   end
-  def archive()
+  # note: The most efficient method to accumulate vacancy articles is to
+  #       execute archive() daily
+  #
+  def archive(filepath='/tmp/indeed')
+    search() if @results.nil?
+    return unless @results
+    FileUtils.mkdir_p filepath
+    idxfile = File.join(filepath, 'index.yml')
-    1.upto(15).each do |n|
-      page(n)
+    index = if File.exists? idxfile then
+      YAML.load(File.read(idxfile))
+    else
+      {}
     end
+    @results.each.with_index do |item, i|
+      puts 'saving ' + item[:title] if @debug
+      puts 'link: ' + item[:link].inspect
+      links = RXFReader.reveal(item[:link])
+      puts 'links: ' + links.inspect if @debug
+      url = links.last
+      puts 'url: ' + url.inspect if @debug
+      id = url[/(?<=jk=)[^&]+/]
+      if index[id.to_sym] then
+        # the vacancy record has previously been saved
+        #
+        next
+      else
+        # write the full page vacancy article to file
+        #
+        File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
+        h = {
+          link: url[/^[^&]+/],
+          title: item[:title].to_s,
+          salary: item[:salary].to_s,
+          company: item[:company].to_s.strip,
+          location: item[:location].to_s,
+          jobsnippet: item[:jobsnippet],
+          date: item[:date],
+          added: Time.now.strftime("%Y-%m-%d")
+        }
+        # add the vacancy snippet to the index file
+        #
+        index[id.to_sym] = h
+      end
+    end
+    # save the vacancy index file
+    #
+    File.write idxfile, index.to_yaml
   end
   def list()
@@ -218,3 +316,38 @@ class IS22Plus < IndeedScraper2022
 end
+class IS22Archive
+  attr_reader :index
+  def initialize(filepath='/tmp/indeed', debug: false)
+    @debug = debug
+    FileUtils.mkdir_p filepath
+    @idxfile = File.join(filepath, 'index.yml')
+    @index = if File.exists? @idxfile then
+      YAML.load(File.read(@idxfile))
+    else
+      {}
+    end
+  end
+  def list()
+    @index.map.with_index do |x,i|
+      id, h = x
+      puts 'h: ' + h.inspect if @debug
+      "%2d. %s: %s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title]]
+    end.join("\n")
+  end
+end

data.tar.gz.sig CHANGED Viewed

Binary file

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: indeed_scraper2022
 version: !ruby/object:Gem::Version
-  version: 0.2.0
+  version: 0.4.0
 platform: ruby
 authors:
 - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
   YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
   SW/2zInu2bkj/meWm5eBoWHT
   -----END CERTIFICATE-----
-date: 2022-03-30 00:00:00.000000000 Z
+date: 2022-04-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokorexi
@@ -63,20 +63,20 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.2'
+        version: '0.3'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.2.2
+        version: 0.3.1
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.2'
+        version: '0.3'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.2.2
+        version: 0.3.1
 description:
 email: digital.robertson@gmail.com
 executables: []

metadata.gz.sig CHANGED Viewed

Binary file