RubyGems - indeed_scraper2022 - Versions diffs - 0.3.0 → 0.5.0 - Mend

indeed_scraper2022 0.3.0 → 0.5.0

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 972f811430fae59121e39c9c4752b64fc43b37165a52dcec8c3eac42cf1e4555
-  data.tar.gz: 85e987eb264b098b4c892e2e05d2ab082e3b8968fff2bdd519552e889f014f9d
+  metadata.gz: 833f3e77c7771f39e3eccbd5f277a8ca73fbd34d55d5efc7c2509b7a4dbf61bd
+  data.tar.gz: 707cf360d0ca30102e59bc0e5ead4111199db76a090d734729c327eaefbd6cdd
 SHA512:
-  metadata.gz: e7d3c2a13e315383248c557806dd0184d8831f46a2e314816395f62fcb886ba2e38a3e1f2deb180ceb33b614cf0b7be8a13379028fc56982e388948575bdb02c
-  data.tar.gz: 6ca4792d260c43b22fcee5ead8928525df17f1db112c49ac4cc7d7a5c9b29a8f483b94c18cd39bace1d4a4ee553ff83254a7fb445d76abf06c1705d19bac455c
+  metadata.gz: d3c8d5eacb62503e4b29634836b74a8b6c9636d9127fc345d79b9f177d75b41f2558c351ec7028d9a74887f964f440022b1d25e416f14752c16ded16055dcd2c
+  data.tar.gz: aea4011eea3c4f37f3537626e3ca2179bee215854a975d2b7618d09737395961fe88648460a8f58ce966acadc9a1fe8c21263319b07ff12147d4264b9374ae39

checksums.yaml.gz.sig CHANGED Viewed

Binary file

data/lib/indeed_scraper2022.rb CHANGED Viewed

@@ -5,12 +5,20 @@
 require 'ferrumwizard'
 require 'nokorexi'
 require 'yaml'
+require 'reveal_url22'
 # Given the nature of changes to jobsearch websites,
 # don't rely upon this gem working in the near future.
+# this gem consists of 3 main classes:
+#
+# *  IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
+# *  IS22Plus - Archives the scraped vacancies to local file
+# *  IS22Archive - Allows viewing of archived vacancies offline
+#
 class IndeedScraper2022Err < Exception
 end
@@ -37,9 +45,10 @@ class IndeedScraper2022
   end
   def search(q: @q, location: @location, start: nil)
+    puts 'inside search' if @debug
     url = @url_base
     url += 'start=' + start if start
+    puts 'url: ' + url.inspect if @debug
     @browser.goto(url)
     #@browser.network.wait_for_idle
@@ -74,34 +83,52 @@ class IndeedScraper2022
     sleep 2
     doc2 = Nokogiri::XML(@browser.body)
+    File.write '/tmp/body.txt', doc2.to_s if @debug
-    a2 = doc2.xpath  "//a[div/div/div/div/table/tbody/tr/td/div]"
+    a2 = doc2.root.xpath  "//li/div[div/div/div/div/table/tbody/tr/td/div/h2/a]"
     puts 'a2: ' + a2.length.inspect if @debug
     @a2 = a2.map {|x| Rexle.new x.to_s }
     @results = @a2.map do |doc|
-      div = doc.element("a[@class='desktop']/div[@class='slider"  \
+      div = doc.element("div[@class='cardOutline']/div[@class='slider"  \
           "_container']/div[@class='slider_list']/div[@class='sl"  \
           "ider_item']/div[@class='job_seen_beacon']")
       td = div.element("table[@class='jobCard_mainContent']/tbo"  \
           "dy/tr/td[@class='resultContent']")
       # job title (e.g. Software Developer)
-      jobtitle = td.element("div[@class='tapItem-gutter']/h2[@"  \
-          "class='jobTitle-color-purple']/span")&.text
+      job = td.element("div[@class='tapItem-gutter']/h2[@"  \
+          "class='jobTitle-color-purple']/a")
+      href = job.attributes[:href]
+      jobtitle = job.element("span")&.text
       puts 'jobtitle: ' + jobtitle.inspect if @debug
-      salary = td.element("div[@class='metadataContainer']/"  \
-          "div[@class='salary-snippet-container']/div[@class='sa"  \
-          "lary-snippet']/span")&.text
+      sal = td.element("div[@class='metadataContainer']/"  \
+          "div[@class='salary-snippet-container']")
+      salary = if sal then
+        sal_e = sal.element("div[@class='attribute_snippet']")
+        if sal_e then
+          sal_e.texts[0]
+        else
+          sal_e2 = sal.element("div[@class='salary-snippet']/span")
+          sal_e2 ? sal_e2.text : ''
+        end
+      else
+        ''
+      end
       puts 'salary: ' + salary.inspect if @debug
       div1 = td.element("div[@class='companyInfo']")
       # company name (e.g. Coda Octopus Products Ltd)
-      company_name = div1.element("span[@class='companyName']")&.text
+      coname = div1.element("span[@class='companyName']")
+      puts 'coname: ' + coname.text.inspect if @debug
+      company_name = coname.text.to_s.strip.length > 1 ? coname.text : coname.element('a').text
       # company location (e.g. Edinburgh)
       location = div1.element("div[@class='companyLocation']")&.text
@@ -111,7 +138,12 @@ class IndeedScraper2022
           "v[@class='result-footer']")
       # job (e.g. Our products are primarily written in C#, using...)
-      jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
+      advert_items = div3.xpath("div[@class='job-snippet']/ul/li/text()")
+      jobsnippet = if advert_items.any? then
+        advert_items.join("\n")
+      else
+        div3.element("div[@class='job-snippet']").text
+      end
       # visually (e.g. Posted 14 days ago)
       dateposted =  div3.element("span[@class='date']")&.texts
@@ -119,7 +151,7 @@ class IndeedScraper2022
       {
         link:  @url_base.sub(/\/[^\/]+$/,'') \
-          + doc.root.attributes[:href].gsub(/&amp;/,'&'),
+          + href.gsub(/&amp;/,'&'),
         title: jobtitle,
         salary: salary,
         company: company_name,
@@ -146,52 +178,65 @@ class IndeedScraper2022
   def fetchjob(url)
     doc = Nokorexi.new(url).to_doc
+    puts 'before e0' if @debug
     e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
     #div = e0.element("//div[@class='jobsearch-JobComponent']")
+    puts 'before div1' if @debug
     div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
+    puts 'before div2' if @debug
     div2 = div1.element("div")
     # jobsearch (e.g. Full Stack Website Developer (Wordpress))
+    puts 'before jobtitle' if @debug
     jobtitle = div2.element("div[@class='jobsearch-JobInfoHead"  \
         "er-title-container']/h1[@class='jobsearch-JobInfoHead"  \
         "er-title']")&.text
+    puts 'before div3' if @debug
     div3 = div2.element("div[@class='jobsearch-CompanyInfoCon"  \
         "tainer']/div[@class='jobsearch-CompanyInfoWithoutHead"  \
         "erImage']/div/div[@class='jobsearch-DesktopStickyCont"  \
         "ainer-subtitle']")
     # icl (e.g. Lyles Sutherland)
+    puts 'before cname' if @debug
     cname = div3.xpath("div[@class='jobsearch-DesktopSt"  \
         "ickyContainer-companyrating']/div/div[@class='icl-u-x"  \
         "s-mr--xs']")[1]
+    puts 'before clink' if @debug
     clink = div3.element('//a')
     company = cname.text ? cname.text : clink.text
     companylink = clink.attributes[:href] if clink
+    puts 'before salary' if @debug
     salary = div1.element("//span[@class='attribute_snippet']")&.text
+    puts 'before type' if @debug
     type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
     div5 = div3.xpath("div/div")
     location, worklocation = div5.map(&:text).compact
     # icl (e.g. Full-time, Permanent)
+    puts 'before jobtype' if @debug
     jobtype = div1.element("div/div/div[@class='jobsearch-J"  \
         "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
     jobtype = jobtype&.texts.join if jobtype
     # jobsearch (e.g. Urgently needed)
+    puts 'before jobnote1' if @debug
     jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag"  \
         "']/div[@class='urgently-hiring']/div[@class='jobsearc"  \
         "h-DesktopTag-text']")&.text
     # jobsearch (e.g. 10 days ago)
+    puts 'before days' if @debug
     days = e0.element("//div[@class='jobsearch-JobTab-con"  \
         "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
     d = Date.today - days.to_i
     datepost = d.strftime("%Y-%m-%d")
+    puts 'before jobdesc' if @debug
     jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl"  \
         "ass='jobsearch-jobDescriptionText']").xml
@@ -217,11 +262,16 @@ class IS22Plus < IndeedScraper2022
   def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
     super(q: q, location: location, headless: headless, cookies: cookies,
-          debug: debug)
+          debug: true)
   end
+  # note: The most efficient method to accumulate vacancy articles is to
+  #       execute archive() daily
+  #
   def archive(filepath='/tmp/indeed')
+    search() if @results.nil?
     return unless @results
     FileUtils.mkdir_p filepath
@@ -238,16 +288,23 @@ class IS22Plus < IndeedScraper2022
       puts 'saving ' + item[:title] if @debug
       puts 'link: ' + item[:link].inspect
-      links = RXFReader.reveal(item[:link])
-      puts 'links: ' + links.inspect
+      links = URL.reveal(item[:link])
+      puts 'links: ' + links.inspect if @debug
       url = links.last
-      id = url[/(?<=\?jk=)[^&]+/]
+      puts 'url: ' + url.inspect if @debug
+      id = url[/(?<=jk=)[^&]+/]
       if index[id.to_sym] then
+        # the vacancy record has previously been saved
+        #
         next
       else
+        # write the full page vacancy article to file
+        #
         File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
         h = {
@@ -257,14 +314,19 @@ class IS22Plus < IndeedScraper2022
           company: item[:company].to_s.strip,
           location: item[:location].to_s,
           jobsnippet: item[:jobsnippet],
-          date: item[:date]
+          date: item[:date],
+          added: Time.now.strftime("%Y-%m-%d")
         }
+        # add the vacancy snippet to the index file
+        #
         index[id.to_sym] = h
       end
     end
+    # save the vacancy index file
+    #
     File.write idxfile, index.to_yaml
   end
@@ -279,3 +341,37 @@ class IS22Plus < IndeedScraper2022
 end
+class IS22Archive
+  attr_reader :index
+  def initialize(filepath='/tmp/indeed', debug: false)
+    FileUtils.mkdir_p filepath
+    @idxfile = File.join(filepath, 'index.yml')
+    @index = if File.exists? @idxfile then
+      YAML.load(File.read(@idxfile))
+    else
+      {}
+    end
+  end
+  def list()
+    @index.to_a.reverse.map.with_index do |x,i|
+      id, h = x
+      puts 'h: ' + h.inspect if @debug
+      co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
+      "%2d. %s: %s%s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title], co]
+    end.join("\n")
+  end
+end

data.tar.gz.sig CHANGED Viewed

Binary file

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: indeed_scraper2022
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.5.0
 platform: ruby
 authors:
 - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
   YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
   SW/2zInu2bkj/meWm5eBoWHT
   -----END CERTIFICATE-----
-date: 2022-04-01 00:00:00.000000000 Z
+date: 2022-05-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokorexi
@@ -63,20 +63,40 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.2'
+        version: '0.3'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.2.2
+        version: 0.3.1
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.2'
+        version: '0.3'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.2.2
+        version: 0.3.1
+- !ruby/object:Gem::Dependency
+  name: reveal_url22
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.1'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.1.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.1'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.1.0
 description:
 email: digital.robertson@gmail.com
 executables: []

metadata.gz.sig CHANGED Viewed

Binary file