RubyGems - indeed_scraper2022 - Versions diffs - 0.3.0 → 0.5.0 - Mend

indeed_scraper2022 0.3.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 972f811430fae59121e39c9c4752b64fc43b37165a52dcec8c3eac42cf1e4555
-  data.tar.gz: 85e987eb264b098b4c892e2e05d2ab082e3b8968fff2bdd519552e889f014f9d
+  metadata.gz: 833f3e77c7771f39e3eccbd5f277a8ca73fbd34d55d5efc7c2509b7a4dbf61bd
+  data.tar.gz: 707cf360d0ca30102e59bc0e5ead4111199db76a090d734729c327eaefbd6cdd
 SHA512:
-  metadata.gz: e7d3c2a13e315383248c557806dd0184d8831f46a2e314816395f62fcb886ba2e38a3e1f2deb180ceb33b614cf0b7be8a13379028fc56982e388948575bdb02c
-  data.tar.gz: 6ca4792d260c43b22fcee5ead8928525df17f1db112c49ac4cc7d7a5c9b29a8f483b94c18cd39bace1d4a4ee553ff83254a7fb445d76abf06c1705d19bac455c
+  metadata.gz: d3c8d5eacb62503e4b29634836b74a8b6c9636d9127fc345d79b9f177d75b41f2558c351ec7028d9a74887f964f440022b1d25e416f14752c16ded16055dcd2c
+  data.tar.gz: aea4011eea3c4f37f3537626e3ca2179bee215854a975d2b7618d09737395961fe88648460a8f58ce966acadc9a1fe8c21263319b07ff12147d4264b9374ae39

checksums.yaml.gz.sig CHANGED Viewed

Binary file

data/lib/indeed_scraper2022.rb CHANGED Viewed

@@ -5,12 +5,20 @@
 require 'ferrumwizard'
 require 'nokorexi'
 require 'yaml'
+require 'reveal_url22'
 # Given the nature of changes to jobsearch websites,
 # don't rely upon this gem working in the near future.
+# this gem consists of 3 main classes:
+#
+# *  IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
+# *  IS22Plus - Archives the scraped vacancies to local file
+# *  IS22Archive - Allows viewing of archived vacancies offline
+#
 class IndeedScraper2022Err < Exception
 end
@@ -37,9 +45,10 @@ class IndeedScraper2022
   end
   def search(q: @q, location: @location, start: nil)
+    puts 'inside search' if @debug
     url = @url_base
     url += 'start=' + start if start
+    puts 'url: ' + url.inspect if @debug
     @browser.goto(url)
     #@browser.network.wait_for_idle
@@ -74,34 +83,52 @@ class IndeedScraper2022
     sleep 2
     doc2 = Nokogiri::XML(@browser.body)
+    File.write '/tmp/body.txt', doc2.to_s if @debug
-    a2 = doc2.xpath  "//a[div/div/div/div/table/tbody/tr/td/div]"
+    a2 = doc2.root.xpath  "//li/div[div/div/div/div/table/tbody/tr/td/div/h2/a]"
     puts 'a2: ' + a2.length.inspect if @debug
     @a2 = a2.map {|x| Rexle.new x.to_s }
     @results = @a2.map do |doc|
-      div = doc.element("a[@class='desktop']/div[@class='slider"  \
+      div = doc.element("div[@class='cardOutline']/div[@class='slider"  \
           "_container']/div[@class='slider_list']/div[@class='sl"  \
           "ider_item']/div[@class='job_seen_beacon']")
       td = div.element("table[@class='jobCard_mainContent']/tbo"  \
           "dy/tr/td[@class='resultContent']")
       # job title (e.g. Software Developer)
-      jobtitle = td.element("div[@class='tapItem-gutter']/h2[@"  \
-          "class='jobTitle-color-purple']/span")&.text
+      job = td.element("div[@class='tapItem-gutter']/h2[@"  \
+          "class='jobTitle-color-purple']/a")
+      href = job.attributes[:href]
+      jobtitle = job.element("span")&.text
       puts 'jobtitle: ' + jobtitle.inspect if @debug
-      salary = td.element("div[@class='metadataContainer']/"  \
-          "div[@class='salary-snippet-container']/div[@class='sa"  \
-          "lary-snippet']/span")&.text
+      sal = td.element("div[@class='metadataContainer']/"  \
+          "div[@class='salary-snippet-container']")
+      salary = if sal then
+        sal_e = sal.element("div[@class='attribute_snippet']")
+        if sal_e then
+          sal_e.texts[0]
+        else
+          sal_e2 = sal.element("div[@class='salary-snippet']/span")
+          sal_e2 ? sal_e2.text : ''
+        end
+      else
+        ''
+      end
       puts 'salary: ' + salary.inspect if @debug
       div1 = td.element("div[@class='companyInfo']")
       # company name (e.g. Coda Octopus Products Ltd)
-      company_name = div1.element("span[@class='companyName']")&.text
+      coname = div1.element("span[@class='companyName']")
+      puts 'coname: ' + coname.text.inspect if @debug
+      company_name = coname.text.to_s.strip.length > 1 ? coname.text : coname.element('a').text
       # company location (e.g. Edinburgh)
       location = div1.element("div[@class='companyLocation']")&.text
@@ -111,7 +138,12 @@ class IndeedScraper2022
           "v[@class='result-footer']")
       # job (e.g. Our products are primarily written in C#, using...)
-      jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
+      advert_items = div3.xpath("div[@class='job-snippet']/ul/li/text()")
+      jobsnippet = if advert_items.any? then
+        advert_items.join("\n")
+      else
+        div3.element("div[@class='job-snippet']").text
+      end
       # visually (e.g. Posted 14 days ago)
       dateposted =  div3.element("span[@class='date']")&.texts
@@ -119,7 +151,7 @@ class IndeedScraper2022
       {
         link:  @url_base.sub(/\/[^\/]+$/,'') \
-          + doc.root.attributes[:href].gsub(/&amp;/,'&'),
+          + href.gsub(/&amp;/,'&'),
         title: jobtitle,
         salary: salary,
         company: company_name,
@@ -146,52 +178,65 @@ class IndeedScraper2022
   def fetchjob(url)
     doc = Nokorexi.new(url).to_doc
+    puts 'before e0' if @debug
     e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
     #div = e0.element("//div[@class='jobsearch-JobComponent']")
+    puts 'before div1' if @debug
     div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
+    puts 'before div2' if @debug
     div2 = div1.element("div")
     # jobsearch (e.g. Full Stack Website Developer (Wordpress))
+    puts 'before jobtitle' if @debug
     jobtitle = div2.element("div[@class='jobsearch-JobInfoHead"  \
         "er-title-container']/h1[@class='jobsearch-JobInfoHead"  \
         "er-title']")&.text
+    puts 'before div3' if @debug
     div3 = div2.element("div[@class='jobsearch-CompanyInfoCon"  \
         "tainer']/div[@class='jobsearch-CompanyInfoWithoutHead"  \
         "erImage']/div/div[@class='jobsearch-DesktopStickyCont"  \
         "ainer-subtitle']")
     # icl (e.g. Lyles Sutherland)
+    puts 'before cname' if @debug
     cname = div3.xpath("div[@class='jobsearch-DesktopSt"  \
         "ickyContainer-companyrating']/div/div[@class='icl-u-x"  \
         "s-mr--xs']")[1]
+    puts 'before clink' if @debug
     clink = div3.element('//a')
     company = cname.text ? cname.text : clink.text
     companylink = clink.attributes[:href] if clink
+    puts 'before salary' if @debug
     salary = div1.element("//span[@class='attribute_snippet']")&.text
+    puts 'before type' if @debug
     type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
     div5 = div3.xpath("div/div")
     location, worklocation = div5.map(&:text).compact
     # icl (e.g. Full-time, Permanent)
+    puts 'before jobtype' if @debug
     jobtype = div1.element("div/div/div[@class='jobsearch-J"  \
         "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
     jobtype = jobtype&.texts.join if jobtype
     # jobsearch (e.g. Urgently needed)
+    puts 'before jobnote1' if @debug
     jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag"  \
         "']/div[@class='urgently-hiring']/div[@class='jobsearc"  \
         "h-DesktopTag-text']")&.text
     # jobsearch (e.g. 10 days ago)
+    puts 'before days' if @debug
     days = e0.element("//div[@class='jobsearch-JobTab-con"  \
         "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
     d = Date.today - days.to_i
     datepost = d.strftime("%Y-%m-%d")
+    puts 'before jobdesc' if @debug
     jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl"  \
         "ass='jobsearch-jobDescriptionText']").xml
@@ -217,11 +262,16 @@ class IS22Plus < IndeedScraper2022
   def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
     super(q: q, location: location, headless: headless, cookies: cookies,
-          debug: debug)
+          debug: true)
   end
+  # note: The most efficient method to accumulate vacancy articles is to
+  #       execute archive() daily
+  #
   def archive(filepath='/tmp/indeed')
+    search() if @results.nil?
     return unless @results
     FileUtils.mkdir_p filepath
@@ -238,16 +288,23 @@ class IS22Plus < IndeedScraper2022
       puts 'saving ' + item[:title] if @debug
       puts 'link: ' + item[:link].inspect
-      links = RXFReader.reveal(item[:link])
-      puts 'links: ' + links.inspect
+      links = URL.reveal(item[:link])
+      puts 'links: ' + links.inspect if @debug
       url = links.last
-      id = url[/(?<=\?jk=)[^&]+/]
+      puts 'url: ' + url.inspect if @debug
+      id = url[/(?<=jk=)[^&]+/]
       if index[id.to_sym] then
+        # the vacancy record has previously been saved
+        #
         next
       else
+        # write the full page vacancy article to file
+        #
         File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
         h = {
@@ -257,14 +314,19 @@ class IS22Plus < IndeedScraper2022
           company: item[:company].to_s.strip,
           location: item[:location].to_s,
           jobsnippet: item[:jobsnippet],
-          date: item[:date]
+          date: item[:date],
+          added: Time.now.strftime("%Y-%m-%d")
         }
+        # add the vacancy snippet to the index file
+        #
         index[id.to_sym] = h
       end
     end
+    # save the vacancy index file
+    #
     File.write idxfile, index.to_yaml
   end
@@ -279,3 +341,37 @@ class IS22Plus < IndeedScraper2022
 end
+class IS22Archive
+  attr_reader :index
+  def initialize(filepath='/tmp/indeed', debug: false)
+    FileUtils.mkdir_p filepath
+    @idxfile = File.join(filepath, 'index.yml')
+    @index = if File.exists? @idxfile then
+      YAML.load(File.read(@idxfile))
+    else
+      {}
+    end
+  end
+  def list()
+    @index.to_a.reverse.map.with_index do |x,i|
+      id, h = x
+      puts 'h: ' + h.inspect if @debug
+      co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
+      "%2d. %s: %s%s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title], co]
+    end.join("\n")
+  end
+end

data.tar.gz.sig CHANGED Viewed

Binary file

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: indeed_scraper2022
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.5.0
 platform: ruby
 authors:
 - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
   YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
   SW/2zInu2bkj/meWm5eBoWHT
   -----END CERTIFICATE-----
-date: 2022-04-01 00:00:00.000000000 Z
+date: 2022-05-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokorexi
@@ -63,20 +63,40 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.2'
+        version: '0.3'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.2.2
+        version: 0.3.1
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.2'
+        version: '0.3'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.2.2
+        version: 0.3.1
+- !ruby/object:Gem::Dependency
+  name: reveal_url22
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.1'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.1.0
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.1'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.1.0
 description:
 email: digital.robertson@gmail.com
 executables: []

metadata.gz.sig CHANGED Viewed

Binary file