RubyGems - indeed_scraper2022 - Versions diffs - 0.3.0 → 0.4.0 - Mend

indeed_scraper2022 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 972f811430fae59121e39c9c4752b64fc43b37165a52dcec8c3eac42cf1e4555
-  data.tar.gz: 85e987eb264b098b4c892e2e05d2ab082e3b8968fff2bdd519552e889f014f9d
+  metadata.gz: 7f98a83b7ed582d1b2973882701833688aec2d6d2bd132241a26c01a32915f93
+  data.tar.gz: dc5c34a5af19cdffbd244e15416914e91c8a06f365f0fff28bcd537a30ec468e
 SHA512:
-  metadata.gz: e7d3c2a13e315383248c557806dd0184d8831f46a2e314816395f62fcb886ba2e38a3e1f2deb180ceb33b614cf0b7be8a13379028fc56982e388948575bdb02c
-  data.tar.gz: 6ca4792d260c43b22fcee5ead8928525df17f1db112c49ac4cc7d7a5c9b29a8f483b94c18cd39bace1d4a4ee553ff83254a7fb445d76abf06c1705d19bac455c
+  metadata.gz: 4ce40e021339f6b1c24faed495ebbd4b257200f62d1466ffa54cd05e654ccef29b23dbfcf4af64b9426f4d2dbde6bb778f6b920a7656af8289e3f81a269ba54a
+  data.tar.gz: 634325fed61c7888b08fd72bfc47f4d64f98f5514110169152389e49940f15a080c02f62811aae2b5f34c6a604c17d2512d2219dba0f4473dc1034900bdb7ec6

checksums.yaml.gz.sig CHANGED Viewed

Binary file

data/lib/indeed_scraper2022.rb CHANGED Viewed

@@ -11,6 +11,13 @@ require 'yaml'
+# this gem consists of 3 main classes:
+#
+# *  IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
+# *  IS22Plus - Archives the scraped vacancies to a local file
+# *  IS22Archive - Allows viewing of archived vacancies offline
+#
 class IndeedScraper2022Err < Exception
 end
@@ -146,52 +153,65 @@ class IndeedScraper2022
   def fetchjob(url)
     doc = Nokorexi.new(url).to_doc
+    puts 'before e0' if @debug
     e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
     #div = e0.element("//div[@class='jobsearch-JobComponent']")
+    puts 'before div1' if @debug
     div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
+    puts 'before div2' if @debug
     div2 = div1.element("div")
     # jobsearch (e.g. Full Stack Website Developer (Wordpress))
+    puts 'before jobtitle' if @debug
     jobtitle = div2.element("div[@class='jobsearch-JobInfoHead"  \
         "er-title-container']/h1[@class='jobsearch-JobInfoHead"  \
         "er-title']")&.text
+    puts 'before div3' if @debug
     div3 = div2.element("div[@class='jobsearch-CompanyInfoCon"  \
         "tainer']/div[@class='jobsearch-CompanyInfoWithoutHead"  \
         "erImage']/div/div[@class='jobsearch-DesktopStickyCont"  \
         "ainer-subtitle']")
     # icl (e.g. Lyles Sutherland)
+    puts 'before cname' if @debug
     cname = div3.xpath("div[@class='jobsearch-DesktopSt"  \
         "ickyContainer-companyrating']/div/div[@class='icl-u-x"  \
         "s-mr--xs']")[1]
+    puts 'before clink' if @debug
     clink = div3.element('//a')
     company = cname.text ? cname.text : clink.text
     companylink = clink.attributes[:href] if clink
+    puts 'before salary' if @debug
     salary = div1.element("//span[@class='attribute_snippet']")&.text
+    puts 'before type' if @debug
     type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
     div5 = div3.xpath("div/div")
     location, worklocation = div5.map(&:text).compact
     # icl (e.g. Full-time, Permanent)
+    puts 'before jobtype' if @debug
     jobtype = div1.element("div/div/div[@class='jobsearch-J"  \
         "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
     jobtype = jobtype&.texts.join if jobtype
     # jobsearch (e.g. Urgently needed)
+    puts 'before jobnote1' if @debug
     jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag"  \
         "']/div[@class='urgently-hiring']/div[@class='jobsearc"  \
         "h-DesktopTag-text']")&.text
     # jobsearch (e.g. 10 days ago)
+    puts 'before days' if @debug
     days = e0.element("//div[@class='jobsearch-JobTab-con"  \
         "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
     d = Date.today - days.to_i
     datepost = d.strftime("%Y-%m-%d")
+    puts 'before jobdesc' if @debug
     jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl"  \
         "ass='jobsearch-jobDescriptionText']").xml
@@ -220,8 +240,13 @@ class IS22Plus < IndeedScraper2022
           debug: debug)
   end
+  # note: The most efficient method to accumulate vacancy articles is to
+  #       execute archive() daily
+  #
   def archive(filepath='/tmp/indeed')
+    search() if @results.nil?
     return unless @results
     FileUtils.mkdir_p filepath
@@ -239,15 +264,22 @@ class IS22Plus < IndeedScraper2022
       puts 'saving ' + item[:title] if @debug
       puts 'link: ' + item[:link].inspect
       links = RXFReader.reveal(item[:link])
-      puts 'links: ' + links.inspect
+      puts 'links: ' + links.inspect if @debug
       url = links.last
-      id = url[/(?<=\?jk=)[^&]+/]
+      puts 'url: ' + url.inspect if @debug
+      id = url[/(?<=jk=)[^&]+/]
       if index[id.to_sym] then
+        # the vacancy record has previously been saved
+        #
         next
       else
+        # write the full page vacancy article to file
+        #
         File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
         h = {
@@ -257,14 +289,19 @@ class IS22Plus < IndeedScraper2022
           company: item[:company].to_s.strip,
           location: item[:location].to_s,
           jobsnippet: item[:jobsnippet],
-          date: item[:date]
+          date: item[:date],
+          added: Time.now.strftime("%Y-%m-%d")
         }
+        # add the vacancy snippet to the index file
+        #
         index[id.to_sym] = h
       end
     end
+    # save the vacancy index file
+    #
     File.write idxfile, index.to_yaml
   end
@@ -279,3 +316,38 @@ class IS22Plus < IndeedScraper2022
 end
# Reads the vacancy index file (index.yml) from an archive directory and
# renders it as a numbered text listing, without any network access.
#
class IS22Archive

  # Text shown in place of the "added" date when a record has no :added value
  UNDATED = '?? ???'

  # the index Hash as loaded from index.yml ({} when there is nothing to load)
  attr_reader :index

  # filepath - directory expected to contain index.yml; created if missing
  # debug    - when true, list() prints each record to stdout as it is read
  #
  def initialize(filepath='/tmp/indeed', debug: false)

    @debug = debug
    FileUtils.mkdir_p filepath

    @idxfile = File.join(filepath, 'index.yml')

    # note: File.exists? was removed in Ruby 3.2; File.exist? is the
    #       supported spelling on every Ruby version
    @index = File.exist?(@idxfile) ? load_index(@idxfile) : {}

  end

  # Returns one line per index record, in index order, formatted as
  # " 1. 14 Apr: <title>" and joined with newlines. Returns an empty
  # string when the index is empty.
  #
  def list()

    @index.each_with_index.map do |(_id, h), i|

      puts 'h: ' + h.inspect if @debug
      "%2d. %s: %s" % [i + 1, added_label(h[:added]), h[:title]]

    end.join("\n")

  end

  private

  # Parses the index file. Symbol must be permitted explicitly because the
  # index uses Symbol keys, which Psych 4 (Ruby 3.1+) refuses to load by
  # default. Date and Time are permitted as a precaution in case a record
  # holds a native date value -- NOTE(review): confirm against the writer.
  # An empty file parses to nil/false, so fall back to an empty Hash.
  # The permitted_classes keyword needs Psych 3.1+ (Ruby 2.6+).
  #
  def load_index(file)

    YAML.safe_load(File.read(file),
                   permitted_classes: [Symbol, Date, Time],
                   aliases: true) || {}

  end

  # Formats the :added value as "DD Mon". A record may have no :added
  # value at all, in which case a placeholder is returned instead of
  # raising from Date.parse(nil).
  #
  def added_label(added)

    return UNDATED unless added

    Date.parse(added.to_s).strftime("%d %b")

  end

end

data.tar.gz.sig CHANGED Viewed

Binary file

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: indeed_scraper2022
 version: !ruby/object:Gem::Version
-  version: 0.3.0
+  version: 0.4.0
 platform: ruby
 authors:
 - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
   YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
   SW/2zInu2bkj/meWm5eBoWHT
   -----END CERTIFICATE-----
-date: 2022-04-01 00:00:00.000000000 Z
+date: 2022-04-14 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokorexi
@@ -63,20 +63,20 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.2'
+        version: '0.3'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.2.2
+        version: 0.3.1
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.2'
+        version: '0.3'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.2.2
+        version: 0.3.1
 description:
 email: digital.robertson@gmail.com
 executables: []

metadata.gz.sig CHANGED Viewed

Binary file