RubyGems - indeed_scraper2022 - Versions diffs - 0.2.1 → 0.4.1 - Mend

indeed_scraper2022 0.2.1 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 5e33dfd54667ecc9f8b7985aa07af403be8d95729ce68e8a40d8c985d57bd4e1
-  data.tar.gz: a2c041ec8103b6afac3a422e7b73bc82c89fd7f8d955240439a29ec0347c8a5f
+  metadata.gz: 88f80a06ef0ab435c144d3b4ec53f1c98f2da7c427224c31dbef44f62fdafee3
+  data.tar.gz: d0f549053bb225e7c8ebb2492715c6c470f689e191e9ae747e8f97317a61c02c
 SHA512:
-  metadata.gz: 8e640cb8262a057bb588b501ee1122a59e6e239e2a5988dd0566ffffb814a2fef763c36fdeae1ba5dc4e6f819ca145374058bd62373ce776df1e393057a49fc0
-  data.tar.gz: 0a6bfe0ef2b685d5711a95704cee3fa67d58eb7c9d0f149c872f9c23b0cc489382ab1327b101f7accf982f7ea1f1a6d56dc20ae528ef1fd4d6105c9ef93067da
+  metadata.gz: 90d23c6c35a87cdcf763dc15a072f35ede166aafe2f1f5eed9294de28e916cdaca3eb043a034fb50750c9444af9e5042dc50d01390816efcdde360d2d01c4e55
+  data.tar.gz: 30153c9c5aafdb5e89d56632e223edfe0196a71c4d9294cc3f963f2039238cd27b836ac9ac42b49cad27988ada868d0d002246699399f7d175216b97689d46ed

checksums.yaml.gz.sig CHANGED Viewed

Binary file

data/lib/indeed_scraper2022.rb CHANGED Viewed

@@ -4,11 +4,20 @@
 require 'ferrumwizard'
 require 'nokorexi'
+require 'yaml'
 # Given the nature of changes to jobsearch websites,
 # don't rely upon this gem working in the near future.
+# this gem consists of 3 main classes:
+#
+# *  IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
+# *  IS22Plus - Archives the scraped vacancies to local file
+# *  IS22Archive - Allows viewing of archived vacancies offline
+#
 class IndeedScraper2022Err < Exception
 end
@@ -99,7 +108,9 @@ class IndeedScraper2022
       div1 = td.element("div[@class='companyInfo']")
       # company name (e.g. Coda Octopus Products Ltd)
-      company_name = div1.element("span[@class='companyName']")&.text
+      coname = div1.element("span[@class='companyName']")
+      puts 'coname: ' + coname.text.inspect if @debug
+      company_name = coname.text.to_s.strip.length > 1 ? coname.text : coname.element('a').text
       # company location (e.g. Edinburgh)
       location = div1.element("div[@class='companyLocation']")&.text
@@ -144,52 +155,65 @@ class IndeedScraper2022
   def fetchjob(url)
     doc = Nokorexi.new(url).to_doc
+    puts 'before e0' if @debug
     e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
     #div = e0.element("//div[@class='jobsearch-JobComponent']")
+    puts 'before div1' if @debug
     div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
+    puts 'before div2' if @debug
     div2 = div1.element("div")
     # jobsearch (e.g. Full Stack Website Developer (Wordpress))
+    puts 'before jobtitle' if @debug
     jobtitle = div2.element("div[@class='jobsearch-JobInfoHead"  \
         "er-title-container']/h1[@class='jobsearch-JobInfoHead"  \
         "er-title']")&.text
+    puts 'before div3' if @debug
     div3 = div2.element("div[@class='jobsearch-CompanyInfoCon"  \
         "tainer']/div[@class='jobsearch-CompanyInfoWithoutHead"  \
         "erImage']/div/div[@class='jobsearch-DesktopStickyCont"  \
         "ainer-subtitle']")
     # icl (e.g. Lyles Sutherland)
+    puts 'before cname' if @debug
     cname = div3.xpath("div[@class='jobsearch-DesktopSt"  \
         "ickyContainer-companyrating']/div/div[@class='icl-u-x"  \
         "s-mr--xs']")[1]
+    puts 'before clink' if @debug
     clink = div3.element('//a')
     company = cname.text ? cname.text : clink.text
     companylink = clink.attributes[:href] if clink
+    puts 'before salary' if @debug
     salary = div1.element("//span[@class='attribute_snippet']")&.text
+    puts 'before type' if @debug
     type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
     div5 = div3.xpath("div/div")
     location, worklocation = div5.map(&:text).compact
     # icl (e.g. Full-time, Permanent)
+    puts 'before jobtype' if @debug
     jobtype = div1.element("div/div/div[@class='jobsearch-J"  \
         "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
     jobtype = jobtype&.texts.join if jobtype
     # jobsearch (e.g. Urgently needed)
+    puts 'before jobnote1' if @debug
     jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag"  \
         "']/div[@class='urgently-hiring']/div[@class='jobsearc"  \
         "h-DesktopTag-text']")&.text
     # jobsearch (e.g. 10 days ago)
+    puts 'before days' if @debug
     days = e0.element("//div[@class='jobsearch-JobTab-con"  \
         "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
     d = Date.today - days.to_i
     datepost = d.strftime("%Y-%m-%d")
+    puts 'before jobdesc' if @debug
     jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl"  \
         "ass='jobsearch-jobDescriptionText']").xml
@@ -218,14 +242,70 @@ class IS22Plus < IndeedScraper2022
           debug: debug)
   end
-  def archive()
+  # note: The most efficient method to accumulate vacancy articles is to
+  #       execute archive() daily
+  #
+  def archive(filepath='/tmp/indeed')
+    search() if @results.nil?
     return unless @results
-    1.upto(@results.length).each do |n|
-      page(n)
+    FileUtils.mkdir_p filepath
+    idxfile = File.join(filepath, 'index.yml')
+    index = if File.exists? idxfile then
+      YAML.load(File.read(idxfile))
+    else
+      {}
+    end
+    @results.each.with_index do |item, i|
+      puts 'saving ' + item[:title] if @debug
+      puts 'link: ' + item[:link].inspect
+      links = RXFReader.reveal(item[:link])
+      puts 'links: ' + links.inspect if @debug
+      url = links.last
+      puts 'url: ' + url.inspect if @debug
+      id = url[/(?<=jk=)[^&]+/]
+      if index[id.to_sym] then
+        # the vacancy record has previously been saved
+        #
+        next
+      else
+        # write the full page vacancy article to file
+        #
+        File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
+        h = {
+          link: url[/^[^&]+/],
+          title: item[:title].to_s,
+          salary: item[:salary].to_s,
+          company: item[:company].to_s.strip,
+          location: item[:location].to_s,
+          jobsnippet: item[:jobsnippet],
+          date: item[:date],
+          added: Time.now.strftime("%Y-%m-%d")
+        }
+        # add the vacancy snippet to the index file
+        #
+        index[id.to_sym] = h
+      end
     end
+    # save the vacancy index file
+    #
+    File.write idxfile, index.to_yaml
   end
   def list()
@@ -238,3 +318,39 @@ class IS22Plus < IndeedScraper2022
 end
+class IS22Archive
+  attr_reader :index
+  def initialize(filepath='/tmp/indeed', debug: false)
+    @debug = debug
+    FileUtils.mkdir_p filepath
+    @idxfile = File.join(filepath, 'index.yml')
+    @index = if File.exists? @idxfile then
+      YAML.load(File.read(@idxfile))
+    else
+      {}
+    end
+  end
+  def list()
+    @index.to_a.reverse.map.with_index do |x,i|
+      id, h = x
+      puts 'h: ' + h.inspect if @debug
+      co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
+      "%2d. %s: %s%s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title], co]
+    end.join("\n")
+  end
+end

data.tar.gz.sig CHANGED Viewed

Binary file

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: indeed_scraper2022
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.4.1
 platform: ruby
 authors:
 - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
   YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
   SW/2zInu2bkj/meWm5eBoWHT
   -----END CERTIFICATE-----
-date: 2022-03-30 00:00:00.000000000 Z
+date: 2022-04-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokorexi
@@ -63,20 +63,20 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.2'
+        version: '0.3'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.2.2
+        version: 0.3.1
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.2'
+        version: '0.3'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.2.2
+        version: 0.3.1
 description:
 email: digital.robertson@gmail.com
 executables: []

metadata.gz.sig CHANGED Viewed

Binary file