RubyGems - indeed_scraper2022 - Versions diffs - 0.2.1 → 0.4.1 - Mend

indeed_scraper2022 0.2.1 → 0.4.1

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 5e33dfd54667ecc9f8b7985aa07af403be8d95729ce68e8a40d8c985d57bd4e1
-  data.tar.gz: a2c041ec8103b6afac3a422e7b73bc82c89fd7f8d955240439a29ec0347c8a5f
+  metadata.gz: 88f80a06ef0ab435c144d3b4ec53f1c98f2da7c427224c31dbef44f62fdafee3
+  data.tar.gz: d0f549053bb225e7c8ebb2492715c6c470f689e191e9ae747e8f97317a61c02c
 SHA512:
-  metadata.gz: 8e640cb8262a057bb588b501ee1122a59e6e239e2a5988dd0566ffffb814a2fef763c36fdeae1ba5dc4e6f819ca145374058bd62373ce776df1e393057a49fc0
-  data.tar.gz: 0a6bfe0ef2b685d5711a95704cee3fa67d58eb7c9d0f149c872f9c23b0cc489382ab1327b101f7accf982f7ea1f1a6d56dc20ae528ef1fd4d6105c9ef93067da
+  metadata.gz: 90d23c6c35a87cdcf763dc15a072f35ede166aafe2f1f5eed9294de28e916cdaca3eb043a034fb50750c9444af9e5042dc50d01390816efcdde360d2d01c4e55
+  data.tar.gz: 30153c9c5aafdb5e89d56632e223edfe0196a71c4d9294cc3f963f2039238cd27b836ac9ac42b49cad27988ada868d0d002246699399f7d175216b97689d46ed

checksums.yaml.gz.sig CHANGED Viewed

Binary file

data/lib/indeed_scraper2022.rb CHANGED Viewed

@@ -4,11 +4,20 @@
 require 'ferrumwizard'
 require 'nokorexi'
+require 'yaml'
 # Given the nature of changes to jobsearch websites,
 # don't rely upon this gem working in the near future.
+# This gem consists of 3 main classes:
+#
+# *  IndeedScraper2022 - Scrapes a page of vacancies from indeed.com
+# *  IS22Plus - Archives the scraped vacancies to a local file
+# *  IS22Archive - Allows viewing of archived vacancies offline
+#
 class IndeedScraper2022Err < Exception
 end
@@ -99,7 +108,9 @@ class IndeedScraper2022
       div1 = td.element("div[@class='companyInfo']")
       # company name (e.g. Coda Octopus Products Ltd)
-      company_name = div1.element("span[@class='companyName']")&.text
+      coname = div1.element("span[@class='companyName']")
+      puts 'coname: ' + coname.text.inspect if @debug
+      company_name = coname.text.to_s.strip.length > 1 ? coname.text : coname.element('a').text
       # company location (e.g. Edinburgh)
       location = div1.element("div[@class='companyLocation']")&.text
@@ -144,52 +155,65 @@ class IndeedScraper2022
   def fetchjob(url)
     doc = Nokorexi.new(url).to_doc
+    puts 'before e0' if @debug
     e0 = doc.element("html/body/div/div/div/div/div/div/div/div")
     #div = e0.element("//div[@class='jobsearch-JobComponent']")
+    puts 'before div1' if @debug
     div1 = e0.element("//div[@class='jobsearch-DesktopStickyContainer']")
+    puts 'before div2' if @debug
     div2 = div1.element("div")
     # jobsearch (e.g. Full Stack Website Developer (Wordpress))
+    puts 'before jobtitle' if @debug
     jobtitle = div2.element("div[@class='jobsearch-JobInfoHead"  \
         "er-title-container']/h1[@class='jobsearch-JobInfoHead"  \
         "er-title']")&.text
+    puts 'before div3' if @debug
     div3 = div2.element("div[@class='jobsearch-CompanyInfoCon"  \
         "tainer']/div[@class='jobsearch-CompanyInfoWithoutHead"  \
         "erImage']/div/div[@class='jobsearch-DesktopStickyCont"  \
         "ainer-subtitle']")
     # icl (e.g. Lyles Sutherland)
+    puts 'before cname' if @debug
     cname = div3.xpath("div[@class='jobsearch-DesktopSt"  \
         "ickyContainer-companyrating']/div/div[@class='icl-u-x"  \
         "s-mr--xs']")[1]
+    puts 'before clink' if @debug
     clink = div3.element('//a')
     company = cname.text ? cname.text : clink.text
     companylink = clink.attributes[:href] if clink
+    puts 'before salary' if @debug
     salary = div1.element("//span[@class='attribute_snippet']")&.text
+    puts 'before type' if @debug
     type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
     div5 = div3.xpath("div/div")
     location, worklocation = div5.map(&:text).compact
     # icl (e.g. Full-time, Permanent)
+    puts 'before jobtype' if @debug
     jobtype = div1.element("div/div/div[@class='jobsearch-J"  \
         "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
     jobtype = jobtype&.texts.join if jobtype
     # jobsearch (e.g. Urgently needed)
+    puts 'before jobnote1' if @debug
     jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag"  \
         "']/div[@class='urgently-hiring']/div[@class='jobsearc"  \
         "h-DesktopTag-text']")&.text
     # jobsearch (e.g. 10 days ago)
+    puts 'before days' if @debug
     days = e0.element("//div[@class='jobsearch-JobTab-con"  \
         "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
     d = Date.today - days.to_i
     datepost = d.strftime("%Y-%m-%d")
+    puts 'before jobdesc' if @debug
     jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl"  \
         "ass='jobsearch-jobDescriptionText']").xml
@@ -218,14 +242,70 @@ class IS22Plus < IndeedScraper2022
           debug: debug)
   end
-  def archive()
+  # Note: the most efficient way to accumulate vacancy articles is to
+  #       run archive() daily.
+  #
+  def archive(filepath='/tmp/indeed')
+    search() if @results.nil?
     return unless @results
-    1.upto(@results.length).each do |n|
-      page(n)
+    FileUtils.mkdir_p filepath
+    idxfile = File.join(filepath, 'index.yml')
+    index = if File.exists? idxfile then
+      YAML.load(File.read(idxfile))
+    else
+      {}
+    end
+    @results.each.with_index do |item, i|
+      puts 'saving ' + item[:title] if @debug
+      puts 'link: ' + item[:link].inspect
+      links = RXFReader.reveal(item[:link])
+      puts 'links: ' + links.inspect if @debug
+      url = links.last
+      puts 'url: ' + url.inspect if @debug
+      id = url[/(?<=jk=)[^&]+/]
+      if index[id.to_sym] then
+        # the vacancy record has previously been saved
+        #
+        next
+      else
+        # write the full page vacancy article to file
+        #
+        File.write File.join(filepath, 'j' + id + '.txt'), page(i+1)
+        h = {
+          link: url[/^[^&]+/],
+          title: item[:title].to_s,
+          salary: item[:salary].to_s,
+          company: item[:company].to_s.strip,
+          location: item[:location].to_s,
+          jobsnippet: item[:jobsnippet],
+          date: item[:date],
+          added: Time.now.strftime("%Y-%m-%d")
+        }
+        # add the vacancy snippet to the index file
+        #
+        index[id.to_sym] = h
+      end
     end
+    # save the vacancy index file
+    #
+    File.write idxfile, index.to_yaml
   end
   def list()
@@ -238,3 +318,39 @@ class IS22Plus < IndeedScraper2022
 end
+class IS22Archive
+  attr_reader :index
+  def initialize(filepath='/tmp/indeed', debug: false)
+    @debug = debug
+    FileUtils.mkdir_p filepath
+    @idxfile = File.join(filepath, 'index.yml')
+    @index = if File.exists? @idxfile then
+      YAML.load(File.read(@idxfile))
+    else
+      {}
+    end
+  end
+  def list()
+    @index.to_a.reverse.map.with_index do |x,i|
+      id, h = x
+      puts 'h: ' + h.inspect if @debug
+      co = h[:company].length > 1 ? " (%s)" % h[:company] : ''
+      "%2d. %s: %s%s" % [i+1, Date.parse(h[:added]).strftime("%d %b"), h[:title], co]
+    end.join("\n")
+  end
+end

data.tar.gz.sig CHANGED Viewed

Binary file

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: indeed_scraper2022
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.4.1
 platform: ruby
 authors:
 - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
   YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
   SW/2zInu2bkj/meWm5eBoWHT
   -----END CERTIFICATE-----
-date: 2022-03-30 00:00:00.000000000 Z
+date: 2022-04-16 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokorexi
@@ -63,20 +63,20 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.2'
+        version: '0.3'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.2.2
+        version: 0.3.1
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.2'
+        version: '0.3'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.2.2
+        version: 0.3.1
 description:
 email: digital.robertson@gmail.com
 executables: []

metadata.gz.sig CHANGED Viewed

Binary file