RubyGems - indeed_scraper2022 - Versions diffs - 0.1.1 → 0.2.0 - Mend

indeed_scraper2022 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 50a484cf1a272522091413129241620336f12ca94d795b7ab132dd6911802d1c
-  data.tar.gz: 06dffee1253aa5076da9b6897bc48009687e8df86aa1a3629ee1d8a4432fdd13
+  metadata.gz: 07f5323381b5751c470454f6f4c3ba6dced6f1424e054b85360a49d814d662ba
+  data.tar.gz: 3d25353b9f8a0543944cac82ef6dc91adf7d3e83444f3c6ef469f15cbba8a3d8
 SHA512:
-  metadata.gz: 7327fc5bf9668c4f292eabf673574bfd7ca9fbf180133896c559ac1b7415d4ee880365302d24550aba4b83ee2a709a77bdb059145310e4a5de21840fe11a5058
-  data.tar.gz: a84b587275793166a7ac40d63607c8ea2f8dd340a3197ce782f901ca8c27de27b8eeef7f36fc9399aba286ddc399da109b2d14cd9db99b3c6a4a545e6ad9f21c
+  metadata.gz: 5e13ae04b46bfa3eb15aab8d0aff388d8caec591c413493db591c37da099d2bcd5ba340a72137d4aa7d374652b68bc1d037b86fe4cc2ed2ae5b0a56c5202f00b
+  data.tar.gz: ca14ae99251aabbcaee08a3bb6f240742ed1fab0f438496dc742ef39a10abb13e310b2d6a93bc472f5e1b3e45cfd8956d6a62f803b1d3a152054cf4e1ae35402

checksums.yaml.gz.sig CHANGED Viewed

Binary file

data/lib/indeed_scraper2022.rb CHANGED Viewed

@@ -2,20 +2,25 @@
 # file: indeed_scraper2022.rb
-require 'mechanize'
+require 'ferrumwizard'
 require 'nokorexi'
 # Given the nature of changes to jobsearch websites,
 # don't rely upon this gem working in the near future.
+class IndeedScraper2022Err < Exception
+end
 class IndeedScraper2022
-  def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '', debug: false)
+  def initialize(url='https://uk.indeed.com/?r=us', q: '', location: '',
+                 headless: true, cookies: nil, debug: false)
     @debug = debug
     @url_base, @q, @location = url, q, location
-    @results = search
+    @headless, @cookies = headless, cookies
+    @results = search(q: @q, location: @location)
   end
@@ -25,7 +30,93 @@ class IndeedScraper2022
     @results
   end
+  def search(q: @q, location: @location, start: nil)
+    fw = FerrumWizard.new( headless: @headless, cookies: @cookies, debug: @debug)
+    url = @url_base
+    url += 'start=' + start if start
+    browser = fw.browser
+    browser.goto(url)
+    if q.length > 1 then
+      input = browser.at_xpath("//input[@name='q']")
+      input.focus.type(q)
+    end
+    if location.length > 1 then
+      input2 = browser.at_xpath("//input[@name='l']")
+      input2.focus.type(location)
+    end
+    button = browser.at_xpath("//button[@type='submit']")
+    button.click
+    doc2 = Nokogiri::XML(browser.body)
+    a2 = doc2.xpath  "//a[div/div/div/div/table/tbody/tr/td/div]"
+    puts 'a2: ' + a2.length.inspect if @debug
+    @a2 = a2.map {|x| Rexle.new x.to_s }
+    @a2.map do |doc|
+      div = doc.element("a[@class='desktop']/div[@class='slider"  \
+          "_container']/div[@class='slider_list']/div[@class='sl"  \
+          "ider_item']/div[@class='job_seen_beacon']")
+      td = div.element("table[@class='jobCard_mainContent']/tbo"  \
+          "dy/tr/td[@class='resultContent']")
+      # job title (e.g. Software Developer)
+      jobtitle = td.element("div[@class='tapItem-gutter']/h2[@"  \
+          "class='jobTitle-color-purple']/span")&.text
+      puts 'jobtitle: ' + jobtitle.inspect if @debug
+      salary = td.element("div[@class='metadataContainer']/"  \
+          "div[@class='salary-snippet-container']/div[@class='sa"  \
+          "lary-snippet']/span")&.text
+      puts 'salary: ' + salary.inspect if @debug
+      div1 = td.element("div[@class='companyInfo']")
+      # company name (e.g. Coda Octopus Products Ltd)
+      company_name = div1.element("span[@class='companyName']")&.text
+      # company location (e.g. Edinburgh)
+      location = div1.element("div[@class='companyLocation']")&.text
+      tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
+      div3 = tbody.element("tr[@class='underShelfFooter']/td/di"  \
+          "v[@class='result-footer']")
+      # job (e.g. Our products are primarily written in C#, using...)
+      jobsnippet = div3.xpath("div[@class='job-snippet']/ul/li/text()").join("\n")
+      # visually (e.g. Posted 14 days ago)
+      dateposted =  div3.element("span[@class='date']")&.texts
+      date = (Date.today - dateposted.first.to_i).to_s if dateposted
+      {
+        link:  @url_base.sub(/\/[^\/]+$/,'') \
+          + doc.root.attributes[:href].gsub(/&amp;/,'&'),
+        title: jobtitle,
+        salary: salary,
+        company: company_name,
+        location: location,
+        jobsnippet: jobsnippet,
+        date: date
+      }
+    end
+  end
   def page(n)
+    if n < 1 or n > @results.length then
+      raise IndeedScraper2022Err, 'Invalid page no.'
+    end
     url = @results[n-1][:link]
     fetchjob(url)
   end
@@ -44,7 +135,7 @@ class IndeedScraper2022
     # jobsearch (e.g. Full Stack Website Developer (Wordpress))
     jobtitle = div2.element("div[@class='jobsearch-JobInfoHead"  \
         "er-title-container']/h1[@class='jobsearch-JobInfoHead"  \
-        "er-title']").text
+        "er-title']")&.text
     div3 = div2.element("div[@class='jobsearch-CompanyInfoCon"  \
         "tainer']/div[@class='jobsearch-CompanyInfoWithoutHead"  \
@@ -56,110 +147,74 @@ class IndeedScraper2022
         "ickyContainer-companyrating']/div/div[@class='icl-u-x"  \
         "s-mr--xs']")[1]
     clink = div3.element('//a')
-    company = cname ? cname.text : clink.text
+    company = cname.text ? cname.text : clink.text
     companylink = clink.attributes[:href] if clink
+    salary = div1.element("//span[@class='attribute_snippet']")&.text
+    type = div1.element("//span[@class='jobsearch-JobMetadataHeader-item']")&.texts&.last
     div5 = div3.xpath("div/div")
     location, worklocation = div5.map(&:text).compact
     # icl (e.g. Full-time, Permanent)
     jobtype = div1.element("div/div/div[@class='jobsearch-J"  \
         "obMetadataHeader-item']/span[@class='icl-u-xs-mt--xs']")
-    jobtype = jobtype.texts.join if jobtype
+    jobtype = jobtype&.texts.join if jobtype
     # jobsearch (e.g. Urgently needed)
     jobnote1 = e0.element("//div[@class='jobsearch-DesktopTag"  \
         "']/div[@class='urgently-hiring']/div[@class='jobsearc"  \
-        "h-DesktopTag-text']")
-    jobnote1 = jobnote1.text if jobnote1
+        "h-DesktopTag-text']")&.text
     # jobsearch (e.g. 10 days ago)
-    datepost = e0.element("//div[@class='jobsearch-JobTab-con"  \
-        "tent']/div[@class='jobsearch-JobMetadataFooter']/div").text
+    days = e0.element("//div[@class='jobsearch-JobTab-con"  \
+        "tent']/div[@class='jobsearch-JobMetadataFooter']/div[2]")&.text
+    d = Date.today - days.to_i
+    datepost = d.strftime("%Y-%m-%d")
     jobdesc = e0.element("//div[@class='icl-u-xs-mt--md']/div[@cl"  \
-        "ass='jobsearch-jobDescriptionText']")
+        "ass='jobsearch-jobDescriptionText']").xml
     {
       title: jobtitle,
+      type: type,
       company: company,
       companylink: companylink,
       location: location,
+      salary: salary,
       worklocation: worklocation,
       note: jobnote1,
-      date: (Date.today - datepost.to_i).to_s,
+      date: datepost,
       desc: jobdesc
     }
   end
-  def search(q='', location='')
-    a = Mechanize.new
-    page = a.get(@url_base)
-    form = page.forms.first
-    form.fields[0].value = @q
-    form.fields[1].value = @location
-    pg = form.submit
-    doc2 = Nokogiri::XML(pg.body)
-    a2 = doc2.xpath  "//a[div/div/div/div/table/tbody/tr/td/div]"
-    puts 'a2: ' + a2.length.inspect if @debug
-    @a2 = a2.map {|x| Rexle.new x.to_s }
-    @a2.map do |doc|
-      div = doc.element("a[@class='desktop']/div[@class='slider"  \
-          "_container']/div[@class='slider_list']/div[@class='sl"  \
-          "ider_item']/div[@class='job_seen_beacon']")
-      td = div.element("table[@class='jobCard_mainContent']/tbo"  \
-          "dy/tr/td[@class='resultContent']")
-      # job title (e.g. Software Developer)
-      jobtitle = td.element("div[@class='tapItem-gutter']/h2[@"  \
-          "class='jobTitle-color-purple']/span").text
-      puts 'jobtitle: ' + jobtitle.inspect if @debug
+end
-      salary = td.element("div[@class='metadataContainer']/"  \
-          "div[@class='salary-snippet-container']/div[@class='sa"  \
-          "lary-snippet']/span")
-      salary = salary.text if salary
-      puts 'salary: ' + salary.inspect if @debug
-      div1 = td.element("div[@class='companyInfo']")
+class IS22Plus < IndeedScraper2022
-      # company name (e.g. Coda Octopus Products Ltd)
-      company_name = div1.element("span[@class='companyName']").text
+  def initialize(q: '', location: '', headless: true, cookies: nil, debug: false)
+    super(q: q, location: location, headless: headless, cookies: cookies,
+          debug: debug)
+  end
-      # company location (e.g. Edinburgh)
-      location = div1.element("div[@class='companyLocation']").text
-      tbody = div.element("table[@class='jobCardShelfContainer']/tbody")
+  def archive()
-      div3 = tbody.element("tr[@class='underShelfFooter']/td/di"  \
-          "v[@class='result-footer']")
+    1.upto(15).each do |n|
+      page(n)
+    end
-      # job (e.g. Our products are primarily written in C#, using...)
-      jobsnippet = div3.element("div[@class='job-snippet']/ul/li").text
+  end
-      # visually (e.g. Posted 14 days ago)
-      dateposted =  div3.element("span[@class='date']").texts
-      date = (Date.today - dateposted.first.to_i).to_s
+  def list()
-      {
-        link:  @url_base.sub(/\/[^\/]+$/,'') \
-          + doc.root.attributes[:href].gsub(/&amp;/,'&'),
-        title: jobtitle,
-        salary: salary,
-        company: company_name,
-        location: location,
-        jobsnippet: jobsnippet,
-        date: date
-      }
+    @results.map.with_index do |x,i|
+      "%2d. %s" % [i+1,x[:title]]
+    end.join("\n")
-    end
   end
-end
+end

data.tar.gz.sig CHANGED Viewed

Binary file

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: indeed_scraper2022
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.2.0
 platform: ruby
 authors:
 - James Robertson
@@ -35,7 +35,7 @@ cert_chain:
   YLGLkwmbiUHX5dRU3RwOwOrZiyvND5BIj7S6dZ6jYHe0I727apgQNc3swTz5mW6I
   SW/2zInu2bkj/meWm5eBoWHT
   -----END CERTIFICATE-----
-date: 2022-01-25 00:00:00.000000000 Z
+date: 2022-03-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokorexi
@@ -43,42 +43,42 @@ dependencies:
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.5'
+        version: '0.7'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.5.5
+        version: 0.7.0
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '0.5'
+        version: '0.7'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 0.5.5
+        version: 0.7.0
 - !ruby/object:Gem::Dependency
-  name: mechanize
+  name: ferrumwizard
   requirement: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.8'
+        version: '0.2'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 2.8.4
+        version: 0.2.2
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - "~>"
       - !ruby/object:Gem::Version
-        version: '2.8'
+        version: '0.2'
     - - ">="
       - !ruby/object:Gem::Version
-        version: 2.8.4
+        version: 0.2.2
 description:
-email: james@jamesrobertson.eu
+email: digital.robertson@gmail.com
 executables: []
 extensions: []
 extra_rdoc_files: []
@@ -96,15 +96,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '0'
+      version: 2.3.0
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.7.10
+rubygems_version: 3.2.22
 signing_key:
 specification_version: 4
 summary: Attempts to scrape the indeed.com jobsearch results (1 page).

metadata.gz.sig CHANGED Viewed

Binary file