RubyGems - remote_job_scraper - Versions diffs - 0.4.2 → 0.4.4 - Mend

remote_job_scraper 0.4.2 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15)

checksums.yaml +4 -4
data/Gemfile.lock +1 -1
data/README.md +17 -1
data/lib/remote_job_scraper.rb +13 -39
data/lib/remote_job_scraper/cli.rb +75 -0
data/lib/remote_job_scraper/configuration.rb +10 -0
data/lib/remote_job_scraper/version.rb +1 -1
data/lib/sites/base.rb +14 -25
data/lib/sites/elixir_radar.rb +1 -1
data/lib/sites/jobs_rails42.rb +39 -30
data/lib/sites/remote_ok.rb +26 -16
data/lib/sites/we_work_remotely.rb +27 -18
data/lib/support/offer_parser.rb +4 -6
data/lib/support/spreadsheet_creator.rb +2 -1
metadata +4 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 72ba313413dc7997be5ff27cd80872eadf9a6a42
-  data.tar.gz: f60d2785e8577ef7a1e567d5753eba66f0957912
+  metadata.gz: 6629f970e902f9ec6789b5574156730e7c9066a0
+  data.tar.gz: 6b4eca4e512e432f4c4e642da57253e4756d0290
 SHA512:
-  metadata.gz: 4aecc4a83e5c4a9737db7feca2029effcae08a618796c3dbe8b87f81c4373e1eee25cdc7726ebca1b8c220650840d7d0d59298e1c20291a5dc52bdfd29b4eadd
-  data.tar.gz: 54382e1d93daf8f50fc92ee79cc80a3527e93a1fbdceb5006e2db1ea4de71e2523cfce148ded18d59296bd79e8dca8121800b3afbe93815fb6858959653da3c6
+  metadata.gz: 88559d103a0b1e5d70185641684e05f221d45fca0ea146b15324d34cc32c0ae40f42473f5564cc215410675f4b532e4a7a158fc659f73bfb6fb8c2d16208da11
+  data.tar.gz: f6e2e97da63b78a378200be0925221d80e569f4b1c3573cfa9928c3910332e5be834dc89bb4df0803c280b1a7ccff36e60519778d9c709295fe9ed1fa4612ed6

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    remote_job_scraper (0.4.1)
+    remote_job_scraper (0.4.3)
       nokogiri
       spreadsheet
       thor

data/README.md CHANGED Viewed

@@ -2,8 +2,9 @@
 Ruby gem that collects job offers for remote positions with ease.
-Going through many job listings and finding the right one may be a time-consuming process. That's why this tool has been built. It allows to automate the process, retrieve necessary data and store it in CSV/Excel file in just a few minutes. The main focus is to inform a user about the location (time-zone) required for a position.
+Going through many job listings and finding the right one may be a time-consuming process. That's why this tool has been built. It allows to automate the process, retrieve necessary data and store it in CSV/Excel file in just a few minutes. The main focus is to inform an user about the location (time-zone) required for a position.
+![screenshot](http://i67.tinypic.com/2ewfj3a.png)
 ## Installation
@@ -18,6 +19,21 @@ Going through many job listings and finding the right one may be a time-consumin
   * [x] 2.4.1
   * [ ] 2.0.0 (https://github.com/rafaltrojanowski/remote_job_scraper/issues/1)
+## Running tests
+Run the full test suite:
+```
+$ rspec spec
+```
+A few tests run very slowly because they parse a huge number of pages.
+You can skip slow tests by running:
+```
+$ rspec . --tag ~speed:slow
+```
 ## Development
 After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.

data/lib/remote_job_scraper.rb CHANGED Viewed

@@ -1,4 +1,6 @@
 require 'remote_job_scraper/version'
+require 'remote_job_scraper/configuration'
+require 'remote_job_scraper/cli'
 require 'sites/we_work_remotely'
 require 'sites/remote_ok'
@@ -11,51 +13,23 @@ require 'support/spreadsheet_creator'
 require 'nokogiri'
 require 'open-uri'
 require 'csv'
-require "thor"
 module RemoteJobScraper
-  AVAILABLE_SITES = %w(we_work_remotely remote_ok 42jobs_rails)
-  class CLI < Thor
-    desc 'collect_jobs', "Retrieves data from #{AVAILABLE_SITES.join(', ')}"
-    def collect_jobs
-      [Sites::WeWorkRemotely, Sites::RemoteOk].each do |klass|
-        klass.new.collect_jobs
-      end
-    end
-    desc 'collect_jobs_from SITE', "Retrieves data from SITE, e.g. #{AVAILABLE_SITES.sample}"
-    def collect_jobs_from(site)
-      case site
-      when 'we_work_remotely'
-        then Sites::WeWorkRemotely.new.collect_jobs
-      when 'remote_ok'
-        then Sites::RemoteOk.new.collect_jobs
-      when '42jobs_rails'
-        then Sites::JobsRails42.new.collect_jobs
-      else
-        raise "#{site} is not correct. Use: #{AVAILABLE_SITES.join(', ')}."
-      end
-    end
-    desc 'generate_summary', "Merges data from #{AVAILABLE_SITES.join(', ')} and exports to XLS file"
-    def generate_summary
-      Support::SpreadsheetCreator.generate
-    end
+  class << self
+    attr_accessor :configuration
+  end
-    desc 'remove DIRNAME', "Removes DIRNAME (default: 'data'). Use carefully."
-    def remove(dirname = 'data')
-      puts "[Warning!]"
-      puts "This command will remove #{Dir.pwd}/#{dirname} permanently"
-      puts "Press Ctrl-C to abort."
+  def self.configuration
+    @configuration ||= Configuration.new
+  end
-      sleep 3
+  def self.reset
+    @configuration = Configuration.new
+  end
-      FileUtils.rm_rf(dirname)
-      puts "Removed data in #{Dir.pwd}/#{dirname}."
-    end
+  def self.configure
+    yield(configuration)
   end
   def self.root

data/lib/remote_job_scraper/cli.rb ADDED Viewed

@@ -0,0 +1,75 @@
+require "thor"
+module RemoteJobScraper
+  class CLI < Thor
+    AVAILABLE_SITES = %w(we_work_remotely remote_ok 42jobs_rails)
+    desc 'collect_jobs LIMIT DELAY',
+      "Retrieves data from #{AVAILABLE_SITES.join(', ')}.
+       [Example]: remote_job_scraper collect_jobs 10 9.0..10.0
+      "
+    def collect_jobs(limit = nil, delay = nil)
+      limit = limit.to_i
+      limit = limit.zero? ? nil : limit
+      begin
+        unless delay.nil?
+          arr =  delay.split('..').map{ |d| Float(d) }
+          range = arr[0]..arr[1]
+          RemoteJobScraper.configuration.delay_range = range
+        end
+      rescue
+        raise "Passed: DELAY=#{range} DELAY need to be in format: 2.0..5.0 "
+      end
+      [
+        Sites::WeWorkRemotely,
+        Sites::RemoteOk,
+        Sites::JobsRails42
+      ].each do |klass|
+        klass.new.collect_jobs(limit: limit)
+      end
+    end
+    desc 'collect_jobs_from SITE LIMIT',
+      "Retrieves data from SITE with LIMIT, e.g. #{AVAILABLE_SITES.sample}
+       [Example]: remote_job_scraper collect_jobs_from remote_ok 10
+      "
+    def collect_jobs_from(site, limit=nil)
+      limit = limit.to_i
+      limit = limit.zero? ? nil : limit
+      case site
+      when 'we_work_remotely'
+        then Sites::WeWorkRemotely.new.collect_jobs(limit: limit)
+      when 'remote_ok'
+        then Sites::RemoteOk.new.collect_jobs(limit: limit)
+      when '42jobs_rails'
+        then Sites::JobsRails42.new.collect_jobs(limit: limit)
+      else
+        raise "#{site} is not correct. Use: #{AVAILABLE_SITES.join(', ')}."
+      end
+    end
+    desc 'generate_summary',
+      "Merges data from #{AVAILABLE_SITES.join(', ')} and exports to
+      separate sheets in XLS file.
+      "
+    def generate_summary
+      Support::SpreadsheetCreator.generate
+    end
+    desc 'remove DIRNAME', "Removes DIRNAME (default: 'data'). Use carefully."
+    def remove(dirname = 'data')
+      puts "[Warning!]"
+      puts "This command will remove #{Dir.pwd}/#{dirname} permanently"
+      puts "Press Ctrl-C to abort."
+      sleep 3
+      FileUtils.rm_rf(dirname)
+      puts "Removed data in #{Dir.pwd}/#{dirname}."
+    end
+  end
+end

data/lib/remote_job_scraper/configuration.rb ADDED Viewed

@@ -0,0 +1,10 @@
+module RemoteJobScraper
+  class Configuration
+    attr_accessor :delay_range
+    def initialize
+      @delay_range = 0..2.0
+    end
+  end
+end

data/lib/remote_job_scraper/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module RemoteJobScraper
-  VERSION = "0.4.2"
+  VERSION = "0.4.4"
 end

data/lib/sites/base.rb CHANGED Viewed

@@ -1,44 +1,33 @@
 module Sites
   class Base
-    attr_reader :job_type, :doc, :url
+    attr_reader :doc, :url, :rows_count, :jobs_count
-    def initialize(job_type: :programming)
-      @job_type = job_type
-      @url = build_url
+    def initialize
+      @url = "#{self.class::HOST}#{self.class::PATH}"
       @doc = Nokogiri::HTML(open_page(@url))
-      @current_time = Time.new
+      @current_time = Time.now
       @timestamp = @current_time.strftime("%Y%m%d%H%M%S")
-      @count = get_count
+      @rows_count = 0
+      @jobs_count = get_jobs_count
     end
-    def open_page(url)
-      sleep(rand(0..2.0)) unless ENV['RAILS_ENV'] == 'test' # less mechanical behaviour
+    private
-      if ENV['RAILS_ENV'] == 'test'
-        open(url)
-      else
-        open(url, 'User-Agent' => user_agent)
-      end
+    def open_page(url)
+      sleep(rand(delay_range)) unless ENV['RAILS_ENV'] == 'test' # less mechanical behaviour
+      options = ENV['RAILS_ENV'] == 'test' ? {} : { 'User-Agent' => user_agent }
+      open(url, options)
     end
-    private
+    def delay_range
+      RemoteJobScraper.configuration.delay_range
+    end
     def user_agent
       Support::UserAgent::LIST.sample
     end
-    def build_url
-      case job_type
-        when :programming
-          then "#{self.class::HOST}#{self.class::PROGRAMMING}"
-        when :devops
-          then "#{self.class::HOST}#{self.class::DEVOPS}"
-        else
-          raise "Error"
-        end
-    end
     def filepath
       return test_filepath if ENV["RAILS_ENV"] == 'test'
       "#{self.class::STORE_DIR}/#{@timestamp}.csv"

data/lib/sites/elixir_radar.rb CHANGED Viewed

@@ -4,7 +4,7 @@ module Sites
   class ElixirRadar < Base
     HOST = 'http://plataformatec.com.br/'.freeze
-    PROGRAMMING = '/elixir-radar/jobs'.freeze
+    PATH = '/elixir-radar/jobs'.freeze
   end
 end

data/lib/sites/jobs_rails42.rb CHANGED Viewed

@@ -1,62 +1,71 @@
 module Sites
   class JobsRails42 < Base
-    # @TODO/NOTE: There is pagination on this site, it would be cool to find a way
-    # to grab more offers than just first page (25 items)
-    # I had to rename this class because we are not allowed to have numbers
+    # @NOTE: I had to rename this class because we are not allowed to have numbers
     # on the beginning of the class name (42JobsRails won't work).
+    # file paths follow this convention
     HOST = 'https://www.42jobs.io'.freeze
-    PROGRAMMING = '/rails/jobs-remote'.freeze
+    PATH = '/rails/jobs-remote'.freeze
     JOB_ITEM_SELECTOR = 'li.job-offers__item a'.freeze
     STORE_DIR = 'data/jobs_rails42'.freeze
-    def initialize(job_type: :programming, total_pages: 4)
-      @job_type = job_type
-      @url = build_url
-      @doc = nil
-      @current_time = Time.new
+    def initialize
+      @url = "#{self.class::HOST}#{self.class::PATH}"
+      @current_time = Time.now
       @timestamp = @current_time.strftime("%Y%m%d%H%M%S")
-      @total_pages = total_pages
-      @count = get_count
+      @doc = nil
+      @total_pages = 4
+      @rows_count = 0
+      @jobs_count = get_jobs_count
     end
-    def collect_jobs
-      (1..@total_pages).to_a.each do |page|
-        current_page = "#{@url}?page=#{page}"
-        doc = Nokogiri::HTML(open_page(current_page))
-        process_page(doc, current_page, page)
+    def collect_jobs(limit: nil)
+      FileUtils.mkdir_p STORE_DIR
+      (1..@total_pages).each do |page|
+        process_page(page: page, limit: limit)
       end
     end
     private
-    def process_page(doc, page_url, page)
-      puts "[Info] Getting the data from #{page_url} at #{@current_time}..."
-      FileUtils.mkdir_p STORE_DIR
+    def process_page(page:, limit:)
+      current_page = "#{@url}?page=#{page}"
+      doc = Nokogiri::HTML(open_page(current_page))
+      puts "[Info] Getting the data from #{current_page}"
       CSV.open(filepath, 'ab') do |csv|
         doc.css(JOB_ITEM_SELECTOR).each do |link|
+          return if limit == @rows_count
           job_url = "#{HOST}#{link["href"]}"
-          puts "[Info] Processing #{job_url}..."
-          job_page = Nokogiri::HTML(open_page(job_url))
-          offer_text = job_page.css('.job-offer__description').to_s
+          puts "[Info] Parsing #{job_url}..."
-          location = Support::OfferParser.get_location(offer_text)
-          keywords = Support::OfferParser.get_keywords(offer_text)
+          csv << get_row(job_url)
-          csv << [job_url, location, keywords]
+          @rows_count += 1
         end
       end
-      puts "[Done] Collected #{@count} job offers from #{url}. Data stores in: #{filepath}." if page == @total_pages
+      puts "[Done] Collected #{@jobs_count} job offers from #{url}. Data stored in: #{filepath}." if page == @total_pages
     end
-    private
+    def get_row(job_url)
+      job_page = Nokogiri::HTML(open_page(job_url))
+      offer_text = job_page.css('.job-offer__description').to_s
+      location = Support::OfferParser.get_location(offer_text)
+      keywords = Support::OfferParser.get_keywords(offer_text)
+      company = job_page.css('.job-offer__summary a').text
+      [job_url, location, keywords, company]
+    end
-    def get_count
-      25 * @total_pages
+    def get_jobs_count
+      jobs_count = 25 * @total_pages
+      puts "[Info] There are #{jobs_count} remote jobs on [42JobsRails]."
+      jobs_count
     end
   end
 end

data/lib/sites/remote_ok.rb CHANGED Viewed

@@ -4,41 +4,51 @@ module Sites
   class RemoteOk < Base
     HOST = 'https://remoteok.io'.freeze
-    PROGRAMMING = '/remote-dev-jobs'.freeze
+    PATH = '/remote-dev-jobs'.freeze
     JOB_ITEM_SELECTOR = 'tr.job'.freeze
     STORE_DIR = 'data/remote_ok'.freeze
-    def initialize(args = {})
-      super(args = {})
+    def initialize
+      super
     end
-    def collect_jobs
-      puts "[Info] Getting the data from #{url} at #{@current_time}..."
+    def collect_jobs(limit: nil)
+      puts "[Info] Getting the data from #{url}"
       FileUtils.mkdir_p STORE_DIR
       CSV.open(filepath, 'w') do |csv|
         doc.css(JOB_ITEM_SELECTOR).each do |link|
+          return if limit == @rows_count
           job_url = "#{HOST}#{link["data-url"]}"
-          puts "[Info] Processing #{job_url}..."
-          job_page = Nokogiri::HTML(open_page(job_url))
-          offer_text = job_page.css('td.heading').to_s
+          puts "[Info] Parsing #{job_url}..."
-          location = Support::OfferParser.get_location(offer_text)
-          keywords = Support::OfferParser.get_keywords(offer_text)
+          csv << get_row(job_url)
-          csv << [job_url, location, keywords]
+          @rows_count += 1
         end
       end
-      puts "[Done] Collected #{@count} job offers from #{url}. Data stores in: #{filepath}."
+      puts "[Done] Collected #{@rows_count} job offers from #{url}. Data stored in: #{filepath}."
     end
     private
-    def get_count
-      count = doc.css(JOB_ITEM_SELECTOR).map { |link| link['data-url'] }.size
-      puts "[Info] There is #{count} remote jobs available."
-      count
+    def get_row(job_url)
+      job_page = Nokogiri::HTML(open_page(job_url))
+      offer_text = job_page.css('td.heading').to_s
+      location = Support::OfferParser.get_location(offer_text)
+      keywords = Support::OfferParser.get_keywords(offer_text)
+      company = job_page.css('a.companyLink h3').text
+      [job_url, location, keywords, company]
+    end
+    def get_jobs_count
+      jobs_count = doc.css(JOB_ITEM_SELECTOR).map { |link| link['data-url'] }.size
+      puts "[Info] There are #{jobs_count} remote jobs on [RemoteOK]."
+      jobs_count
     end
   end
 end

data/lib/sites/we_work_remotely.rb CHANGED Viewed

@@ -4,49 +4,58 @@ module Sites
   class WeWorkRemotely < Base
     HOST = 'https://weworkremotely.com'.freeze
-    PROGRAMMING = '/categories/remote-programming-jobs'.freeze
+    PATH = '/categories/remote-programming-jobs'.freeze
     DEVOPS     = '/categories/remote-devops-sysadmin-jobs'.freeze
     JOB_ITEM_SELECTOR = '.jobs-container li a'.freeze
     STORE_DIR = 'data/we_work_remotely'
-    def initialize(args = {})
-      super(args = {})
+    def initialize
+      super
     end
-    def collect_jobs
-      puts "[Info] Getting the data from #{url} at #{@current_time}..."
+    def collect_jobs(limit: nil)
+      puts "[Info] Getting the data from #{url}"
       FileUtils.mkdir_p STORE_DIR
       CSV.open(filepath, 'w') do |csv|
         doc.css(JOB_ITEM_SELECTOR).each do |link|
           if link["href"].start_with?("/remote-jobs")
-            job_url = "#{HOST}#{link["href"]}"
-            puts "[Info] Processing #{job_url}..."
-            job_page = Nokogiri::HTML(open_page(job_url))
-            offer_text = job_page.css('.listing-container').to_s
+            return if limit == @rows_count
-            region = job_page.css('span.region').first
-            location = job_page.css('span.location').first
+            job_url = "#{HOST}#{link["href"]}"
+            puts "[Info] Parsing #{job_url}..."
-            keywords = Support::OfferParser.get_keywords(offer_text)
+            csv << get_row(job_url)
-            csv << [job_url, location, region, keywords]
+            @rows_count += 1
           end
         end
       end
-      puts "[Done] Collected #{@count} job offers from #{url}. Data stores in: #{filepath}."
+      puts "[Done] Collected #{@rows_count} job offers from #{url}. Data stored in: #{filepath}."
     end
     private
-    def get_count
-      count = doc.css(JOB_ITEM_SELECTOR)
+    def get_row(job_url)
+      job_page = Nokogiri::HTML(open_page(job_url))
+      offer_text = job_page.css('.listing-container').to_s
+      region = job_page.css('.listing-header-container span.region').first
+      location = job_page.css('.listing-header-container span.location').first
+      keywords = Support::OfferParser.get_keywords(offer_text)
+      company = job_page.css('.listing-header-container span.company').first
+      [job_url, location, region, keywords, company]
+    end
+    def get_jobs_count
+      jobs_count = doc.css(JOB_ITEM_SELECTOR)
         .map { |link| link['href'] }
         .select { |href| href.start_with?('/remote-jobs') }
         .size
-      puts "[Info] There is #{count} remote jobs available."
-      count
+      puts "[Info] There are #{jobs_count} remote jobs on [WeWorkRemotely]."
+      jobs_count
     end
   end
 end

data/lib/support/offer_parser.rb CHANGED Viewed

@@ -19,11 +19,11 @@ module Support
       indexes.each do |index|
         next if index[0].nil?
-        locations << tokens[index[0] + 1].gsub(',', '') if index[1] == 'location'
-        locations << tokens[index[0] - 1].gsub(',', '') if index[1] == 'based'
+        locations << tokens[index[0] + 1] if index[1] == 'location'
+        locations << tokens[index[0] - 1..index[0] + 2] if index[1] == 'based'
       end
-      locations.join(', ').capitalize
+      locations.join(' ').capitalize
     end
     def self.get_keywords(content, keywords = KEYWORDS)
@@ -42,9 +42,7 @@ module Support
     def self.get_tokens(content)
       content
-        .gsub('.', '')
-        .gsub(',', '')
-        .gsub(':', '')
+        .gsub(/\W+/, ' ') # remove non letters
         .downcase
         .split(/[\s-]/)
     end

data/lib/support/spreadsheet_creator.rb CHANGED Viewed

@@ -28,7 +28,8 @@ module Support
         0 => 90,
         1 => 20,
         2 => 20,
-        3 => 20
+        3 => 20,
+        4 => 20
       }
     end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: remote_job_scraper
 version: !ruby/object:Gem::Version
-  version: 0.4.2
+  version: 0.4.4
 platform: ruby
 authors:
 - Rafał Trojanowski
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-01-28 00:00:00.000000000 Z
+date: 2019-01-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -162,6 +162,8 @@ files:
 - bin/setup
 - exe/remote_job_scraper
 - lib/remote_job_scraper.rb
+- lib/remote_job_scraper/cli.rb
+- lib/remote_job_scraper/configuration.rb
 - lib/remote_job_scraper/version.rb
 - lib/sites/base.rb
 - lib/sites/elixir_radar.rb