RubyGems - remote_job_scraper - Versions diffs - 0.4.2 → 0.4.4 - Mend

remote_job_scraper 0.4.2 → 0.4.4

Files changed (15)

checksums.yaml +4 -4
data/Gemfile.lock +1 -1
data/README.md +17 -1
data/lib/remote_job_scraper.rb +13 -39
data/lib/remote_job_scraper/cli.rb +75 -0
data/lib/remote_job_scraper/configuration.rb +10 -0
data/lib/remote_job_scraper/version.rb +1 -1
data/lib/sites/base.rb +14 -25
data/lib/sites/elixir_radar.rb +1 -1
data/lib/sites/jobs_rails42.rb +39 -30
data/lib/sites/remote_ok.rb +26 -16
data/lib/sites/we_work_remotely.rb +27 -18
data/lib/support/offer_parser.rb +4 -6
data/lib/support/spreadsheet_creator.rb +2 -1
metadata +4 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 72ba313413dc7997be5ff27cd80872eadf9a6a42
-  data.tar.gz: f60d2785e8577ef7a1e567d5753eba66f0957912
+  metadata.gz: 6629f970e902f9ec6789b5574156730e7c9066a0
+  data.tar.gz: 6b4eca4e512e432f4c4e642da57253e4756d0290
 SHA512:
-  metadata.gz: 4aecc4a83e5c4a9737db7feca2029effcae08a618796c3dbe8b87f81c4373e1eee25cdc7726ebca1b8c220650840d7d0d59298e1c20291a5dc52bdfd29b4eadd
-  data.tar.gz: 54382e1d93daf8f50fc92ee79cc80a3527e93a1fbdceb5006e2db1ea4de71e2523cfce148ded18d59296bd79e8dca8121800b3afbe93815fb6858959653da3c6
+  metadata.gz: 88559d103a0b1e5d70185641684e05f221d45fca0ea146b15324d34cc32c0ae40f42473f5564cc215410675f4b532e4a7a158fc659f73bfb6fb8c2d16208da11
+  data.tar.gz: f6e2e97da63b78a378200be0925221d80e569f4b1c3573cfa9928c3910332e5be834dc89bb4df0803c280b1a7ccff36e60519778d9c709295fe9ed1fa4612ed6

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    remote_job_scraper (0.4.1)
+    remote_job_scraper (0.4.3)
       nokogiri
       spreadsheet
       thor

data/README.md CHANGED Viewed

@@ -2,8 +2,9 @@
 Ruby gem that collects job offers for remote positions with ease.
-Going through many job listings and finding the right one may be a time-consuming process. That's why this tool has been built. It allows to automate the process, retrieve necessary data and store it in CSV/Excel file in just a few minutes. The main focus is to inform a user about the location (time-zone) required for a position.
+Going through many job listings and finding the right one may be a time-consuming process. That's why this tool has been built. It lets you automate the process, retrieve the necessary data, and store it in a CSV/Excel file in just a few minutes. The main focus is to inform a user about the location (time-zone) required for a position.
+![screenshot](http://i67.tinypic.com/2ewfj3a.png)
 ## Installation
@@ -18,6 +19,21 @@ Going through many job listings and finding the right one may be a time-consumin
   * [x] 2.4.1
   * [ ] 2.0.0 (https://github.com/rafaltrojanowski/remote_job_scraper/issues/1)
+## Running tests
+Run the full test suite:
+```
+$ rspec spec
+```
+A few tests run very slowly because they parse a large number of pages.
+You can skip slow tests by running:
+```
+$ rspec . --tag ~speed:slow
+```
 ## Development
 After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.

data/lib/remote_job_scraper.rb CHANGED Viewed

@@ -1,4 +1,6 @@
 require 'remote_job_scraper/version'
+require 'remote_job_scraper/configuration'
+require 'remote_job_scraper/cli'
 require 'sites/we_work_remotely'
 require 'sites/remote_ok'
@@ -11,51 +13,23 @@ require 'support/spreadsheet_creator'
 require 'nokogiri'
 require 'open-uri'
 require 'csv'
-require "thor"
 module RemoteJobScraper
-  AVAILABLE_SITES = %w(we_work_remotely remote_ok 42jobs_rails)
-  class CLI < Thor
-    desc 'collect_jobs', "Retrieves data from #{AVAILABLE_SITES.join(', ')}"
-    def collect_jobs
-      [Sites::WeWorkRemotely, Sites::RemoteOk].each do |klass|
-        klass.new.collect_jobs
-      end
-    end
-    desc 'collect_jobs_from SITE', "Retrieves data from SITE, e.g. #{AVAILABLE_SITES.sample}"
-    def collect_jobs_from(site)
-      case site
-      when 'we_work_remotely'
-        then Sites::WeWorkRemotely.new.collect_jobs
-      when 'remote_ok'
-        then Sites::RemoteOk.new.collect_jobs
-      when '42jobs_rails'
-        then Sites::JobsRails42.new.collect_jobs
-      else
-        raise "#{site} is not correct. Use: #{AVAILABLE_SITES.join(', ')}."
-      end
-    end
-    desc 'generate_summary', "Merges data from #{AVAILABLE_SITES.join(', ')} and exports to XLS file"
-    def generate_summary
-      Support::SpreadsheetCreator.generate
-    end
+  class << self
+    attr_accessor :configuration
+  end
-    desc 'remove DIRNAME', "Removes DIRNAME (default: 'data'). Use carefully."
-    def remove(dirname = 'data')
-      puts "[Warning!]"
-      puts "This command will remove #{Dir.pwd}/#{dirname} permanently"
-      puts "Press Ctrl-C to abort."
+  def self.configuration
+    @configuration ||= Configuration.new
+  end
-      sleep 3
+  def self.reset
+    @configuration = Configuration.new
+  end
-      FileUtils.rm_rf(dirname)
-      puts "Removed data in #{Dir.pwd}/#{dirname}."
-    end
+  def self.configure
+    yield(configuration)
   end
   def self.root

data/lib/remote_job_scraper/cli.rb ADDED Viewed

@@ -0,0 +1,75 @@
+require "thor"
+module RemoteJobScraper
+  class CLI < Thor
+    AVAILABLE_SITES = %w(we_work_remotely remote_ok 42jobs_rails)
+    desc 'collect_jobs LIMIT DELAY',
+      "Retrieves data from #{AVAILABLE_SITES.join(', ')}.
+       [Example]: remote_job_scraper collect_jobs 10 9.0..10.0
+      "
+    def collect_jobs(limit = nil, delay = nil)
+      limit = limit.to_i
+      limit = limit.zero? ? nil : limit
+      begin
+        unless delay.nil?
+          arr =  delay.split('..').map{ |d| Float(d) }
+          range = arr[0]..arr[1]
+          RemoteJobScraper.configuration.delay_range = range
+        end
+      rescue
+        raise "Passed: DELAY=#{range} DELAY need to be in format: 2.0..5.0 "
+      end
+      [
+        Sites::WeWorkRemotely,
+        Sites::RemoteOk,
+        Sites::JobsRails42
+      ].each do |klass|
+        klass.new.collect_jobs(limit: limit)
+      end
+    end
+    desc 'collect_jobs_from SITE LIMIT',
+      "Retrieves data from SITE with LIMIT, e.g. #{AVAILABLE_SITES.sample}
+       [Example]: remote_job_scraper collect_jobs_from remote_ok 10
+      "
+    def collect_jobs_from(site, limit=nil)
+      limit = limit.to_i
+      limit = limit.zero? ? nil : limit
+      case site
+      when 'we_work_remotely'
+        then Sites::WeWorkRemotely.new.collect_jobs(limit: limit)
+      when 'remote_ok'
+        then Sites::RemoteOk.new.collect_jobs(limit: limit)
+      when '42jobs_rails'
+        then Sites::JobsRails42.new.collect_jobs(limit: limit)
+      else
+        raise "#{site} is not correct. Use: #{AVAILABLE_SITES.join(', ')}."
+      end
+    end
+    desc 'generate_summary',
+      "Merges data from #{AVAILABLE_SITES.join(', ')} and exports to
+      separate sheets in XLS file.
+      "
+    def generate_summary
+      Support::SpreadsheetCreator.generate
+    end
+    desc 'remove DIRNAME', "Removes DIRNAME (default: 'data'). Use carefully."
+    def remove(dirname = 'data')
+      puts "[Warning!]"
+      puts "This command will remove #{Dir.pwd}/#{dirname} permanently"
+      puts "Press Ctrl-C to abort."
+      sleep 3
+      FileUtils.rm_rf(dirname)
+      puts "Removed data in #{Dir.pwd}/#{dirname}."
+    end
+  end
+end

data/lib/remote_job_scraper/configuration.rb ADDED Viewed

@@ -0,0 +1,10 @@
+module RemoteJobScraper
+  class Configuration
+    attr_accessor :delay_range
+    def initialize
+      @delay_range = 0..2.0
+    end
+  end
+end

data/lib/remote_job_scraper/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module RemoteJobScraper
-  VERSION = "0.4.2"
+  VERSION = "0.4.4"
 end

data/lib/sites/base.rb CHANGED Viewed

@@ -1,44 +1,33 @@
 module Sites
   class Base
-    attr_reader :job_type, :doc, :url
+    attr_reader :doc, :url, :rows_count, :jobs_count
-    def initialize(job_type: :programming)
-      @job_type = job_type
-      @url = build_url
+    def initialize
+      @url = "#{self.class::HOST}#{self.class::PATH}"
       @doc = Nokogiri::HTML(open_page(@url))
-      @current_time = Time.new
+      @current_time = Time.now
       @timestamp = @current_time.strftime("%Y%m%d%H%M%S")
-      @count = get_count
+      @rows_count = 0
+      @jobs_count = get_jobs_count
     end
-    def open_page(url)
-      sleep(rand(0..2.0)) unless ENV['RAILS_ENV'] == 'test' # less mechanical behaviour
+    private
-      if ENV['RAILS_ENV'] == 'test'
-        open(url)
-      else
-        open(url, 'User-Agent' => user_agent)
-      end
+    def open_page(url)
+      sleep(rand(delay_range)) unless ENV['RAILS_ENV'] == 'test' # less mechanical behaviour
+      options = ENV['RAILS_ENV'] == 'test' ? {} : { 'User-Agent' => user_agent }
+      open(url, options)
     end
-    private
+    def delay_range
+      RemoteJobScraper.configuration.delay_range
+    end
     def user_agent
       Support::UserAgent::LIST.sample
     end
-    def build_url
-      case job_type
-        when :programming
-          then "#{self.class::HOST}#{self.class::PROGRAMMING}"
-        when :devops
-          then "#{self.class::HOST}#{self.class::DEVOPS}"
-        else
-          raise "Error"
-        end
-    end
     def filepath
       return test_filepath if ENV["RAILS_ENV"] == 'test'
       "#{self.class::STORE_DIR}/#{@timestamp}.csv"

data/lib/sites/elixir_radar.rb CHANGED Viewed

@@ -4,7 +4,7 @@ module Sites
   class ElixirRadar < Base
     HOST = 'http://plataformatec.com.br/'.freeze
-    PROGRAMMING = '/elixir-radar/jobs'.freeze
+    PATH = '/elixir-radar/jobs'.freeze
   end
 end

data/lib/sites/jobs_rails42.rb CHANGED Viewed

@@ -1,62 +1,71 @@
 module Sites
   class JobsRails42 < Base
-    # @TODO/NOTE: There is pagination on this site, it would be cool to find a way
-    # to grab more offers than just first page (25 items)
-    # I had to rename this class because we are not allowed to have numbers
+    # @NOTE: I had to rename this class because we are not allowed to have numbers
     # on the beginning of the class name (42JobsRails won't work).
+    # File paths (e.g. jobs_rails42.rb) follow the same convention.
     HOST = 'https://www.42jobs.io'.freeze
-    PROGRAMMING = '/rails/jobs-remote'.freeze
+    PATH = '/rails/jobs-remote'.freeze
     JOB_ITEM_SELECTOR = 'li.job-offers__item a'.freeze
     STORE_DIR = 'data/jobs_rails42'.freeze
-    def initialize(job_type: :programming, total_pages: 4)
-      @job_type = job_type
-      @url = build_url
-      @doc = nil
-      @current_time = Time.new
+    def initialize
+      @url = "#{self.class::HOST}#{self.class::PATH}"
+      @current_time = Time.now
       @timestamp = @current_time.strftime("%Y%m%d%H%M%S")
-      @total_pages = total_pages
-      @count = get_count
+      @doc = nil
+      @total_pages = 4
+      @rows_count = 0
+      @jobs_count = get_jobs_count
     end
-    def collect_jobs
-      (1..@total_pages).to_a.each do |page|
-        current_page = "#{@url}?page=#{page}"
-        doc = Nokogiri::HTML(open_page(current_page))
-        process_page(doc, current_page, page)
+    def collect_jobs(limit: nil)
+      FileUtils.mkdir_p STORE_DIR
+      (1..@total_pages).each do |page|
+        process_page(page: page, limit: limit)
       end
     end
     private
-    def process_page(doc, page_url, page)
-      puts "[Info] Getting the data from #{page_url} at #{@current_time}..."
-      FileUtils.mkdir_p STORE_DIR
+    def process_page(page:, limit:)
+      current_page = "#{@url}?page=#{page}"
+      doc = Nokogiri::HTML(open_page(current_page))
+      puts "[Info] Getting the data from #{current_page}"
       CSV.open(filepath, 'ab') do |csv|
         doc.css(JOB_ITEM_SELECTOR).each do |link|
+          return if limit == @rows_count
           job_url = "#{HOST}#{link["href"]}"
-          puts "[Info] Processing #{job_url}..."
-          job_page = Nokogiri::HTML(open_page(job_url))
-          offer_text = job_page.css('.job-offer__description').to_s
+          puts "[Info] Parsing #{job_url}..."
-          location = Support::OfferParser.get_location(offer_text)
-          keywords = Support::OfferParser.get_keywords(offer_text)
+          csv << get_row(job_url)
-          csv << [job_url, location, keywords]
+          @rows_count += 1
         end
       end
-      puts "[Done] Collected #{@count} job offers from #{url}. Data stores in: #{filepath}." if page == @total_pages
+      puts "[Done] Collected #{@jobs_count} job offers from #{url}. Data stored in: #{filepath}." if page == @total_pages
     end
-    private
+    def get_row(job_url)
+      job_page = Nokogiri::HTML(open_page(job_url))
+      offer_text = job_page.css('.job-offer__description').to_s
+      location = Support::OfferParser.get_location(offer_text)
+      keywords = Support::OfferParser.get_keywords(offer_text)
+      company = job_page.css('.job-offer__summary a').text
+      [job_url, location, keywords, company]
+    end
-    def get_count
-      25 * @total_pages
+    def get_jobs_count
+      jobs_count = 25 * @total_pages
+      puts "[Info] There are #{jobs_count} remote jobs on [42JobsRails]."
+      jobs_count
     end
   end
 end

data/lib/sites/remote_ok.rb CHANGED Viewed

@@ -4,41 +4,51 @@ module Sites
   class RemoteOk < Base
     HOST = 'https://remoteok.io'.freeze
-    PROGRAMMING = '/remote-dev-jobs'.freeze
+    PATH = '/remote-dev-jobs'.freeze
     JOB_ITEM_SELECTOR = 'tr.job'.freeze
     STORE_DIR = 'data/remote_ok'.freeze
-    def initialize(args = {})
-      super(args = {})
+    def initialize
+      super
     end
-    def collect_jobs
-      puts "[Info] Getting the data from #{url} at #{@current_time}..."
+    def collect_jobs(limit: nil)
+      puts "[Info] Getting the data from #{url}"
       FileUtils.mkdir_p STORE_DIR
       CSV.open(filepath, 'w') do |csv|
         doc.css(JOB_ITEM_SELECTOR).each do |link|
+          return if limit == @rows_count
           job_url = "#{HOST}#{link["data-url"]}"
-          puts "[Info] Processing #{job_url}..."
-          job_page = Nokogiri::HTML(open_page(job_url))
-          offer_text = job_page.css('td.heading').to_s
+          puts "[Info] Parsing #{job_url}..."
-          location = Support::OfferParser.get_location(offer_text)
-          keywords = Support::OfferParser.get_keywords(offer_text)
+          csv << get_row(job_url)
-          csv << [job_url, location, keywords]
+          @rows_count += 1
         end
       end
-      puts "[Done] Collected #{@count} job offers from #{url}. Data stores in: #{filepath}."
+      puts "[Done] Collected #{@rows_count} job offers from #{url}. Data stored in: #{filepath}."
     end
     private
-    def get_count
-      count = doc.css(JOB_ITEM_SELECTOR).map { |link| link['data-url'] }.size
-      puts "[Info] There is #{count} remote jobs available."
-      count
+    def get_row(job_url)
+      job_page = Nokogiri::HTML(open_page(job_url))
+      offer_text = job_page.css('td.heading').to_s
+      location = Support::OfferParser.get_location(offer_text)
+      keywords = Support::OfferParser.get_keywords(offer_text)
+      company = job_page.css('a.companyLink h3').text
+      [job_url, location, keywords, company]
+    end
+    def get_jobs_count
+      jobs_count = doc.css(JOB_ITEM_SELECTOR).map { |link| link['data-url'] }.size
+      puts "[Info] There are #{jobs_count} remote jobs on [RemoteOK]."
+      jobs_count
     end
   end
 end

data/lib/sites/we_work_remotely.rb CHANGED Viewed

@@ -4,49 +4,58 @@ module Sites
   class WeWorkRemotely < Base
     HOST = 'https://weworkremotely.com'.freeze
-    PROGRAMMING = '/categories/remote-programming-jobs'.freeze
+    PATH = '/categories/remote-programming-jobs'.freeze
     DEVOPS     = '/categories/remote-devops-sysadmin-jobs'.freeze
     JOB_ITEM_SELECTOR = '.jobs-container li a'.freeze
     STORE_DIR = 'data/we_work_remotely'
-    def initialize(args = {})
-      super(args = {})
+    def initialize
+      super
     end
-    def collect_jobs
-      puts "[Info] Getting the data from #{url} at #{@current_time}..."
+    def collect_jobs(limit: nil)
+      puts "[Info] Getting the data from #{url}"
       FileUtils.mkdir_p STORE_DIR
       CSV.open(filepath, 'w') do |csv|
         doc.css(JOB_ITEM_SELECTOR).each do |link|
           if link["href"].start_with?("/remote-jobs")
-            job_url = "#{HOST}#{link["href"]}"
-            puts "[Info] Processing #{job_url}..."
-            job_page = Nokogiri::HTML(open_page(job_url))
-            offer_text = job_page.css('.listing-container').to_s
+            return if limit == @rows_count
-            region = job_page.css('span.region').first
-            location = job_page.css('span.location').first
+            job_url = "#{HOST}#{link["href"]}"
+            puts "[Info] Parsing #{job_url}..."
-            keywords = Support::OfferParser.get_keywords(offer_text)
+            csv << get_row(job_url)
-            csv << [job_url, location, region, keywords]
+            @rows_count += 1
           end
         end
       end
-      puts "[Done] Collected #{@count} job offers from #{url}. Data stores in: #{filepath}."
+      puts "[Done] Collected #{@rows_count} job offers from #{url}. Data stored in: #{filepath}."
     end
     private
-    def get_count
-      count = doc.css(JOB_ITEM_SELECTOR)
+    def get_row(job_url)
+      job_page = Nokogiri::HTML(open_page(job_url))
+      offer_text = job_page.css('.listing-container').to_s
+      region = job_page.css('.listing-header-container span.region').first
+      location = job_page.css('.listing-header-container span.location').first
+      keywords = Support::OfferParser.get_keywords(offer_text)
+      company = job_page.css('.listing-header-container span.company').first
+      [job_url, location, region, keywords, company]
+    end
+    def get_jobs_count
+      jobs_count = doc.css(JOB_ITEM_SELECTOR)
         .map { |link| link['href'] }
         .select { |href| href.start_with?('/remote-jobs') }
         .size
-      puts "[Info] There is #{count} remote jobs available."
-      count
+      puts "[Info] There are #{jobs_count} remote jobs on [WeWorkRemotely]."
+      jobs_count
     end
   end
 end

data/lib/support/offer_parser.rb CHANGED Viewed

@@ -19,11 +19,11 @@ module Support
       indexes.each do |index|
         next if index[0].nil?
-        locations << tokens[index[0] + 1].gsub(',', '') if index[1] == 'location'
-        locations << tokens[index[0] - 1].gsub(',', '') if index[1] == 'based'
+        locations << tokens[index[0] + 1] if index[1] == 'location'
+        locations << tokens[index[0] - 1..index[0] + 2] if index[1] == 'based'
       end
-      locations.join(', ').capitalize
+      locations.join(' ').capitalize
     end
     def self.get_keywords(content, keywords = KEYWORDS)
@@ -42,9 +42,7 @@ module Support
     def self.get_tokens(content)
       content
-        .gsub('.', '')
-        .gsub(',', '')
-        .gsub(':', '')
+        .gsub(/\W+/, ' ') # remove non letters
         .downcase
         .split(/[\s-]/)
     end

data/lib/support/spreadsheet_creator.rb CHANGED Viewed

@@ -28,7 +28,8 @@ module Support
         0 => 90,
         1 => 20,
         2 => 20,
-        3 => 20
+        3 => 20,
+        4 => 20
       }
     end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: remote_job_scraper
 version: !ruby/object:Gem::Version
-  version: 0.4.2
+  version: 0.4.4
 platform: ruby
 authors:
 - Rafał Trojanowski
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2019-01-28 00:00:00.000000000 Z
+date: 2019-01-30 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -162,6 +162,8 @@ files:
 - bin/setup
 - exe/remote_job_scraper
 - lib/remote_job_scraper.rb
+- lib/remote_job_scraper/cli.rb
+- lib/remote_job_scraper/configuration.rb
 - lib/remote_job_scraper/version.rb
 - lib/sites/base.rb
 - lib/sites/elixir_radar.rb