remote_job_scraper 0.5.0 → 0.6.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8f12102d94abaccc92c800bb21d459fddce99380
4
- data.tar.gz: 2af9bb36f83a766ab3ce90b3a9473b4cb1166683
3
+ metadata.gz: efcddd08a1843eaa0509f9bdd853ae47476df9db
4
+ data.tar.gz: 7bb7b71b57b3d8b6b1b8255320092849e7ff7b97
5
5
  SHA512:
6
- metadata.gz: af613f9bff539e13a3585d80801e1983792e6d2125e0322526c411ee3a58ad8eb9ea955e88b07847e51a151e2eb072a7e4697b2141111ccadf5de047ad81bf46
7
- data.tar.gz: a49fb2f1e599c3df9d49aa4c5be63d3930b43873c11a33465ad447fdc0fe7c7f33030c1c1f7d73fae0faf49ef7fa2fe5205262a9186c8260eec468265486a0fe
6
+ metadata.gz: 03bab4b51e95b76ab697d58577843c56b5b8ce0a2beaac21946d96e11babbe21ec0d410a46dfaf400e540cee70ac7546e084f624dd2b7d7dfe5dcf74be4dd966
7
+ data.tar.gz: 3e1f3570fe6766b5b20a8acd5cd1f01768da398d9a7de759caf2efa5a0c459c6641f3f3fb81919a6b47cec0df102de7d3e0b5944eacc9664ef6af97c57945724
data/.gitignore CHANGED
@@ -11,3 +11,4 @@ data/
11
11
  # rspec failure tracking
12
12
  .rspec_status
13
13
  .DS_Store
14
+ .byebug_history
data/README.md CHANGED
@@ -40,6 +40,12 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
40
40
 
41
41
  To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
42
42
 
43
+ ## Debugging
44
+ ```ruby
45
+ require 'byebug'
46
+ byebug
47
+ ```
48
+
43
49
  ## Contributing
44
50
 
45
51
  Bug reports and pull requests are welcome on GitHub at https://github.com/rafaltrojanowski/remote_job_scraper. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
@@ -6,6 +6,7 @@ require 'sites/we_work_remotely'
6
6
  require 'sites/remote_ok'
7
7
  require 'sites/jobs_rails42'
8
8
  require 'sites/github_remote_jobs'
9
+ require 'sites/elixir_companies'
9
10
 
10
11
  require 'support/offer_parser'
11
12
  require 'support/user_agent'
@@ -5,14 +5,22 @@ module RemoteJobScraper
5
5
 
6
6
  AVAILABLE_SITES = %w(we_work_remotely remote_ok 42jobs_rails)
7
7
 
8
- desc 'collect_companies',
8
+ desc 'collect_remote_companies',
9
9
  "Retrieves remote companies
10
- [Example]: remote_job_scraper collect_companies
10
+ [Example]: remote_job_scraper collect_remote_companies
11
11
  "
12
- def collect_companies
12
+ def collect_remote_companies
13
13
  Sites::GithubRemoteJobs.new.collect_companies
14
14
  end
15
15
 
16
+ desc 'collect_elixir_companies',
17
+ "Retrieves Elixir companies
18
+ [Example]: remote_job_scraper collect_elixir_companies
19
+ "
20
+ def collect_elixir_companies
21
+ Sites::ElixirCompanies.new.collect_companies
22
+ end
23
+
16
24
  desc 'collect_jobs LIMIT DELAY',
17
25
  "Retrieves data from #{AVAILABLE_SITES.join(', ')}.
18
26
  [Example]: remote_job_scraper collect_jobs 10 9.0..10.0
@@ -1,3 +1,3 @@
1
1
  module RemoteJobScraper
2
- VERSION = "0.5.0"
2
+ VERSION = "0.6.0"
3
3
  end
@@ -0,0 +1,87 @@
1
+ require_relative 'base'
2
+
3
+ module Sites
4
+ class ElixirCompanies < Base
5
+
6
+ HOST = 'https://elixir-companies.com'.freeze
7
+ PATH = '/en/browse'.freeze
8
+ JOB_ITEM_SELECTOR = 'div.company.box'.freeze
9
+ STORE_DIR = 'data/elixir_companies'.freeze
10
+
11
+ def initialize
12
+ @url = "#{self.class::HOST}#{self.class::PATH}"
13
+ @current_time = Time.now
14
+ @timestamp = @current_time.strftime("%Y%m%d%H%M%S")
15
+ @doc = nil
16
+ @total_pages = 26
17
+ @rows_count = 0
18
+ @jobs_count = get_jobs_count
19
+ end
20
+
21
+ def collect_companies(limit: nil)
22
+ FileUtils.mkdir_p STORE_DIR
23
+
24
+ (1..@total_pages).each do |page|
25
+ process_page(page: page, limit: limit)
26
+ end
27
+ end
28
+
29
+ def companies_count
30
+ @rows_count
31
+ end
32
+
33
+ private
34
+
35
+ def process_page(page:, limit:)
36
+ current_page = "#{@url}?page=#{page}"
37
+ doc = Nokogiri::HTML(open_page(current_page))
38
+ puts "[Info] Getting the data from #{current_page}"
39
+
40
+ CSV.open(filepath, 'ab') do |csv|
41
+ doc.css(JOB_ITEM_SELECTOR).each do |company_box|
42
+ return if limit == @rows_count
43
+ csv << get_row(company_box)
44
+ @rows_count += 1
45
+ end
46
+ end
47
+
48
+ puts "[Done] Collected #{@jobs_count} job offers from #{url}. Data stored in: #{filepath}." if page == @total_pages
49
+ end
50
+
51
+ def get_row(company_box)
52
+ company_title = company_box.css('div.content p.title').text
53
+ company_info = company_box.css('div.content.company-info p')
54
+
55
+ # A somewhat hacky way to extract the text that sits between the span elements
56
+ array = company_info.text.split("\n").select do |element|
57
+ element =~ /[a-zA-Z]/
58
+ end.map!(&:strip).delete_if do |element|
59
+ element == "GitHub" || element == "Add a job"
60
+ end
61
+
62
+ has_blog = array[2] && (array[2].include?("/") || array[2].include?("blog"))
63
+
64
+ industry = array[0]
65
+ company_website = array[1]
66
+ is_hiring = company_box["class"].include?("has-ribbon")
67
+
68
+ if has_blog
69
+ blog = array[2]
70
+ location = array[4]
71
+ else
72
+ blog = nil
73
+ location = array[2]
74
+ end
75
+
76
+ row = [company_title, industry, company_website, blog, location]
77
+ hiring = is_hiring ? "Hiring!" : nil
78
+ row.push hiring
79
+ end
80
+
81
+ def get_jobs_count
82
+ jobs_count = 16 * @total_pages # roughly - first page has 14 items
83
+ puts "[Info] There are #{jobs_count} remote jobs on [ElixirCompanies]."
84
+ jobs_count
85
+ end
86
+ end
87
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: remote_job_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rafał Trojanowski
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-06-07 00:00:00.000000000 Z
11
+ date: 2019-08-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -166,6 +166,7 @@ files:
166
166
  - lib/remote_job_scraper/configuration.rb
167
167
  - lib/remote_job_scraper/version.rb
168
168
  - lib/sites/base.rb
169
+ - lib/sites/elixir_companies.rb
169
170
  - lib/sites/elixir_radar.rb
170
171
  - lib/sites/github_remote_jobs.rb
171
172
  - lib/sites/jobs_rails42.rb