remote_job_scraper 0.5.0 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8f12102d94abaccc92c800bb21d459fddce99380
4
- data.tar.gz: 2af9bb36f83a766ab3ce90b3a9473b4cb1166683
3
+ metadata.gz: efcddd08a1843eaa0509f9bdd853ae47476df9db
4
+ data.tar.gz: 7bb7b71b57b3d8b6b1b8255320092849e7ff7b97
5
5
  SHA512:
6
- metadata.gz: af613f9bff539e13a3585d80801e1983792e6d2125e0322526c411ee3a58ad8eb9ea955e88b07847e51a151e2eb072a7e4697b2141111ccadf5de047ad81bf46
7
- data.tar.gz: a49fb2f1e599c3df9d49aa4c5be63d3930b43873c11a33465ad447fdc0fe7c7f33030c1c1f7d73fae0faf49ef7fa2fe5205262a9186c8260eec468265486a0fe
6
+ metadata.gz: 03bab4b51e95b76ab697d58577843c56b5b8ce0a2beaac21946d96e11babbe21ec0d410a46dfaf400e540cee70ac7546e084f624dd2b7d7dfe5dcf74be4dd966
7
+ data.tar.gz: 3e1f3570fe6766b5b20a8acd5cd1f01768da398d9a7de759caf2efa5a0c459c6641f3f3fb81919a6b47cec0df102de7d3e0b5944eacc9664ef6af97c57945724
data/.gitignore CHANGED
@@ -11,3 +11,4 @@ data/
11
11
  # rspec failure tracking
12
12
  .rspec_status
13
13
  .DS_Store
14
+ .byebug_history
data/README.md CHANGED
@@ -40,6 +40,12 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
40
40
 
41
41
  To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
42
42
 
43
+ ## Debugging
44
+ ```
45
+ require 'byebug'
46
+ byebug
47
+ ```
48
+
43
49
  ## Contributing
44
50
 
45
51
  Bug reports and pull requests are welcome on GitHub at https://github.com/rafaltrojanowski/remote_job_scraper. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
@@ -6,6 +6,7 @@ require 'sites/we_work_remotely'
6
6
  require 'sites/remote_ok'
7
7
  require 'sites/jobs_rails42'
8
8
  require 'sites/github_remote_jobs'
9
+ require 'sites/elixir_companies'
9
10
 
10
11
  require 'support/offer_parser'
11
12
  require 'support/user_agent'
@@ -5,14 +5,22 @@ module RemoteJobScraper
5
5
 
6
6
  AVAILABLE_SITES = %w(we_work_remotely remote_ok 42jobs_rails)
7
7
 
8
- desc 'collect_companies',
8
+ desc 'collect_remote_companies',
9
9
  "Retrieves remote companies
10
- [Example]: remote_job_scraper collect_companies
10
+ [Example]: remote_job_scraper collect_remote_companies
11
11
  "
12
- def collect_companies
12
+ def collect_remote_companies
13
13
  Sites::GithubRemoteJobs.new.collect_companies
14
14
  end
15
15
 
16
+ desc 'collect_elixir_companies',
17
+ "Retrieves Elixir companies
18
+ [Example]: remote_job_scraper collect_elixir_companies
19
+ "
20
+ def collect_elixir_companies
21
+ Sites::ElixirCompanies.new.collect_companies
22
+ end
23
+
16
24
  desc 'collect_jobs LIMIT DELAY',
17
25
  "Retrieves data from #{AVAILABLE_SITES.join(', ')}.
18
26
  [Example]: remote_job_scraper collect_jobs 10 9.0..10.0
@@ -1,3 +1,3 @@
1
1
  module RemoteJobScraper
2
- VERSION = "0.5.0"
2
+ VERSION = "0.6.0"
3
3
  end
@@ -0,0 +1,87 @@
1
+ require_relative 'base'
2
+
3
+ module Sites
4
+ class ElixirCompanies < Base
5
+
6
+ HOST = 'https://elixir-companies.com'.freeze
7
+ PATH = '/en/browse'.freeze
8
+ JOB_ITEM_SELECTOR = 'div.company.box'.freeze
9
+ STORE_DIR = 'data/elixir_companies'.freeze
10
+
11
+ def initialize
12
+ @url = "#{self.class::HOST}#{self.class::PATH}"
13
+ @current_time = Time.now
14
+ @timestamp = @current_time.strftime("%Y%m%d%H%M%S")
15
+ @doc = nil
16
+ @total_pages = 26
17
+ @rows_count = 0
18
+ @jobs_count = get_jobs_count
19
+ end
20
+
21
+ def collect_companies(limit: nil)
22
+ FileUtils.mkdir_p STORE_DIR
23
+
24
+ (1..@total_pages).each do |page|
25
+ process_page(page: page, limit: limit)
26
+ end
27
+ end
28
+
29
+ def companies_count
30
+ @rows_count
31
+ end
32
+
33
+ private
34
+
35
+ def process_page(page:, limit:)
36
+ current_page = "#{@url}?page=#{page}"
37
+ doc = Nokogiri::HTML(open_page(current_page))
38
+ puts "[Info] Getting the data from #{current_page}"
39
+
40
+ CSV.open(filepath, 'ab') do |csv|
41
+ doc.css(JOB_ITEM_SELECTOR).each do |company_box|
42
+ return if limit == @rows_count
43
+ csv << get_row(company_box)
44
+ @rows_count += 1
45
+ end
46
+ end
47
+
48
+ puts "[Done] Collected #{@jobs_count} job offers from #{url}. Data stored in: #{filepath}." if page == @total_pages
49
+ end
50
+
51
+ def get_row(company_box)
52
+ company_title = company_box.css('div.content p.title').text
53
+ company_info = company_box.css('div.content.company-info p')
54
+
55
+ # A slightly ugly way to get the data between the span elements
56
+ array = company_info.text.split("\n").select do |element|
57
+ element =~ /[a-zA-Z]/
58
+ end.map!(&:strip).delete_if do |element|
59
+ element == "GitHub" || element == "Add a job"
60
+ end
61
+
62
+ has_blog = array[2] && (array[2].include?("/") || array[2].include?("blog"))
63
+
64
+ industry = array[0]
65
+ company_website = array[1]
66
+ is_hiring = company_box["class"].include?("has-ribbon")
67
+
68
+ if has_blog
69
+ blog = array[2]
70
+ location = array[4]
71
+ else
72
+ blog = nil
73
+ location = array[2]
74
+ end
75
+
76
+ row = [company_title, industry, company_website, blog, location]
77
+ hiring = is_hiring ? "Hiring!" : nil
78
+ row.push hiring
79
+ end
80
+
81
+ def get_jobs_count
82
+ jobs_count = 16 * @total_pages # roughly - first page has 14 items
83
+ puts "[Info] There are #{jobs_count} remote jobs on [ElixirCompanies]."
84
+ jobs_count
85
+ end
86
+ end
87
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: remote_job_scraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.6.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Rafał Trojanowski
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2019-06-07 00:00:00.000000000 Z
11
+ date: 2019-08-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri
@@ -166,6 +166,7 @@ files:
166
166
  - lib/remote_job_scraper/configuration.rb
167
167
  - lib/remote_job_scraper/version.rb
168
168
  - lib/sites/base.rb
169
+ - lib/sites/elixir_companies.rb
169
170
  - lib/sites/elixir_radar.rb
170
171
  - lib/sites/github_remote_jobs.rb
171
172
  - lib/sites/jobs_rails42.rb