remote_job_scraper 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +13 -0
- data/.rspec +3 -0
- data/.ruby-version +1 -0
- data/.travis.yml +5 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +61 -0
- data/LICENSE.txt +21 -0
- data/README.md +49 -0
- data/Rakefile +6 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/exe/remote_job_scraper +5 -0
- data/lib/remote_job_scraper/version.rb +3 -0
- data/lib/remote_job_scraper.rb +64 -0
- data/lib/sites/base.rb +62 -0
- data/lib/sites/elixir_radar.rb +10 -0
- data/lib/sites/jobs_rails42.rb +64 -0
- data/lib/sites/rails_jobs.rb +9 -0
- data/lib/sites/remote_ok.rb +45 -0
- data/lib/sites/we_work_remotely.rb +52 -0
- data/lib/support/offer_parser.rb +52 -0
- data/lib/support/spreadsheet_creator.rb +40 -0
- data/lib/support/user_agent.rb +7486 -0
- data/remote_job_scraper.gemspec +48 -0
- metadata +201 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: debb2440ac5898b2fc0642017b9bf01aa6adb89f
|
4
|
+
data.tar.gz: e370546575983d780c9ed6aa28d7611270b08b86
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: '0844c2a152641653717d587829e8a73aca58292afb175e014a4edc08386ff009de153703fc37d8d271cfd5919441a73a25b5f6ee79c2dda2cd375d1bccbe2e70'
|
7
|
+
data.tar.gz: cfd1916966d735458d11f4953f0ec26932833e7c03fb028890bfb3f6226ff30c0ce1c58f5a733597f070d3b185dca3597abcacf8f3aaa6befc690cb699366cd5
|
data/.gitignore
ADDED
data/.rspec
ADDED
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.4.1
|
data/.travis.yml
ADDED
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,74 @@
|
|
1
|
+
# Contributor Covenant Code of Conduct
|
2
|
+
|
3
|
+
## Our Pledge
|
4
|
+
|
5
|
+
In the interest of fostering an open and welcoming environment, we as
|
6
|
+
contributors and maintainers pledge to making participation in our project and
|
7
|
+
our community a harassment-free experience for everyone, regardless of age, body
|
8
|
+
size, disability, ethnicity, gender identity and expression, level of experience,
|
9
|
+
nationality, personal appearance, race, religion, or sexual identity and
|
10
|
+
orientation.
|
11
|
+
|
12
|
+
## Our Standards
|
13
|
+
|
14
|
+
Examples of behavior that contributes to creating a positive environment
|
15
|
+
include:
|
16
|
+
|
17
|
+
* Using welcoming and inclusive language
|
18
|
+
* Being respectful of differing viewpoints and experiences
|
19
|
+
* Gracefully accepting constructive criticism
|
20
|
+
* Focusing on what is best for the community
|
21
|
+
* Showing empathy towards other community members
|
22
|
+
|
23
|
+
Examples of unacceptable behavior by participants include:
|
24
|
+
|
25
|
+
* The use of sexualized language or imagery and unwelcome sexual attention or
|
26
|
+
advances
|
27
|
+
* Trolling, insulting/derogatory comments, and personal or political attacks
|
28
|
+
* Public or private harassment
|
29
|
+
* Publishing others' private information, such as a physical or electronic
|
30
|
+
address, without explicit permission
|
31
|
+
* Other conduct which could reasonably be considered inappropriate in a
|
32
|
+
professional setting
|
33
|
+
|
34
|
+
## Our Responsibilities
|
35
|
+
|
36
|
+
Project maintainers are responsible for clarifying the standards of acceptable
|
37
|
+
behavior and are expected to take appropriate and fair corrective action in
|
38
|
+
response to any instances of unacceptable behavior.
|
39
|
+
|
40
|
+
Project maintainers have the right and responsibility to remove, edit, or
|
41
|
+
reject comments, commits, code, wiki edits, issues, and other contributions
|
42
|
+
that are not aligned to this Code of Conduct, or to ban temporarily or
|
43
|
+
permanently any contributor for other behaviors that they deem inappropriate,
|
44
|
+
threatening, offensive, or harmful.
|
45
|
+
|
46
|
+
## Scope
|
47
|
+
|
48
|
+
This Code of Conduct applies both within project spaces and in public spaces
|
49
|
+
when an individual is representing the project or its community. Examples of
|
50
|
+
representing a project or community include using an official project e-mail
|
51
|
+
address, posting via an official social media account, or acting as an appointed
|
52
|
+
representative at an online or offline event. Representation of a project may be
|
53
|
+
further defined and clarified by project maintainers.
|
54
|
+
|
55
|
+
## Enforcement
|
56
|
+
|
57
|
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
58
|
+
reported by contacting the project team at rt.trojanowski@gmail.com. All
|
59
|
+
complaints will be reviewed and investigated and will result in a response that
|
60
|
+
is deemed necessary and appropriate to the circumstances. The project team is
|
61
|
+
obligated to maintain confidentiality with regard to the reporter of an incident.
|
62
|
+
Further details of specific enforcement policies may be posted separately.
|
63
|
+
|
64
|
+
Project maintainers who do not follow or enforce the Code of Conduct in good
|
65
|
+
faith may face temporary or permanent repercussions as determined by other
|
66
|
+
members of the project's leadership.
|
67
|
+
|
68
|
+
## Attribution
|
69
|
+
|
70
|
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
|
71
|
+
available at [http://contributor-covenant.org/version/1/4][version]
|
72
|
+
|
73
|
+
[homepage]: http://contributor-covenant.org
|
74
|
+
[version]: http://contributor-covenant.org/version/1/4/
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
remote_job_scraper (0.1.0)
|
5
|
+
nokogiri
|
6
|
+
spreadsheet
|
7
|
+
thor
|
8
|
+
|
9
|
+
GEM
|
10
|
+
remote: https://rubygems.org/
|
11
|
+
specs:
|
12
|
+
addressable (2.6.0)
|
13
|
+
public_suffix (>= 2.0.2, < 4.0)
|
14
|
+
byebug (10.0.2)
|
15
|
+
crack (0.4.3)
|
16
|
+
safe_yaml (~> 1.0.0)
|
17
|
+
diff-lcs (1.3)
|
18
|
+
hashdiff (0.3.8)
|
19
|
+
mini_portile2 (2.4.0)
|
20
|
+
nokogiri (1.10.1)
|
21
|
+
mini_portile2 (~> 2.4.0)
|
22
|
+
public_suffix (3.0.3)
|
23
|
+
rake (10.5.0)
|
24
|
+
rspec (3.8.0)
|
25
|
+
rspec-core (~> 3.8.0)
|
26
|
+
rspec-expectations (~> 3.8.0)
|
27
|
+
rspec-mocks (~> 3.8.0)
|
28
|
+
rspec-core (3.8.0)
|
29
|
+
rspec-support (~> 3.8.0)
|
30
|
+
rspec-expectations (3.8.1)
|
31
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
32
|
+
rspec-support (~> 3.8.0)
|
33
|
+
rspec-mocks (3.8.0)
|
34
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
35
|
+
rspec-support (~> 3.8.0)
|
36
|
+
rspec-support (3.8.0)
|
37
|
+
ruby-ole (1.2.12.1)
|
38
|
+
safe_yaml (1.0.4)
|
39
|
+
spreadsheet (1.1.8)
|
40
|
+
ruby-ole (>= 1.0)
|
41
|
+
thor (0.20.3)
|
42
|
+
vcr (4.0.0)
|
43
|
+
webmock (3.5.1)
|
44
|
+
addressable (>= 2.3.6)
|
45
|
+
crack (>= 0.3.2)
|
46
|
+
hashdiff
|
47
|
+
|
48
|
+
PLATFORMS
|
49
|
+
ruby
|
50
|
+
|
51
|
+
DEPENDENCIES
|
52
|
+
bundler (~> 1.16)
|
53
|
+
byebug
|
54
|
+
rake (~> 10.0)
|
55
|
+
remote_job_scraper!
|
56
|
+
rspec (~> 3.0)
|
57
|
+
vcr
|
58
|
+
webmock
|
59
|
+
|
60
|
+
BUNDLED WITH
|
61
|
+
1.16.5
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2019 Rafał Trojanowski
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
# RemoteJobScraper 💻🌏
|
2
|
+
|
3
|
+
Ruby gem that collects job offers for remote positions with ease.
|
4
|
+
|
5
|
+
Going through many job listings and finding the right one can be a time-consuming process. That's why this tool was built. It lets you automate the process, retrieve the necessary data and store it in a CSV/Excel file in just a few minutes. The main focus is to inform the user about the location (time zone) required for a position.
|
6
|
+
|
7
|
+
|
8
|
+
## Installation
|
9
|
+
|
10
|
+
Add this line to your application's Gemfile:
|
11
|
+
|
12
|
+
```ruby
|
13
|
+
gem 'remote_job_scraper'
|
14
|
+
```
|
15
|
+
|
16
|
+
And then execute:
|
17
|
+
|
18
|
+
$ bundle
|
19
|
+
|
20
|
+
Or install it yourself as:
|
21
|
+
|
22
|
+
$ gem install remote_job_scraper
|
23
|
+
|
24
|
+
## Usage
|
25
|
+
|
26
|
+
$ bundle exec exe/remote_job_scraper
|
27
|
+
|
28
|
+
|
29
|
+
* Tested with Ruby versions:
|
30
|
+
|
31
|
+
* [x] 2.4.1
|
32
|
+
|
33
|
+
## Development
|
34
|
+
|
35
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
36
|
+
|
37
|
+
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
38
|
+
|
39
|
+
## Contributing
|
40
|
+
|
41
|
+
Bug reports and pull requests are welcome on GitHub at https://github.com/[USERNAME]/remote_job_scraper. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
42
|
+
|
43
|
+
## License
|
44
|
+
|
45
|
+
The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).
|
46
|
+
|
47
|
+
## Code of Conduct
|
48
|
+
|
49
|
+
Everyone interacting in the RemoteJobScraper project’s codebases, issue trackers, chat rooms and mailing lists is expected to follow the [code of conduct](https://github.com/[USERNAME]/remote_job_scraper/blob/master/CODE_OF_CONDUCT.md).
|
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Interactive console for experimenting with the gem: Bundler's load paths are
# set up first, then the gem itself is loaded.
require 'bundler/setup'
require 'remote_job_scraper'

# Fixtures and/or initialization code that make experimenting easier can be
# added here. A different console may be used as well, for example Pry
# (remember to add pry to the Gemfile first):
#
#   require "pry"
#   Pry.start

require 'irb'
IRB.start(__FILE__)
|
data/bin/setup
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
require 'remote_job_scraper/version'
|
2
|
+
|
3
|
+
require 'sites/we_work_remotely'
|
4
|
+
require 'sites/remote_ok'
|
5
|
+
require 'sites/jobs_rails42'
|
6
|
+
|
7
|
+
require 'support/offer_parser'
|
8
|
+
require 'support/user_agent'
|
9
|
+
require 'support/spreadsheet_creator'
|
10
|
+
|
11
|
+
require 'nokogiri'
|
12
|
+
require 'open-uri'
|
13
|
+
require 'csv'
|
14
|
+
require "thor"
|
15
|
+
|
16
|
+
# Top-level namespace of the gem. It hosts the Thor-based command line
# interface and a helper that returns the gem's root directory.
module RemoteJobScraper

  # Service names accepted by the `collect_jobs_from` command.
  AVAILABLE_SERVICES = %w(we_work_remotely remote_ok 42jobs_rails)

  # Commands exposed on the command line through Thor.
  class CLI < Thor

    desc 'collect_jobs', 'Retrieves data from all sites'
    # Runs the We Work Remotely and Remote OK scrapers one after another.
    # NOTE(review): Sites::JobsRails42 is listed in AVAILABLE_SERVICES but is
    # not run here -- confirm whether leaving it out is intentional.
    def collect_jobs
      [Sites::WeWorkRemotely, Sites::RemoteOk].each do |site_class|
        site_class.new.collect_jobs
      end
    end

    # The usage string names the required argument so that `help` shows it.
    desc 'collect_jobs_from SERVICE_NAME', 'Retrieves data from specified service'
    # Runs the scraper that matches +service_name+.
    #
    # @param service_name [String] one of AVAILABLE_SERVICES
    # @raise [RuntimeError] when the name is not a known service
    def collect_jobs_from(service_name)
      site_class =
        case service_name
        when 'we_work_remotely' then Sites::WeWorkRemotely
        when 'remote_ok' then Sites::RemoteOk
        when '42jobs_rails' then Sites::JobsRails42
        else
          raise "#{service_name} is not correct. Use: #{AVAILABLE_SERVICES.join(', ')}."
        end

      site_class.new.collect_jobs
    end

    desc 'generate_summary', 'Collect all data and export to XLS file'
    # Delegates to Support::SpreadsheetCreator.generate.
    def generate_summary
      Support::SpreadsheetCreator.generate
    end

    desc 'clean_up', 'Removes all stored data'
    # Deletes the `data` directory (relative to the current working directory)
    # after a two second grace period in which the user can abort with Ctrl-C.
    def clean_up
      puts "This command will remove all stored data."
      puts "Press Ctrl-C to abort."

      sleep 2

      FileUtils.rm_rf('data')
      puts "Removed data."
    end

  end

  # @return [String] the parent directory of the directory holding this file
  def self.root
    File.dirname(__dir__)
  end
end
|
data/lib/sites/base.rb
ADDED
@@ -0,0 +1,62 @@
|
|
1
|
+
module Sites
  # Behaviour shared by the job-board scrapers.
  #
  # A subclass supplies the constants read here -- HOST, PROGRAMMING, STORE_DIR
  # and, for the :devops job type, DEVOPS -- plus a +get_count+ method, which
  # #initialize calls but this class does not define.
  class Base

    attr_reader :job_type, :doc, :url

    # Builds the listing URL for +job_type+, downloads and parses that page and
    # records the start time that names the output file.
    #
    # @param job_type [Symbol] :programming or :devops
    # @raise [RuntimeError] when +job_type+ is neither of the two
    def initialize(job_type: :programming)
      @job_type = job_type
      @url = build_url
      @doc = Nokogiri::HTML(open_page(@url))
      @current_time = Time.new
      @timestamp = @current_time.strftime("%Y%m%d%H%M%S")
      @count = get_count
    end

    # Opens +url+ through open-uri and returns the response object.
    #
    # The URI object's own #open is used instead of Kernel#open: open-uri's
    # Kernel#open patch was removed in Ruby 3.0, while OpenURI::OpenRead#open
    # works on old and new Rubies alike.
    #
    # Outside the test environment the request is delayed by a random 0-2
    # seconds and sent with a randomly chosen User-Agent header.
    #
    # @param url [String] absolute http(s) URL
    def open_page(url)
      page = URI.parse(url)
      return page.open if test_env?

      sleep(rand(0..2.0)) # less mechanical behaviour
      page.open('User-Agent' => user_agent)
    end

    private

    # True when RAILS_ENV is set to "test".
    def test_env?
      ENV['RAILS_ENV'] == 'test'
    end

    # Picks a random entry from Support::UserAgent::LIST.
    def user_agent
      Support::UserAgent::LIST.sample
    end

    # Joins the subclass' HOST with the path constant matching +job_type+.
    def build_url
      case job_type
      when :programming
        "#{self.class::HOST}#{self.class::PROGRAMMING}"
      when :devops
        "#{self.class::HOST}#{self.class::DEVOPS}"
      else
        raise "Unsupported job type: #{job_type.inspect}. Use :programming or :devops."
      end
    end

    # Path of the CSV file for this run; a fixtures path in the test environment.
    def filepath
      return test_filepath if test_env?

      "#{self.class::STORE_DIR}/#{@timestamp}.csv"
    end

    # Fixture path derived from the underscored, un-namespaced class name.
    def test_filepath
      "spec/fixtures/data/#{underscore(self.class.name.split('::').last)}/#{@timestamp}.csv"
    end

    # Converts CamelCase to snake_case (https://stackoverflow.com/a/5622585).
    # The argument itself is left untouched.
    def underscore(camel_cased_word)
      camel_cased_word
        .gsub(/::/, '/')
        .gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2')
        .gsub(/([a-z\d])([A-Z])/, '\1_\2')
        .tr("-", "_")
        .downcase
    end
  end
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
# Loaded explicitly, like the sibling scrapers do, so that this file does not
# depend on another file having required Sites::Base first.
require_relative 'base'

module Sites
  # Scraper for the remote Rails listings on 42jobs.io.
  #
  # The class is called JobsRails42 because a Ruby constant cannot start with
  # a digit (42JobsRails won't work).
  class JobsRails42 < Base

    HOST = 'https://www.42jobs.io'.freeze
    PROGRAMMING = '/rails/jobs-remote'.freeze
    JOB_ITEM_SELECTOR = 'li.job-offers__item a'.freeze
    STORE_DIR = 'data/jobs_rails42'.freeze

    # The listing is paginated; this many pages are requested per run.
    NUMBER_OF_PAGES = 10

    # Same setup as Sites::Base#initialize, except that no document is fetched
    # here: every listing page is downloaded later by #collect_jobs.
    #
    # @param job_type [Symbol] only :programming has a path constant in this class
    def initialize(job_type: :programming)
      @job_type = job_type
      @url = build_url
      @doc = nil
      @current_time = Time.new
      @timestamp = @current_time.strftime("%Y%m%d%H%M%S")
      @count = get_count
    end

    # Walks pages 1..NUMBER_OF_PAGES of the listing, appends one CSV row per
    # offer to the file at +filepath+, and finally reports how many rows were
    # actually written.
    def collect_jobs
      # Create the directory the file is really written to; outside the test
      # environment this is STORE_DIR.
      FileUtils.mkdir_p File.dirname(filepath)

      collected = 0
      (1..NUMBER_OF_PAGES).each do |page|
        page_url = "#{@url}?page=#{page}"
        collected += process_page(Nokogiri::HTML(open_page(page_url)), page_url)
      end

      puts "[Done] Collected #{collected} job offers from #{url}. Data stores in: #{filepath}."
    end

    private

    # Appends a row (URL, location, region, keywords) for every offer linked
    # from the listing page +doc+.
    #
    # @return [Integer] number of offers found on the page
    def process_page(doc, page_url)
      puts "[Info] Getting the data from #{page_url} at #{@current_time}..."
      links = doc.css(JOB_ITEM_SELECTOR)

      CSV.open(filepath, 'ab') do |csv|
        links.each do |link|
          job_url = "#{HOST}#{link["href"]}"
          puts "[Info] Processing #{job_url}..."
          offer_text = Nokogiri::HTML(open_page(job_url)).css('.job-offer__description').to_s

          location = Support::OfferParser.get_location(offer_text)
          region = nil # no region is extracted for this site
          keywords = Support::OfferParser.get_keywords(offer_text)

          csv << [job_url, location, region, keywords]
        end
      end

      links.size
    end

    # Expected number of offers, assuming 25 items on each listing page.
    def get_count
      25 * NUMBER_OF_PAGES
    end
  end
end
|
@@ -0,0 +1,45 @@
|
|
1
|
+
require_relative 'base'
|
2
|
+
|
3
|
+
module Sites
  # Scraper for the developer job listing on remoteok.io.
  class RemoteOk < Base

    HOST = 'https://remoteok.io'.freeze
    PROGRAMMING = '/remote-dev-jobs'.freeze
    JOB_ITEM_SELECTOR = 'tr.job'.freeze
    STORE_DIR = 'data/remote_ok'.freeze

    # Forwards the options to Sites::Base#initialize as keyword arguments.
    #
    # The previous `super(args = {})` overwrote the caller's options with an
    # empty hash and handed it over positionally, which Ruby 3 rejects for a
    # keyword-only method.
    #
    # @param args [Hash] options such as +job_type:+; this class defines no
    #   DEVOPS path, so only :programming resolves to a URL
    def initialize(args = {})
      super(**args)
    end

    # Visits every job linked from the listing page and writes one CSV row per
    # offer (URL, location, region, keywords) to the file at +filepath+.
    def collect_jobs
      puts "[Info] Getting the data from #{url} at #{@current_time}..."
      # Create the directory the file is really written to; outside the test
      # environment this is STORE_DIR.
      FileUtils.mkdir_p File.dirname(filepath)

      CSV.open(filepath, 'w') do |csv|
        doc.css(JOB_ITEM_SELECTOR).each do |row|
          job_url = "#{HOST}#{row["data-url"]}"
          puts "[Info] Processing #{job_url}..."
          offer_text = Nokogiri::HTML(open_page(job_url)).css('td.heading').to_s

          location = Support::OfferParser.get_location(offer_text)
          region = nil # no region is extracted for this site
          keywords = Support::OfferParser.get_keywords(offer_text)

          csv << [job_url, location, region, keywords]
        end
      end

      puts "[Done] Collected #{@count} job offers from #{url}. Data stores in: #{filepath}."
    end

    private

    # Counts the job rows on the listing page and prints the number.
    #
    # @return [Integer]
    def get_count
      count = doc.css(JOB_ITEM_SELECTOR).size
      puts "[Info] There is #{count} remote jobs available."
      count
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require_relative 'base'
|
2
|
+
|
3
|
+
module Sites
  # Scraper for the programming and devops listings on weworkremotely.com.
  class WeWorkRemotely < Base

    HOST = 'https://weworkremotely.com'.freeze
    PROGRAMMING = '/categories/remote-programming-jobs'.freeze
    DEVOPS = '/categories/remote-devops-sysadmin-jobs'.freeze
    JOB_ITEM_SELECTOR = '.jobs-container li a'.freeze
    STORE_DIR = 'data/we_work_remotely'.freeze

    # Path prefix that distinguishes job links from the other links matched by
    # JOB_ITEM_SELECTOR.
    JOB_PATH_PREFIX = '/remote-jobs'.freeze

    # Forwards the options to Sites::Base#initialize as keyword arguments.
    #
    # The previous `super(args = {})` overwrote the caller's options with an
    # empty hash, so the DEVOPS listing could never be selected, and handed it
    # over positionally, which Ruby 3 rejects for a keyword-only method.
    #
    # @param args [Hash] options such as +job_type: :devops+
    def initialize(args = {})
      super(**args)
    end

    # Visits every job linked from the listing page and writes one CSV row per
    # offer (URL, location, region, keywords) to the file at +filepath+.
    def collect_jobs
      puts "[Info] Getting the data from #{url} at #{@current_time}..."
      # Create the directory the file is really written to; outside the test
      # environment this is STORE_DIR.
      FileUtils.mkdir_p File.dirname(filepath)

      CSV.open(filepath, 'w') do |csv|
        job_paths.each do |path|
          job_url = "#{HOST}#{path}"
          puts "[Info] Processing #{job_url}..."
          job_page = Nokogiri::HTML(open_page(job_url))
          offer_text = job_page.css('.listing-container').to_s

          # NOTE(review): these are the matched elements themselves (or nil),
          # not their text -- confirm whether `.text` was intended before the
          # values are written to the CSV.
          region = job_page.css('span.region').first
          location = job_page.css('span.location').first

          keywords = Support::OfferParser.get_keywords(offer_text)

          csv << [job_url, location, region, keywords]
        end
      end

      puts "[Done] Collected #{@count} job offers from #{url}. Data stores in: #{filepath}."
    end

    private

    # Paths of all job links on the listing page. Anchors without an href are
    # treated as an empty path and therefore skipped instead of raising.
    def job_paths
      doc.css(JOB_ITEM_SELECTOR)
         .map { |link| link['href'].to_s }
         .select { |href| href.start_with?(JOB_PATH_PREFIX) }
    end

    # Counts the job links on the listing page and prints the number.
    #
    # @return [Integer]
    def get_count
      count = job_paths.size
      puts "[Info] There is #{count} remote jobs available."
      count
    end
  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
module Support
  # Keyword-based extraction of a location hint and technology keywords from
  # the text of a job offer.
  module OfferParser

    # Marker words: the word after "location" and the word before "based" are
    # taken as location hints.
    LOCATION_DICT = ['location', 'based']
    KEYWORDS = [
      'ruby',
      'elixir',
      'react',
      'remote',
      'graphql'
    ]

    # Collects the words next to the first occurrence of each marker in +dict+.
    #
    # A marker with no neighbour on the relevant side ("location" as the last
    # word, "based" as the first word) contributes nothing; it neither raises
    # nor wraps around to the other end of the text.
    #
    # @param content [String, nil] offer text
    # @param dict [Array<String>] markers to look for; only 'location' and
    #   'based' have a rule attached
    # @return [String] the hints joined with ", " and capitalized; empty when
    #   nothing was found
    def self.get_location(content, dict = LOCATION_DICT)
      tokens = get_tokens(content)

      hints = dict.map do |marker|
        position = tokens.find_index(marker)
        next if position.nil?

        case marker
        when 'location'
          tokens[position + 1] # nil when the marker is the last token
        when 'based'
          # A negative index would silently read from the end of the array.
          position.zero? ? nil : tokens[position - 1]
        end
      end

      hints.compact.join(', ').capitalize
    end

    # Lists the +keywords+ that occur as whole words in +content+.
    #
    # @param content [String, nil] offer text
    # @param keywords [Array<String>] lower-case words to look for
    # @return [String] matches in the order of +keywords+, each capitalized,
    #   joined with ", "
    def self.get_keywords(content, keywords = KEYWORDS)
      tokens = get_tokens(content)

      keywords
        .select { |keyword| tokens.include?(keyword) }
        .map(&:capitalize)
        .join(', ')
    end

    # Splits +content+ into lower-case words: periods, commas and colons are
    # removed, then the text is split on whitespace and hyphens. Empty tokens
    # produced by consecutive separators are dropped so that neighbour lookups
    # land on real words.
    #
    # @param content [String, nil]
    # @return [Array<String>]
    def self.get_tokens(content)
      content
        .to_s
        .delete('.,:')
        .downcase
        .split(/[\s-]/)
        .reject(&:empty?)
    end
  end
end
|