remote_job_scraper 0.5.0 → 0.6.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +1 -0
- data/README.md +6 -0
- data/lib/remote_job_scraper.rb +1 -0
- data/lib/remote_job_scraper/cli.rb +11 -3
- data/lib/remote_job_scraper/version.rb +1 -1
- data/lib/sites/elixir_companies.rb +87 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: efcddd08a1843eaa0509f9bdd853ae47476df9db
|
4
|
+
data.tar.gz: 7bb7b71b57b3d8b6b1b8255320092849e7ff7b97
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 03bab4b51e95b76ab697d58577843c56b5b8ce0a2beaac21946d96e11babbe21ec0d410a46dfaf400e540cee70ac7546e084f624dd2b7d7dfe5dcf74be4dd966
|
7
|
+
data.tar.gz: 3e1f3570fe6766b5b20a8acd5cd1f01768da398d9a7de759caf2efa5a0c459c6641f3f3fb81919a6b47cec0df102de7d3e0b5944eacc9664ef6af97c57945724
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -40,6 +40,12 @@ After checking out the repo, run `bin/setup` to install dependencies. Then, run
|
|
40
40
|
|
41
41
|
To install this gem onto your local machine, run `bundle exec rake install`. To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
42
42
|
|
43
|
+
## Debugging
|
44
|
+
```
|
45
|
+
require 'byebug'
|
46
|
+
byebug
|
47
|
+
```
|
48
|
+
|
43
49
|
## Contributing
|
44
50
|
|
45
51
|
Bug reports and pull requests are welcome on GitHub at https://github.com/rafaltrojanowski/remote_job_scraper. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
|
data/lib/remote_job_scraper.rb
CHANGED
@@ -5,14 +5,22 @@ module RemoteJobScraper
|
|
5
5
|
|
6
6
|
AVAILABLE_SITES = %w(we_work_remotely remote_ok 42jobs_rails)
|
7
7
|
|
8
|
-
desc '
|
8
|
+
desc 'collect_remote_companies',
|
9
9
|
"Retrieves remote companies
|
10
|
-
[Example]: remote_job_scraper
|
10
|
+
[Example]: remote_job_scraper collect_remote_companies
|
11
11
|
"
|
12
|
-
def
|
12
|
+
def collect_remote_companies
|
13
13
|
Sites::GithubRemoteJobs.new.collect_companies
|
14
14
|
end
|
15
15
|
|
16
|
+
desc 'collect_elixir_companies',
|
17
|
+
"Retrieves Elixir companies
|
18
|
+
[Example]: remote_job_scraper collect_elixir_companies
|
19
|
+
"
|
20
|
+
def collect_elixir_companies
|
21
|
+
Sites::ElixirCompanies.new.collect_companies
|
22
|
+
end
|
23
|
+
|
16
24
|
desc 'collect_jobs LIMIT DELAY',
|
17
25
|
"Retrieves data from #{AVAILABLE_SITES.join(', ')}.
|
18
26
|
[Example]: remote_job_scraper collect_jobs 10 9.0..10.0
|
@@ -0,0 +1,87 @@
|
|
1
|
+
require_relative 'base'
|
2
|
+
|
3
|
+
module Sites
  # Scrapes the company listing at elixir-companies.com and appends one CSV
  # row per company to the file returned by `filepath`.
  #
  # NOTE(review): `open_page` and `filepath` are not defined in this file --
  # presumably inherited from Base. Nokogiri, CSV and FileUtils are likewise
  # assumed to be required elsewhere; confirm against base.rb.
  class ElixirCompanies < Base

    HOST = 'https://elixir-companies.com'.freeze
    PATH = '/en/browse'.freeze
    JOB_ITEM_SELECTOR = 'div.company.box'.freeze
    STORE_DIR = 'data/elixir_companies'.freeze

    # Link captions that appear in the company info text but carry no data.
    IGNORED_LABELS = ['GitHub', 'Add a job'].freeze

    def initialize
      @url = "#{self.class::HOST}#{self.class::PATH}"
      @current_time = Time.now
      @timestamp = @current_time.strftime("%Y%m%d%H%M%S")
      @doc = nil
      # Hard-coded number of listing pages; must be bumped when the site grows.
      @total_pages = 26
      @rows_count = 0
      @jobs_count = get_jobs_count
    end

    # Walks the listing pages and stores every company found.
    #
    # @param limit [Integer, nil] maximum number of rows to write; nil means
    #   "no limit".
    # @return [void]
    def collect_companies(limit: nil)
      FileUtils.mkdir_p STORE_DIR

      (1..@total_pages).each do |page|
        process_page(page: page, limit: limit)
        # Stop issuing HTTP requests once the requested number of rows has
        # been written; previously every remaining page was still fetched.
        break if limit_reached?(limit)
      end

      # Printed here (not inside process_page) so it is emitted even when the
      # limit ends the run early, and reports the rows actually written rather
      # than the rough estimate from get_jobs_count.
      puts "[Done] Collected #{@rows_count} job offers from #{@url}. Data stored in: #{filepath}."
    end

    # @return [Integer] number of rows written so far by this instance.
    def companies_count
      @rows_count
    end

    private

    # True once `limit` rows have been written. A nil limit never triggers.
    def limit_reached?(limit)
      !limit.nil? && @rows_count >= limit
    end

    # Fetches a single listing page and appends its companies to the CSV file.
    #
    # @param page [Integer] 1-based page number, sent as the `page` query param.
    # @param limit [Integer, nil] see #collect_companies.
    def process_page(page:, limit:)
      current_page = "#{@url}?page=#{page}"
      doc = Nokogiri::HTML(open_page(current_page))
      puts "[Info] Getting the data from #{current_page}"

      # 'ab' appends, so all pages of one run end up in the same file.
      CSV.open(filepath, 'ab') do |csv|
        doc.css(JOB_ITEM_SELECTOR).each do |company_box|
          break if limit_reached?(limit)

          csv << get_row(company_box)
          @rows_count += 1
        end
      end
    end

    # Builds one CSV row from a company box node.
    #
    # @param company_box [Nokogiri::XML::Element] node matched by
    #   JOB_ITEM_SELECTOR.
    # @return [Array] [title, industry, website, blog, location, hiring] where
    #   blog and hiring may be nil.
    def get_row(company_box)
      company_title = company_box.css('div.content p.title').text
      company_info = company_box.css('div.content.company-info p')

      # The values sit as bare text between span elements, so take the whole
      # text, keep only lines containing letters and drop the link captions.
      fields = company_info.text.split("\n")
                           .select { |line| line.match?(/[a-zA-Z]/) }
                           .map(&:strip)
                           .reject { |line| IGNORED_LABELS.include?(line) }

      industry = fields[0]
      company_website = fields[1]

      # The third field is treated as a blog URL when it looks like one.
      has_blog = fields[2] && (fields[2].include?("/") || fields[2].include?("blog"))

      if has_blog
        blog = fields[2]
        # NOTE(review): index 4 (not 3) is kept from the original; verify
        # against the live markup what fields[3] holds.
        location = fields[4]
      else
        blog = nil
        location = fields[2]
      end

      hiring = company_box["class"].include?("has-ribbon") ? "Hiring!" : nil

      [company_title, industry, company_website, blog, location, hiring]
    end

    # Prints and returns a rough estimate of the number of listed entries
    # (16 per page; the first page is known to hold fewer).
    def get_jobs_count
      jobs_count = 16 * @total_pages # roughly - first page has 14 items
      puts "[Info] There are #{jobs_count} remote jobs on [ElixirCompanies]."
      jobs_count
    end
  end
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: remote_job_scraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.0
|
4
|
+
version: 0.6.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rafał Trojanowski
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2019-
|
11
|
+
date: 2019-08-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|
@@ -166,6 +166,7 @@ files:
|
|
166
166
|
- lib/remote_job_scraper/configuration.rb
|
167
167
|
- lib/remote_job_scraper/version.rb
|
168
168
|
- lib/sites/base.rb
|
169
|
+
- lib/sites/elixir_companies.rb
|
169
170
|
- lib/sites/elixir_radar.rb
|
170
171
|
- lib/sites/github_remote_jobs.rb
|
171
172
|
- lib/sites/jobs_rails42.rb
|