rubyscraper 0.3.0 → 0.9.0
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -6
- data/README.md +37 -7
- data/lib/rubyscraper.rb +14 -148
- data/lib/rubyscraper/api_dispatcher.rb +31 -0
- data/lib/rubyscraper/binary.rb +9 -6
- data/lib/rubyscraper/option_parser.rb +72 -0
- data/lib/rubyscraper/paginator.rb +59 -0
- data/lib/rubyscraper/processor.rb +47 -0
- data/lib/rubyscraper/sub_page_scraper.rb +53 -0
- data/lib/rubyscraper/summary_scraper.rb +65 -0
- data/lib/rubyscraper/version.rb +1 -1
- data/rubyscraper.gemspec +5 -6
- data/spec/paginator_spec.rb +83 -0
- data/spec/rubyscraper_spec.rb +2 -6
- data/spec/spec_helper.rb +3 -0
- data/spec/sub_page_scraper_spec.rb +51 -0
- data/spec/summary_scraper_spec.rb +125 -0
- metadata +27 -33
- data/lib/assets/scrapes.json +0 -287
data/lib/rubyscraper/processor.rb ADDED
@@ -0,0 +1,47 @@
+require 'json'
+require 'rubyscraper/paginator'
+require 'rubyscraper/summary_scraper'
+require 'rubyscraper/sub_page_scraper'
+
+class Processor
+  attr_reader :sites, :record_limit, :single_site, :scrape_delay
+
+  def initialize(config_file, single_site, record_limit, scrape_delay)
+    @scrape_file = config_file
+    @scrape_config = JSON.parse(File.read(@scrape_file))
+    @sites = @scrape_config
+    @single_site = single_site
+    @record_limit = record_limit
+    @scrape_delay = scrape_delay
+  end
+
+  def call
+    !single_site.empty? ? scrape_single_site : scrape_all_sites
+  end
+
+  private
+
+  def scrape_single_site
+    site = sites.select { |s| s["name"] == single_site }.first
+    scrape_site(site)
+  end
+
+  def scrape_all_sites
+    sites.inject [] do |all_results, site|
+      all_results += scrape_site(site)
+    end
+  end
+
+  def scrape_site(site)
+    paginator = Paginator.new(site, record_limit)
+    paginator.define_pagination_params
+
+    results = SummaryScraper.new(site, paginator.add_on, paginator.steps).call
+    results = SubPageScraper.new(site, results, scrape_delay).call if has_sub_pages?(site)
+    results
+  end
+
+  def has_sub_pages?(site)
+    site["summary"]["has_sub_pages"] == "true"
+  end
+end
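For orientation, a minimal sketch of driving the new Processor directly. The config path and argument values here are hypothetical; in the gem itself they are supplied from the CLI, presumably via the new binary and option parser:

require 'rubyscraper/processor'

# Hypothetical values: the real ones come from rubyscraper's CLI options.
# An empty single_site string makes Processor#call scrape every configured site.
processor = Processor.new(
  "scrapes.json",  # path to a JSON array of site definitions (hypothetical)
  "",              # single_site: "" => scrape_all_sites
  50,              # record_limit, passed through to Paginator
  1                # scrape_delay in seconds, passed through to SubPageScraper
)
results = processor.call  # => array of listing hashes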
data/lib/rubyscraper/sub_page_scraper.rb ADDED
@@ -0,0 +1,53 @@
+require 'capybara'
+require 'capybara/poltergeist'
+
+class SubPageScraper
+  attr_reader :site, :listings, :delay
+  include Capybara::DSL
+
+  def initialize(site, listings, delay)
+    @site = site
+    @listings = listings
+    @delay = delay
+
+    Capybara.register_driver :poltergeist do |app|
+      Capybara::Poltergeist::Driver.new(app, js_errors: false)
+    end
+    Capybara.default_driver = :poltergeist
+  end
+
+  def call
+    puts "Pulling #{@listings.count} listings from #{@site["name"]}:"
+    listings = @listings.inject [] do |results, listing|
+      sleep delay
+      listing = pull_sub_page_data(site, listing)
+      listing = listing_cleanup(listing)
+      results << listing
+    end; puts "\n"; listings
+  end
+
+  def pull_sub_page_data(site, listing)
+    visit listing["url"]
+    site["sub_page"]["fields"].each do |field|
+      if field["method"] == "all"
+        if has_css?(field["path"])
+          values = all(field["path"]).map do |elem|
+            elem.send(field["loop_collect"])
+          end
+          listing[field["field"]] = values.join(field["join"])
+        end
+      else
+        if has_css?(field["path"])
+          listing[field["field"]] =
+            send(field["method"].to_sym, field["path"]).text
+        end
+      end
+    end; print "."; listing
+  end
+
+  def listing_cleanup(listing)
+    # Remove 'Headquarters: ' from weworkremotely jobs
+    listing["location"].slice!("Headquarter: ") if !listing["location"].to_s.empty?
+    listing
+  end
+end
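The field hashes that drive pull_sub_page_data come straight from the scrape config. The two shapes it dispatches on look like this (values borrowed from the sub_page_scraper spec further down, shown as Ruby hashes for readability):

# "method" == "all": collect every node matching the path, then join the values
{ "field" => "tags", "method" => "all", "path" => "div.tags a.post-tag",
  "loop_collect" => "text", "join" => ", " }

# any other "method" (e.g. "find"): send it to Capybara and take the node's text
{ "field" => "company", "method" => "find", "path" => "a.employer" }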
data/lib/rubyscraper/summary_scraper.rb ADDED
@@ -0,0 +1,65 @@
+require 'capybara'
+require 'capybara/poltergeist'
+
+class SummaryScraper
+  attr_reader :site, :pagination_addon, :pagination_pages
+  include Capybara::DSL
+
+  def initialize(site, pagination_addon, pagination_pages)
+    @site = site
+    @pagination_addon = pagination_addon
+    @pagination_pages = pagination_pages
+
+    Capybara.register_driver :poltergeist do |app|
+      Capybara::Poltergeist::Driver.new(app, js_errors: false)
+    end
+    Capybara.default_driver = :poltergeist
+  end
+
+  def call
+    pagination_pages.inject [] do |results, page|
+      results += get_summaries(page)
+    end
+  end
+
+  private
+
+  def get_summaries(page_no)
+    visit page_url(page_no)
+
+    all(site["summary"]["loop"]).inject [] do |results, listing|
+      record = pull_summary_data(site, listing)
+      record = listing_cleanup(site, record)
+      results << record
+    end
+  end
+
+  def page_url(page)
+    site["base_url"] + site["summary"]["url"] + pagination_addon + page.to_s
+  end
+
+  def pull_summary_data(site, record)
+    output = Hash.new
+    site["summary"]["fields"].each do |field|
+      if field["attr"]
+        if record.has_css?(field["path"])
+          output[field["field"]] =
+            record.send(field["method"].to_sym, field["path"])[field["attr"]]
+        end
+      else
+        if record.has_css?(field["path"])
+          output[field["field"]] =
+            record.send(field["method"].to_sym, field["path"]).text
+        end
+      end
+    end; output
+  end
+
+  def listing_cleanup(site, listing)
+    # Add base url if not present
+    unless listing["url"].match(/^http/)
+      listing["url"] = "#{site["base_url"]}#{listing["url"]}"
+    end
+    listing
+  end
+end
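page_url is plain string concatenation; with the StackOverflow config used in the specs below, page number 2 works out to:

site["base_url"] + site["summary"]["url"] + pagination_addon + 2.to_s
# => "http://www.careers.stackoverflow.com" + "/jobs/tag/ruby?sort=p" + "&pg=" + "2"
# => "http://www.careers.stackoverflow.com/jobs/tag/ruby?sort=p&pg=2"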
data/lib/rubyscraper/version.rb CHANGED
data/rubyscraper.gemspec CHANGED
@@ -13,13 +13,12 @@ Gem::Specification.new do |s|
   s.homepage = 'https://github.com/ndwhtlssthr/rubyscraper'
   s.executables << 'rubyscraper'

-  s.add_dependency "capybara"
-  s.add_dependency "poltergeist"
-  s.add_dependency "rest-client"
-  s.add_dependency "slop"
+  s.add_dependency "capybara", "~> 2.4"
+  s.add_dependency "poltergeist", "~> 1.6"
+  s.add_dependency "rest-client", "~> 1.8"

   s.add_development_dependency "bundler", "~> 1.9"
   s.add_development_dependency "rake", "~> 10.0"
-  s.add_development_dependency
-  s.add_development_dependency
+  s.add_development_dependency "rspec", "~> 3.0"
+  s.add_development_dependency "pry", "~> 0.10"
 end
data/spec/paginator_spec.rb ADDED
@@ -0,0 +1,83 @@
+require 'spec_helper'
+
+describe Paginator do
+  it 'returns defaults if not paginated' do
+    json = '{"summary":{
+      "paginated":"false"
+    }}'
+    site = JSON.parse(json)
+
+    paginator = Paginator.new(site, :all)
+    paginator.define_pagination_params
+    expect(paginator.add_on).to eq ""
+    expect(paginator.steps).to eq [""]
+  end
+
+  it 'returns the correct add_on with pagination' do
+    json = '{"summary":{
+      "paginated":"true",
+      "pagination":{
+        "format":"&pg=NUM",
+        "start":"1",
+        "scale":"1",
+        "records_per_page":"25"
+      }
+    }}'
+    site = JSON.parse(json)
+
+    paginator = Paginator.new(site, 50)
+    paginator.define_pagination_params
+    expect(paginator.add_on).to eq "&pg=NUM"
+  end
+
+  it 'returns the correct pages when given record limit' do
+    json = '{"summary":{
+      "paginated":"true",
+      "pagination":{
+        "format":"&pg=NUM",
+        "start":"1",
+        "scale":"1",
+        "records_per_page":"25"
+      }
+    }}'
+    site = JSON.parse(json)
+
+    paginator = Paginator.new(site, 50)
+    paginator.define_pagination_params
+    expect(paginator.steps).to eq [1, 2]
+  end
+
+  it 'adds an additional page if pages wrap to next page' do
+    json = '{"summary":{
+      "paginated":"true",
+      "pagination":{
+        "format":"&pg=NUM",
+        "start":"1",
+        "scale":"1",
+        "records_per_page":"25"
+      }
+    }}'
+    site = JSON.parse(json)
+
+    paginator = Paginator.new(site, 58)
+    paginator.define_pagination_params
+    expect(paginator.steps).to eq [1, 2, 3]
+  end
+
+  it 'can handle a starting of 0' do
+    json = '{"summary":{
+      "paginated":"true",
+      "pagination":{
+        "format":"&pg=NUM",
+        "start":"0",
+        "scale":"10",
+        "records_per_page":"10"
+      }
+    }}'
+    site = JSON.parse(json)
+
+    paginator = Paginator.new(site, 32)
+    paginator.define_pagination_params
+    expect(paginator.steps).to eq [0, 10, 20, 30]
+  end
+end
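Paginator's own source is collapsed in this diff (data/lib/rubyscraper/paginator.rb, +59 lines), but the specs above pin down its contract. A minimal sketch that would satisfy them, not necessarily the gem's actual implementation:

class Paginator
  attr_reader :add_on, :steps

  def initialize(site, record_limit)
    @site = site
    @record_limit = record_limit
  end

  def define_pagination_params
    unless @site["summary"]["paginated"] == "true"
      @add_on = ""    # no query-string add-on
      @steps  = [""]  # a single, unnumbered "page"
      return
    end

    config = @site["summary"]["pagination"]
    start  = config["start"].to_i
    scale  = config["scale"].to_i
    per_pg = config["records_per_page"].to_i

    @add_on = config["format"]
    # Round up: 58 records at 25 per page still needs a third fetch.
    pages  = (@record_limit.to_f / per_pg).ceil
    @steps = pages.times.map { |i| start + i * scale }
  end
end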
data/spec/rubyscraper_spec.rb CHANGED
@@ -1,11 +1,7 @@
 require 'spec_helper'

-describe
+describe RubyScraper do
   it 'has a version number' do
-    expect(
-  end
-
-  it 'does something useful' do
-    expect(false).to eq(true)
+    expect(RubyScraper::VERSION).not_to be nil
   end
 end
data/spec/spec_helper.rb CHANGED

data/spec/sub_page_scraper_spec.rb ADDED
@@ -0,0 +1,51 @@
+require 'spec_helper'
+
+describe SubPageScraper do
+  it 'can pull record subfields from a list of existing jobs' do
+    jobs = [{"url" => "http://careers.stackoverflow.com/jobs/84266/software-developer-qhr-technologies"},
+            {"url" => "http://careers.stackoverflow.com/jobs/81592/service-engineer-bloomberg-lp"}]
+    json = '{
+      "sub_page":{
+        "fields":[
+          {
+            "field":"company",
+            "method":"find",
+            "path":"a.employer"
+          },
+          {
+            "field":"location",
+            "method":"find",
+            "path":"span.location"
+          },
+          {
+            "field":"description",
+            "method":"all",
+            "path":"div.description p",
+            "loop_collect":"text",
+            "join":"\n"
+          },
+          {
+            "field":"tags",
+            "method":"all",
+            "path":"div.tags a.post-tag",
+            "loop_collect":"text",
+            "join":", "
+          }
+        ]
+      }
+    }'
+    site = JSON.parse(json)
+
+    scraper = SubPageScraper.new(site, jobs)
+    results = scraper.call
+    expect(results.length).to eq 2
+    expect(results.first["company"]).to be_a String
+    expect(results.first["company"]).to_not be_empty
+    expect(results.first["location"]).to be_a String
+    expect(results.first["location"]).to_not be_empty
+    expect(results.first["description"]).to be_a String
+    expect(results.first["description"]).to_not be_empty
+    expect(results.first["tags"]).to be_a String
+    expect(results.first["tags"]).to_not be_empty
+  end
+end
data/spec/summary_scraper_spec.rb ADDED
@@ -0,0 +1,125 @@
+require 'spec_helper'
+
+describe SummaryScraper do
+  it 'can pull records from first page of paginated site' do
+    paginator = OpenStruct.new(add_on: "&pg=", pages: [1])
+    json = '{
+      "base_url":"http://www.careers.stackoverflow.com",
+      "summary":{
+        "url":"/jobs/tag/ruby?sort=p",
+        "loop":".listResults .-item",
+        "fields":[
+          {
+            "field":"position",
+            "method":"find",
+            "path":"h3.-title a"
+          },
+          {
+            "field":"url",
+            "method":"find",
+            "path":"h3.-title a",
+            "attr":"href"
+          },
+          {
+            "field":"posting_date",
+            "method":"first",
+            "path":"p._muted"
+          }
+        ]
+      }
+    }'
+    site = JSON.parse(json)
+
+    scraper = SummaryScraper.new(site, paginator.add_on, paginator.pages)
+    results = scraper.call
+    expect(results.length).to eq 25
+    expect(results.first["position"]).to be_a String
+    expect(results.first["position"]).to_not be_empty
+    expect(results.first["url"]).to be_a String
+    expect(results.first["url"]).to match(/^http/)
+    expect(results.first["posting_date"]).to be_a String
+    expect(results.first["posting_date"]).to_not be_empty
+  end
+
+  it 'can pull records from multiple pages of paginated site' do
+    paginator = OpenStruct.new(add_on: "&pg=", pages: [1, 2])
+    json = '{
+      "base_url":"http://www.careers.stackoverflow.com",
+      "summary":{
+        "url":"/jobs/tag/ruby?sort=p",
+        "loop":".listResults .-item",
+        "fields":[
+          {
+            "field":"position",
+            "method":"find",
+            "path":"h3.-title a"
+          },
+          {
+            "field":"url",
+            "method":"find",
+            "path":"h3.-title a",
+            "attr":"href"
+          },
+          {
+            "field":"posting_date",
+            "method":"first",
+            "path":"p._muted"
+          }
+        ]
+      }
+    }'
+    site = JSON.parse(json)
+
+    scraper = SummaryScraper.new(site, paginator.add_on, paginator.pages)
+    results = scraper.call
+    expect(results.length).to be > 26
+  end
+
+  it 'can pull records from non-paginated site' do
+    paginator = OpenStruct.new(add_on: "", pages: [""])
+    json = '{
+      "base_url":"https://weworkremotely.com",
+      "summary":{
+        "url":"/categories/2/jobs",
+        "has_sub_pages":"false",
+        "loop":"section.jobs ul li",
+        "fields":[
+          {
+            "field":"position",
+            "method":"find",
+            "path":"span.title"
+          },
+          {
+            "field":"company",
+            "method":"find",
+            "path":"span.company"
+          },
+          {
+            "field":"url",
+            "method":"find",
+            "path":"a",
+            "attr":"href"
+          },
+          {
+            "field":"posting_date",
+            "method":"find",
+            "path":"span.date"
+          }
+        ]
+      }
+    }'
+    site = JSON.parse(json)
+
+    scraper = SummaryScraper.new(site, paginator.add_on, paginator.pages)
+    results = scraper.call
+    expect(results.length).to be > 1
+    expect(results.first["position"]).to be_a String
+    expect(results.first["position"]).to_not be_empty
+    expect(results.first["company"]).to be_a String
+    expect(results.first["company"]).to_not be_empty
+    expect(results.first["url"]).to be_a String
+    expect(results.first["url"]).to match(/^http/)
+    expect(results.first["posting_date"]).to be_a String
+    expect(results.first["posting_date"]).to_not be_empty
+  end
+end