rubyscraper 0.3.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -6
- data/README.md +37 -7
- data/lib/rubyscraper.rb +14 -148
- data/lib/rubyscraper/api_dispatcher.rb +31 -0
- data/lib/rubyscraper/binary.rb +9 -6
- data/lib/rubyscraper/option_parser.rb +72 -0
- data/lib/rubyscraper/paginator.rb +59 -0
- data/lib/rubyscraper/processor.rb +47 -0
- data/lib/rubyscraper/sub_page_scraper.rb +53 -0
- data/lib/rubyscraper/summary_scraper.rb +65 -0
- data/lib/rubyscraper/version.rb +1 -1
- data/rubyscraper.gemspec +5 -6
- data/spec/paginator_spec.rb +83 -0
- data/spec/rubyscraper_spec.rb +2 -6
- data/spec/spec_helper.rb +3 -0
- data/spec/sub_page_scraper_spec.rb +51 -0
- data/spec/summary_scraper_spec.rb +125 -0
- metadata +27 -33
- data/lib/assets/scrapes.json +0 -287
data/lib/rubyscraper/processor.rb
ADDED
@@ -0,0 +1,47 @@
+require 'json'
+require 'rubyscraper/paginator'
+require 'rubyscraper/summary_scraper'
+require 'rubyscraper/sub_page_scraper'
+
+class Processor
+  attr_reader :sites, :record_limit, :single_site, :scrape_delay
+
+  def initialize(config_file, single_site, record_limit, scrape_delay)
+    @scrape_file = config_file
+    @scrape_config = JSON.parse(File.read(@scrape_file))
+    @sites = @scrape_config
+    @single_site = single_site
+    @record_limit = record_limit
+    @scrape_delay = scrape_delay
+  end
+
+  def call
+    !single_site.empty? ? scrape_single_site : scrape_all_sites
+  end
+
+  private
+
+  def scrape_single_site
+    site = sites.select { |s| s["name"] == single_site }.first
+    scrape_site(site)
+  end
+
+  def scrape_all_sites
+    sites.inject [] do |all_results, site|
+      all_results += scrape_site(site)
+    end
+  end
+
+  def scrape_site(site)
+    paginator = Paginator.new(site, record_limit)
+    paginator.define_pagination_params
+
+    results = SummaryScraper.new(site, paginator.add_on, paginator.steps).call
+    results = SubPageScraper.new(site, results, scrape_delay).call if has_sub_pages?(site)
+    results
+  end
+
+  def has_sub_pages?(site)
+    site["summary"]["has_sub_pages"] == "true"
+  end
+end
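A minimal usage sketch for the new Processor, assuming the gem's lib directory is on the load path; the config path, empty single-site filter, and numeric values below are illustrative, while the constructor signature comes from the hunk above.

    # Hypothetical invocation -- the "scrapes.json" path and argument values are assumptions.
    require 'rubyscraper/processor'

    processor = Processor.new(
      "scrapes.json",  # config_file: JSON array of site definitions
      "",              # single_site: empty string means scrape every configured site
      50,              # record_limit: handed to Paginator to cap the pages visited
      1                # scrape_delay: seconds slept before each sub-page visit
    )
    results = processor.call  # => array of listing hashes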
data/lib/rubyscraper/sub_page_scraper.rb
ADDED
@@ -0,0 +1,53 @@
+require 'capybara'
+require 'capybara/poltergeist'
+
+class SubPageScraper
+  attr_reader :site, :listings, :delay
+  include Capybara::DSL
+
+  def initialize(site, listings, delay)
+    @site = site
+    @listings = listings
+    @delay = delay
+
+    Capybara.register_driver :poltergeist do |app|
+      Capybara::Poltergeist::Driver.new(app, js_errors: false)
+    end
+    Capybara.default_driver = :poltergeist
+  end
+
+  def call
+    puts "Pulling #{@listings.count} listings from #{@site["name"]}:"
+    listings = @listings.inject [] do |results, listing|
+      sleep delay
+      listing = pull_sub_page_data(site, listing)
+      listing = listing_cleanup(listing)
+      results << listing
+    end; puts "\n"; listings
+  end
+
+  def pull_sub_page_data(site, listing)
+    visit listing["url"]
+    site["sub_page"]["fields"].each do |field|
+      if field["method"] == "all"
+        if has_css?(field["path"])
+          values = all(field["path"]).map do |elem|
+            elem.send(field["loop_collect"])
+          end
+          listing[field["field"]] = values.join(field["join"])
+        end
+      else
+        if has_css?(field["path"])
+          listing[field["field"]] =
+            send(field["method"].to_sym,field["path"]).text
+        end
+      end
+    end; print "."; listing
+  end
+
+  def listing_cleanup(listing)
+    # Remove 'Headquarters: ' from weworkremotely jobs
+    listing["location"].slice!("Headquarter: ") if !listing["location"].to_s.empty?
+    listing
+  end
+end
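As a rough sketch of how SubPageScraper is fed (the site hash and listing URL below are placeholders; the constructor arguments and the "sub_page"/"fields" keys come from the class above and the spec further down):

    # Illustrative only: one configured field and one listing to enrich.
    site = {
      "name"     => "example",
      "sub_page" => {
        "fields" => [
          { "field" => "company", "method" => "find", "path" => "a.employer" }
        ]
      }
    }
    listings = [{ "url" => "http://example.com/jobs/1" }]

    scraper  = SubPageScraper.new(site, listings, 1)  # 1-second delay between page visits
    enriched = scraper.call  # each listing gains a "company" key when the CSS path matches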
data/lib/rubyscraper/summary_scraper.rb
ADDED
@@ -0,0 +1,65 @@
+require 'capybara'
+require 'capybara/poltergeist'
+
+class SummaryScraper
+  attr_reader :site, :pagination_addon, :pagination_pages
+  include Capybara::DSL
+
+  def initialize(site, pagination_addon, pagination_pages)
+    @site = site
+    @pagination_addon = pagination_addon
+    @pagination_pages = pagination_pages
+
+    Capybara.register_driver :poltergeist do |app|
+      Capybara::Poltergeist::Driver.new(app, js_errors: false)
+    end
+    Capybara.default_driver = :poltergeist
+  end
+
+  def call
+    pagination_pages.inject [] do |results, page|
+      results += get_summaries(page)
+    end
+  end
+
+  private
+
+  def get_summaries(page_no)
+    visit page_url(page_no)
+
+    all(site["summary"]["loop"]).inject [] do |results, listing|
+      record = pull_summary_data(site, listing)
+      record = listing_cleanup(site, record)
+      results << record
+    end
+  end
+
+  def page_url(page)
+    site["base_url"] + site["summary"]["url"] + pagination_addon + page.to_s
+  end
+
+  def pull_summary_data(site, record)
+    output = Hash.new
+    site["summary"]["fields"].each do |field|
+      if field["attr"]
+        if record.has_css?(field["path"])
+          output[field["field"]] =
+            record.send(field["method"].to_sym, field["path"])[field["attr"]]
+        end
+      else
+        if record.has_css?(field["path"])
+          output[field["field"]] =
+            record.send(field["method"].to_sym, field["path"]).text
+        end
+      end
+    end; output
+  end
+
+  def listing_cleanup(site, listing)
+    # Add base url if not present
+    unless listing["url"].match(/^http/)
+      listing["url"] = "#{site["base_url"]}#{listing["url"]}"
+    end
+    listing
+  end
+end
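To show how page_url composes a request, here is a sketch patterned on the specs that follow; the site hash values are placeholders, not part of the diff.

    # Illustrative only: with this configuration and the "&pg=" add-on,
    # page_url(2) resolves to
    #   http://www.careers.stackoverflow.com/jobs/tag/ruby?sort=p&pg=2
    site = {
      "base_url" => "http://www.careers.stackoverflow.com",
      "summary"  => {
        "url"    => "/jobs/tag/ruby?sort=p",
        "loop"   => ".listResults .-item",
        "fields" => []
      }
    }
    scraper = SummaryScraper.new(site, "&pg=", [1, 2])
    results = scraper.call  # one record hash per ".listResults .-item" element on each visited page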
data/lib/rubyscraper/version.rb
CHANGED
data/rubyscraper.gemspec
CHANGED
@@ -13,13 +13,12 @@ Gem::Specification.new do |s|
   s.homepage = 'https://github.com/ndwhtlssthr/rubyscraper'
   s.executables << 'rubyscraper'
 
-  s.add_dependency "capybara"
-  s.add_dependency "poltergeist"
-  s.add_dependency "rest-client"
-  s.add_dependency "slop"
+  s.add_dependency "capybara", "~> 2.4"
+  s.add_dependency "poltergeist", "~> 1.6"
+  s.add_dependency "rest-client", "~> 1.8"
 
   s.add_development_dependency "bundler", "~> 1.9"
   s.add_development_dependency "rake", "~> 10.0"
-  s.add_development_dependency
-  s.add_development_dependency
+  s.add_development_dependency "rspec", "~> 3.0"
+  s.add_development_dependency "pry", "~> 0.10"
 end
data/spec/paginator_spec.rb
ADDED
@@ -0,0 +1,83 @@
+require 'spec_helper'
+
+describe Paginator do
+  it 'returns defaults if not paginated' do
+    json = '{"summary":{
+      "paginated":"false"
+    }}'
+    site = JSON.parse(json)
+
+    paginator = Paginator.new(site, :all)
+    paginator.define_pagination_params
+    expect(paginator.add_on).to eq ""
+    expect(paginator.steps).to eq [""]
+  end
+
+  it 'returns the correct add_on with pagination' do
+    json = '{"summary":{
+      "paginated":"true",
+      "pagination":{
+        "format":"&pg=NUM",
+        "start":"1",
+        "scale":"1",
+        "records_per_page":"25"
+      }
+    }}'
+    site = JSON.parse(json)
+
+    paginator = Paginator.new(site, 50)
+    paginator.define_pagination_params
+    expect(paginator.add_on).to eq "&pg=NUM"
+  end
+
+  it 'returns the correct pages when given record limit' do
+    json = '{"summary":{
+      "paginated":"true",
+      "pagination":{
+        "format":"&pg=NUM",
+        "start":"1",
+        "scale":"1",
+        "records_per_page":"25"
+      }
+    }}'
+    site = JSON.parse(json)
+
+    paginator = Paginator.new(site, 50)
+    paginator.define_pagination_params
+    expect(paginator.steps).to eq [1, 2]
+  end
+
+  it 'adds an additional page if pages wrap to next page' do
+    json = '{"summary":{
+      "paginated":"true",
+      "pagination":{
+        "format":"&pg=NUM",
+        "start":"1",
+        "scale":"1",
+        "records_per_page":"25"
+      }
+    }}'
+    site = JSON.parse(json)
+
+    paginator = Paginator.new(site, 58)
+    paginator.define_pagination_params
+    expect(paginator.steps).to eq [1, 2, 3]
+  end
+
+  it 'can handle a starting of 0' do
+    json = '{"summary":{
+      "paginated":"true",
+      "pagination":{
+        "format":"&pg=NUM",
+        "start":"0",
+        "scale":"10",
+        "records_per_page":"10"
+      }
+    }}'
+    site = JSON.parse(json)
+
+    paginator = Paginator.new(site, 32)
+    paginator.define_pagination_params
+    expect(paginator.steps).to eq [0, 10, 20, 30]
+  end
+end
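The Paginator implementation itself (data/lib/rubyscraper/paginator.rb, +59) is not shown in this diff, but the expectations above imply roughly the following arithmetic; this is a sketch of the inferred behaviour, not the shipped code.

    # Pages needed = record_limit / records_per_page, rounded up;
    # steps run from "start" in increments of "scale".
    #   limit 50, 25/page, start 1, scale 1   -> [1, 2]
    #   limit 58, 25/page, start 1, scale 1   -> [1, 2, 3]
    #   limit 32, 10/page, start 0, scale 10  -> [0, 10, 20, 30]
    pages = (58.0 / 25).ceil                     # => 3
    steps = Array.new(pages) { |i| 1 + i * 1 }   # => [1, 2, 3]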
data/spec/rubyscraper_spec.rb
CHANGED
@@ -1,11 +1,7 @@
 require 'spec_helper'
 
-describe
+describe RubyScraper do
   it 'has a version number' do
-    expect(
-  end
-
-  it 'does something useful' do
-    expect(false).to eq(true)
+    expect(RubyScraper::VERSION).not_to be nil
   end
 end
data/spec/spec_helper.rb
CHANGED
data/spec/sub_page_scraper_spec.rb
ADDED
@@ -0,0 +1,51 @@
+require 'spec_helper'
+
+describe SubPageScraper do
+  it 'can pull record subfields from a list of existing jobs' do
+    jobs = [{"url" => "http://careers.stackoverflow.com/jobs/84266/software-developer-qhr-technologies"},
+            {"url" => "http://careers.stackoverflow.com/jobs/81592/service-engineer-bloomberg-lp"}]
+    json = '{
+      "sub_page":{
+        "fields":[
+          {
+            "field":"company",
+            "method":"find",
+            "path":"a.employer"
+          },
+          {
+            "field":"location",
+            "method":"find",
+            "path":"span.location"
+          },
+          {
+            "field":"description",
+            "method":"all",
+            "path":"div.description p",
+            "loop_collect":"text",
+            "join":"\n"
+          },
+          {
+            "field":"tags",
+            "method":"all",
+            "path":"div.tags a.post-tag",
+            "loop_collect":"text",
+            "join":", "
+          }
+        ]
+      }
+    }'
+    site = JSON.parse(json)
+
+    scraper = SubPageScraper.new(site, jobs)
+    results = scraper.call
+    expect(results.length).to eq 2
+    expect(results.first["company"]).to be_a String
+    expect(results.first["company"]).to_not be_empty
+    expect(results.first["location"]).to be_a String
+    expect(results.first["location"]).to_not be_empty
+    expect(results.first["description"]).to be_a String
+    expect(results.first["description"]).to_not be_empty
+    expect(results.first["tags"]).to be_a String
+    expect(results.first["tags"]).to_not be_empty
+  end
+end
data/spec/summary_scraper_spec.rb
ADDED
@@ -0,0 +1,125 @@
+require 'spec_helper'
+
+describe SummaryScraper do
+  it 'can pull records from first page of paginated site' do
+    paginator = OpenStruct.new(add_on: "&pg=", pages: [1])
+    json = '{
+      "base_url":"http://www.careers.stackoverflow.com",
+      "summary":{
+        "url":"/jobs/tag/ruby?sort=p",
+        "loop":".listResults .-item",
+        "fields":[
+          {
+            "field":"position",
+            "method":"find",
+            "path":"h3.-title a"
+          },
+          {
+            "field":"url",
+            "method":"find",
+            "path":"h3.-title a",
+            "attr":"href"
+          },
+          {
+            "field":"posting_date",
+            "method":"first",
+            "path":"p._muted"
+          }
+        ]
+      }
+    }'
+    site = JSON.parse(json)
+
+    scraper = SummaryScraper.new(site, paginator.add_on, paginator.pages)
+    results = scraper.call
+    expect(results.length).to eq 25
+    expect(results.first["position"]).to be_a String
+    expect(results.first["position"]).to_not be_empty
+    expect(results.first["url"]).to be_a String
+    expect(results.first["url"]).to match(/^http/)
+    expect(results.first["posting_date"]).to be_a String
+    expect(results.first["posting_date"]).to_not be_empty
+  end
+
+  it 'can pull records from multiple pages of paginated site' do
+    paginator = OpenStruct.new(add_on: "&pg=", pages: [1, 2])
+    json = '{
+      "base_url":"http://www.careers.stackoverflow.com",
+      "summary":{
+        "url":"/jobs/tag/ruby?sort=p",
+        "loop":".listResults .-item",
+        "fields":[
+          {
+            "field":"position",
+            "method":"find",
+            "path":"h3.-title a"
+          },
+          {
+            "field":"url",
+            "method":"find",
+            "path":"h3.-title a",
+            "attr":"href"
+          },
+          {
+            "field":"posting_date",
+            "method":"first",
+            "path":"p._muted"
+          }
+        ]
+      }
+    }'
+    site = JSON.parse(json)
+
+    scraper = SummaryScraper.new(site, paginator.add_on, paginator.pages)
+    results = scraper.call
+    expect(results.length).to be > 26
+  end
+
+  it 'can pull records from non-paginated site' do
+    paginator = OpenStruct.new(add_on: "", pages: [""])
+    json = '{
+      "base_url":"https://weworkremotely.com",
+      "summary":{
+        "url":"/categories/2/jobs",
+        "has_sub_pages":"false",
+        "loop":"section.jobs ul li",
+        "fields":[
+          {
+            "field":"position",
+            "method":"find",
+            "path":"span.title"
+          },
+          {
+            "field":"company",
+            "method":"find",
+            "path":"span.company"
+          },
+          {
+            "field":"url",
+            "method":"find",
+            "path":"a",
+            "attr":"href"
+          },
+          {
+            "field":"posting_date",
+            "method":"find",
+            "path":"span.date"
+          }
+        ]
+      }
+    }'
+    site = JSON.parse(json)
+
+    scraper = SummaryScraper.new(site, paginator.add_on, paginator.pages)
+    results = scraper.call
+    expect(results.length).to be > 1
+    expect(results.first["position"]).to be_a String
+    expect(results.first["position"]).to_not be_empty
+    expect(results.first["company"]).to be_a String
+    expect(results.first["company"]).to_not be_empty
+    expect(results.first["url"]).to be_a String
+    expect(results.first["url"]).to match(/^http/)
+    expect(results.first["posting_date"]).to be_a String
+    expect(results.first["posting_date"]).to_not be_empty
+  end
+end