rubyscraper 0.3.0 → 0.9.0

@@ -0,0 +1,47 @@
+ require 'json'
+ require 'rubyscraper/paginator'
+ require 'rubyscraper/summary_scraper'
+ require 'rubyscraper/sub_page_scraper'
+
+ class Processor
+   attr_reader :sites, :record_limit, :single_site, :scrape_delay
+
+   def initialize(config_file, single_site, record_limit, scrape_delay)
+     @scrape_file = config_file
+     @scrape_config = JSON.parse(File.read(@scrape_file))
+     @sites = @scrape_config
+     @single_site = single_site
+     @record_limit = record_limit
+     @scrape_delay = scrape_delay
+   end
+
+   def call
+     !single_site.empty? ? scrape_single_site : scrape_all_sites
+   end
+
+   private
+
+   def scrape_single_site
+     site = sites.select { |s| s["name"] == single_site }.first
+     scrape_site(site)
+   end
+
+   def scrape_all_sites
+     sites.inject [] do |all_results, site|
+       all_results += scrape_site(site)
+     end
+   end
+
+   def scrape_site(site)
+     paginator = Paginator.new(site, record_limit)
+     paginator.define_pagination_params
+
+     results = SummaryScraper.new(site, paginator.add_on, paginator.steps).call
+     results = SubPageScraper.new(site, results, scrape_delay).call if has_sub_pages?(site)
+     results
+   end
+
+   def has_sub_pages?(site)
+     site["summary"]["has_sub_pages"] == "true"
+   end
+ end
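
The Processor class added above is the new orchestration layer: it parses the JSON site config, optionally narrows the run to a single named site, and for each site chains Paginator, SummaryScraper, and (when the config flags sub pages) SubPageScraper. A rough usage sketch follows; the config file name, the option values, and the assumption that the gem's `rubyscraper` executable wires it up this way are illustrative, not part of this diff.

require 'rubyscraper/processor'

# Hypothetical invocation; in the gem these arguments presumably come from
# the CLI's option parsing rather than being hard-coded.
processor = Processor.new(
  "scrapes.json",  # assumed path to the JSON file describing each site
  "",              # empty string => scrape_all_sites; a site name => scrape_single_site
  50,              # record limit handed to Paginator
  1                # seconds slept between sub-page requests
)
jobs = processor.call  # array of listing hashes from every configured site
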
@@ -0,0 +1,53 @@
+ require 'capybara'
+ require 'capybara/poltergeist'
+
+ class SubPageScraper
+   attr_reader :site, :listings, :delay
+   include Capybara::DSL
+
+   def initialize(site, listings, delay)
+     @site = site
+     @listings = listings
+     @delay = delay
+
+     Capybara.register_driver :poltergeist do |app|
+       Capybara::Poltergeist::Driver.new(app, js_errors: false)
+     end
+     Capybara.default_driver = :poltergeist
+   end
+
+   def call
+     puts "Pulling #{@listings.count} listings from #{@site["name"]}:"
+     listings = @listings.inject [] do |results, listing|
+       sleep delay
+       listing = pull_sub_page_data(site, listing)
+       listing = listing_cleanup(listing)
+       results << listing
+     end; puts "\n"; listings
+   end
+
+   def pull_sub_page_data(site, listing)
+     visit listing["url"]
+     site["sub_page"]["fields"].each do |field|
+       if field["method"] == "all"
+         if has_css?(field["path"])
+           values = all(field["path"]).map do |elem|
+             elem.send(field["loop_collect"])
+           end
+           listing[field["field"]] = values.join(field["join"])
+         end
+       else
+         if has_css?(field["path"])
+           listing[field["field"]] =
+             send(field["method"].to_sym, field["path"]).text
+         end
+       end
+     end; print "."; listing
+   end
+
+   def listing_cleanup(listing)
+     # Remove 'Headquarters: ' from weworkremotely jobs
+     listing["location"].slice!("Headquarter: ") if !listing["location"].to_s.empty?
+     listing
+   end
+ end
@@ -0,0 +1,65 @@
+ require 'capybara'
+ require 'capybara/poltergeist'
+
+ class SummaryScraper
+   attr_reader :site, :pagination_addon, :pagination_pages
+   include Capybara::DSL
+
+   def initialize(site, pagination_addon, pagination_pages)
+     @site = site
+     @pagination_addon = pagination_addon
+     @pagination_pages = pagination_pages
+
+     Capybara.register_driver :poltergeist do |app|
+       Capybara::Poltergeist::Driver.new(app, js_errors: false)
+     end
+     Capybara.default_driver = :poltergeist
+   end
+
+   def call
+     pagination_pages.inject [] do |results, page|
+       results += get_summaries(page)
+     end
+   end
+
+   private
+
+   def get_summaries(page_no)
+     visit page_url(page_no)
+
+     all(site["summary"]["loop"]).inject [] do |results, listing|
+       record = pull_summary_data(site, listing)
+       record = listing_cleanup(site, record)
+       results << record
+     end
+   end
+
+   def page_url(page)
+     site["base_url"] + site["summary"]["url"] + pagination_addon + page.to_s
+   end
+
+   def pull_summary_data(site, record)
+     output = Hash.new
+     site["summary"]["fields"].each do |field|
+       if field["attr"]
+         if record.has_css?(field["path"])
+           output[field["field"]] =
+             record.send(field["method"].to_sym, field["path"])[field["attr"]]
+         end
+       else
+         if record.has_css?(field["path"])
+           output[field["field"]] =
+             record.send(field["method"].to_sym, field["path"]).text
+         end
+       end
+     end; output
+   end
+
+   def listing_cleanup(site, listing)
+     # Add base url if not present
+     unless listing["url"].match(/^http/)
+       listing["url"] = "#{site["base_url"]}#{listing["url"]}"
+     end
+     listing
+   end
+ end
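
SummaryScraper and SubPageScraper both read the same per-site configuration hash: a base_url, a summary block (url, optional pagination, loop selector, fields), and an optional sub_page block of fields. The entry below is illustrative only; the site name, URLs, and CSS selectors are made-up placeholders pieced together from the keys the code above and the specs below reference.

require 'json'

# Illustrative site entry (placeholder values, not the gem's shipped config).
site = JSON.parse('{
  "name":"examplejobs",
  "base_url":"https://jobs.example.com",
  "summary":{
    "url":"/ruby?sort=recent",
    "paginated":"true",
    "pagination":{"format":"&pg=","start":"1","scale":"1","records_per_page":"25"},
    "has_sub_pages":"true",
    "loop":"ul.listings li",
    "fields":[
      {"field":"position","method":"find","path":"h3.title a"},
      {"field":"url","method":"find","path":"h3.title a","attr":"href"}
    ]
  },
  "sub_page":{
    "fields":[
      {"field":"description","method":"all","path":"div.description p",
       "loop_collect":"text","join":"\n"}
    ]
  }
}')
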
@@ -1,3 +1,3 @@
  class RubyScraper
-   VERSION = "0.3.0"
+   VERSION = "0.9.0"
  end
data/rubyscraper.gemspec CHANGED
@@ -13,13 +13,12 @@ Gem::Specification.new do |s|
    s.homepage = 'https://github.com/ndwhtlssthr/rubyscraper'
    s.executables << 'rubyscraper'
 
-   s.add_dependency "capybara"
-   s.add_dependency "poltergeist"
-   s.add_dependency "rest-client"
-   s.add_dependency "slop"
+   s.add_dependency "capybara", "~> 2.4"
+   s.add_dependency "poltergeist", "~> 1.6"
+   s.add_dependency "rest-client", "~> 1.8"
 
    s.add_development_dependency "bundler", "~> 1.9"
    s.add_development_dependency "rake", "~> 10.0"
-   s.add_development_dependency 'rspec', '~> 3.0'
-   s.add_development_dependency 'pry'
+   s.add_development_dependency "rspec", "~> 3.0"
+   s.add_development_dependency "pry", "~> 0.10"
  end
@@ -0,0 +1,83 @@
+ require 'spec_helper'
+
+ describe Paginator do
+   it 'returns defaults if not paginated' do
+     json = '{"summary":{
+       "paginated":"false"
+     }}'
+     site = JSON.parse(json)
+
+     paginator = Paginator.new(site, :all)
+     paginator.define_pagination_params
+     expect(paginator.add_on).to eq ""
+     expect(paginator.steps).to eq [""]
+   end
+
+   it 'returns the correct add_on with pagination' do
+     json = '{"summary":{
+       "paginated":"true",
+       "pagination":{
+         "format":"&pg=NUM",
+         "start":"1",
+         "scale":"1",
+         "records_per_page":"25"
+       }
+     }}'
+     site = JSON.parse(json)
+
+     paginator = Paginator.new(site, 50)
+     paginator.define_pagination_params
+     expect(paginator.add_on).to eq "&pg=NUM"
+   end
+
+   it 'returns the correct pages when given record limit' do
+     json = '{"summary":{
+       "paginated":"true",
+       "pagination":{
+         "format":"&pg=NUM",
+         "start":"1",
+         "scale":"1",
+         "records_per_page":"25"
+       }
+     }}'
+     site = JSON.parse(json)
+
+     paginator = Paginator.new(site, 50)
+     paginator.define_pagination_params
+     expect(paginator.steps).to eq [1, 2]
+   end
+
+   it 'adds an additional page if pages wrap to next page' do
+     json = '{"summary":{
+       "paginated":"true",
+       "pagination":{
+         "format":"&pg=NUM",
+         "start":"1",
+         "scale":"1",
+         "records_per_page":"25"
+       }
+     }}'
+     site = JSON.parse(json)
+
+     paginator = Paginator.new(site, 58)
+     paginator.define_pagination_params
+     expect(paginator.steps).to eq [1, 2, 3]
+   end
+
+   it 'can handle a starting of 0' do
+     json = '{"summary":{
+       "paginated":"true",
+       "pagination":{
+         "format":"&pg=NUM",
+         "start":"0",
+         "scale":"10",
+         "records_per_page":"10"
+       }
+     }}'
+     site = JSON.parse(json)
+
+     paginator = Paginator.new(site, 32)
+     paginator.define_pagination_params
+     expect(paginator.steps).to eq [0, 10, 20, 30]
+   end
+ end
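
The Paginator class exercised by these examples is not included in this diff. For orientation only, here is a minimal sketch consistent with the expectations above; it is an assumption about the interface, not the gem's actual source, and it does not handle a record limit of :all on a paginated site.

class Paginator
  attr_reader :add_on, :steps

  def initialize(site, record_limit)
    @summary      = site["summary"]
    @record_limit = record_limit
    @add_on       = ""    # defaults used when the site is not paginated
    @steps        = [""]
  end

  def define_pagination_params
    return unless @summary["paginated"] == "true"
    config   = @summary["pagination"]
    start    = config["start"].to_i
    scale    = config["scale"].to_i
    per_page = config["records_per_page"].to_i

    @add_on = config["format"]
    pages   = (@record_limit.to_f / per_page).ceil   # round partial pages up
    @steps  = Array.new(pages) { |i| start + i * scale }
  end
end
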
@@ -1,11 +1,7 @@
  require 'spec_helper'
 
- describe Rubyscraper do
+ describe RubyScraper do
    it 'has a version number' do
-     expect(Rubyscraper::VERSION).not_to be nil
-   end
-
-   it 'does something useful' do
-     expect(false).to eq(true)
+     expect(RubyScraper::VERSION).not_to be nil
    end
  end
data/spec/spec_helper.rb CHANGED
@@ -1,2 +1,5 @@
  $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
  require 'rubyscraper'
+ require 'rubyscraper/paginator'
+ require 'rubyscraper/summary_scraper'
+ require 'rubyscraper/sub_page_scraper'
@@ -0,0 +1,51 @@
+ require 'spec_helper'
+
+ describe SubPageScraper do
+   it 'can pull record subfields from a list of existing jobs' do
+     jobs = [{"url" => "http://careers.stackoverflow.com/jobs/84266/software-developer-qhr-technologies"},
+             {"url" => "http://careers.stackoverflow.com/jobs/81592/service-engineer-bloomberg-lp"}]
+     json = '{
+       "sub_page":{
+         "fields":[
+           {
+             "field":"company",
+             "method":"find",
+             "path":"a.employer"
+           },
+           {
+             "field":"location",
+             "method":"find",
+             "path":"span.location"
+           },
+           {
+             "field":"description",
+             "method":"all",
+             "path":"div.description p",
+             "loop_collect":"text",
+             "join":"\n"
+           },
+           {
+             "field":"tags",
+             "method":"all",
+             "path":"div.tags a.post-tag",
+             "loop_collect":"text",
+             "join":", "
+           }
+         ]
+       }
+     }'
+     site = JSON.parse(json)
+
+     scraper = SubPageScraper.new(site, jobs)
+     results = scraper.call
+     expect(results.length).to eq 2
+     expect(results.first["company"]).to be_a String
+     expect(results.first["company"]).to_not be_empty
+     expect(results.first["location"]).to be_a String
+     expect(results.first["location"]).to_not be_empty
+     expect(results.first["description"]).to be_a String
+     expect(results.first["description"]).to_not be_empty
+     expect(results.first["tags"]).to be_a String
+     expect(results.first["tags"]).to_not be_empty
+   end
+ end
@@ -0,0 +1,125 @@
+ require 'spec_helper'
+
+ describe SummaryScraper do
+   it 'can pull records from first page of paginated site' do
+     paginator = OpenStruct.new(add_on: "&pg=", pages: [1])
+     json = '{
+       "base_url":"http://www.careers.stackoverflow.com",
+       "summary":{
+         "url":"/jobs/tag/ruby?sort=p",
+         "loop":".listResults .-item",
+         "fields":[
+           {
+             "field":"position",
+             "method":"find",
+             "path":"h3.-title a"
+           },
+           {
+             "field":"url",
+             "method":"find",
+             "path":"h3.-title a",
+             "attr":"href"
+           },
+           {
+             "field":"posting_date",
+             "method":"first",
+             "path":"p._muted"
+           }
+         ]
+       }
+     }'
+     site = JSON.parse(json)
+
+     scraper = SummaryScraper.new(site, paginator.add_on, paginator.pages)
+     results = scraper.call
+     expect(results.length).to eq 25
+     expect(results.first["position"]).to be_a String
+     expect(results.first["position"]).to_not be_empty
+     expect(results.first["url"]).to be_a String
+     expect(results.first["url"]).to match(/^http/)
+     expect(results.first["posting_date"]).to be_a String
+     expect(results.first["posting_date"]).to_not be_empty
+   end
+
+   it 'can pull records from multiple pages of paginated site' do
+     paginator = OpenStruct.new(add_on: "&pg=", pages: [1, 2])
+     json = '{
+       "base_url":"http://www.careers.stackoverflow.com",
+       "summary":{
+         "url":"/jobs/tag/ruby?sort=p",
+         "loop":".listResults .-item",
+         "fields":[
+           {
+             "field":"position",
+             "method":"find",
+             "path":"h3.-title a"
+           },
+           {
+             "field":"url",
+             "method":"find",
+             "path":"h3.-title a",
+             "attr":"href"
+           },
+           {
+             "field":"posting_date",
+             "method":"first",
+             "path":"p._muted"
+           }
+         ]
+       }
+     }'
+     site = JSON.parse(json)
+
+     scraper = SummaryScraper.new(site, paginator.add_on, paginator.pages)
+     results = scraper.call
+     expect(results.length).to be > 26
+   end
+
+   it 'can pull records from non-paginated site' do
+     paginator = OpenStruct.new(add_on: "", pages: [""])
+     json = '{
+       "base_url":"https://weworkremotely.com",
+       "summary":{
+         "url":"/categories/2/jobs",
+         "has_sub_pages":"false",
+         "loop":"section.jobs ul li",
+         "fields":[
+           {
+             "field":"position",
+             "method":"find",
+             "path":"span.title"
+           },
+           {
+             "field":"company",
+             "method":"find",
+             "path":"span.company"
+           },
+           {
+             "field":"url",
+             "method":"find",
+             "path":"a",
+             "attr":"href"
+           },
+           {
+             "field":"posting_date",
+             "method":"find",
+             "path":"span.date"
+           }
+         ]
+       }
+     }'
+     site = JSON.parse(json)
+
+     scraper = SummaryScraper.new(site, paginator.add_on, paginator.pages)
+     results = scraper.call
+     expect(results.length).to be > 1
+     expect(results.first["position"]).to be_a String
+     expect(results.first["position"]).to_not be_empty
+     expect(results.first["company"]).to be_a String
+     expect(results.first["company"]).to_not be_empty
+     expect(results.first["url"]).to be_a String
+     expect(results.first["url"]).to match(/^http/)
+     expect(results.first["posting_date"]).to be_a String
+     expect(results.first["posting_date"]).to_not be_empty
+   end
+ end