rubyscraper 0.3.0 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,47 @@
+ require 'json'
+ require 'rubyscraper/paginator'
+ require 'rubyscraper/summary_scraper'
+ require 'rubyscraper/sub_page_scraper'
+
+ class Processor
+   attr_reader :sites, :record_limit, :single_site, :scrape_delay
+
+   def initialize(config_file, single_site, record_limit, scrape_delay)
+     @scrape_file = config_file
+     @scrape_config = JSON.parse(File.read(@scrape_file))
+     @sites = @scrape_config
+     @single_site = single_site
+     @record_limit = record_limit
+     @scrape_delay = scrape_delay
+   end
+
+   def call
+     !single_site.empty? ? scrape_single_site : scrape_all_sites
+   end
+
+   private
+
+   def scrape_single_site
+     site = sites.select { |s| s["name"] == single_site }.first
+     scrape_site(site)
+   end
+
+   def scrape_all_sites
+     sites.inject [] do |all_results, site|
+       all_results += scrape_site(site)
+     end
+   end
+
+   def scrape_site(site)
+     paginator = Paginator.new(site, record_limit)
+     paginator.define_pagination_params
+
+     results = SummaryScraper.new(site, paginator.add_on, paginator.steps).call
+     results = SubPageScraper.new(site, results, scrape_delay).call if has_sub_pages?(site)
+     results
+   end
+
+   def has_sub_pages?(site)
+     site["summary"]["has_sub_pages"] == "true"
+   end
+ end
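
The Processor added above is the orchestration layer introduced in this release: it reads a JSON scrape config, lets Paginator work out the pagination add-on and page steps, runs SummaryScraper across the listing pages, and runs SubPageScraper only when a site's config sets "has_sub_pages" to "true". A minimal usage sketch, assuming a hypothetical config path, hand-picked option values, and an assumed require path (in the gem these arguments presumably come from the rubyscraper executable):

    # Sketch only - the config path, require path, and option values are illustrative.
    require 'rubyscraper/processor'   # assumed load path; not shown in this diff

    processor = Processor.new("scrapes.json", "", 50, 1)   # empty site name => scrape all sites
    all_results = processor.call

    # A non-empty site name scrapes just that entry from the config:
    wwr_results = Processor.new("scrapes.json", "weworkremotely", 25, 1).call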
@@ -0,0 +1,53 @@
+ require 'capybara'
+ require 'capybara/poltergeist'
+
+ class SubPageScraper
+   attr_reader :site, :listings, :delay
+   include Capybara::DSL
+
+   def initialize(site, listings, delay)
+     @site = site
+     @listings = listings
+     @delay = delay
+
+     Capybara.register_driver :poltergeist do |app|
+       Capybara::Poltergeist::Driver.new(app, js_errors: false)
+     end
+     Capybara.default_driver = :poltergeist
+   end
+
+   def call
+     puts "Pulling #{@listings.count} listings from #{@site["name"]}:"
+     listings = @listings.inject [] do |results, listing|
+       sleep delay
+       listing = pull_sub_page_data(site, listing)
+       listing = listing_cleanup(listing)
+       results << listing
+     end; puts "\n"; listings
+   end
+
+   def pull_sub_page_data(site, listing)
+     visit listing["url"]
+     site["sub_page"]["fields"].each do |field|
+       if field["method"] == "all"
+         if has_css?(field["path"])
+           values = all(field["path"]).map do |elem|
+             elem.send(field["loop_collect"])
+           end
+           listing[field["field"]] = values.join(field["join"])
+         end
+       else
+         if has_css?(field["path"])
+           listing[field["field"]] =
+             send(field["method"].to_sym,field["path"]).text
+         end
+       end
+     end; print "."; listing
+   end
+
+   def listing_cleanup(listing)
+     # Remove 'Headquarters: ' from weworkremotely jobs
+     listing["location"].slice!("Headquarter: ") if !listing["location"].to_s.empty?
+     listing
+   end
+ end
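
SubPageScraper, like SummaryScraper below, is entirely config-driven: each field entry names a Capybara method ("find", "all", "first"), a CSS path, and optionally a "loop_collect"/"join" pair or an "attr". As a rough illustration, the "tags" field entry from the sub_page_scraper spec later in this diff expands to roughly the following Capybara calls (this expansion is mine, not code from the gem):

    # field = { "field" => "tags", "method" => "all", "path" => "div.tags a.post-tag",
    #           "loop_collect" => "text", "join" => ", " }
    # Inside pull_sub_page_data the dynamic send calls amount to:
    values = all("div.tags a.post-tag").map { |elem| elem.send("text") }
    listing["tags"] = values.join(", ")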
@@ -0,0 +1,65 @@
+ require 'capybara'
+ require 'capybara/poltergeist'
+
+ class SummaryScraper
+   attr_reader :site, :pagination_addon, :pagination_pages
+   include Capybara::DSL
+
+   def initialize(site, pagination_addon, pagination_pages)
+     @site = site
+     @pagination_addon = pagination_addon
+     @pagination_pages = pagination_pages
+
+     Capybara.register_driver :poltergeist do |app|
+       Capybara::Poltergeist::Driver.new(app, js_errors: false)
+     end
+     Capybara.default_driver = :poltergeist
+   end
+
+   def call
+     pagination_pages.inject [] do |results, page|
+       results += get_summaries(page)
+     end
+   end
+
+   private
+
+   def get_summaries(page_no)
+     visit page_url(page_no)
+
+     all(site["summary"]["loop"]).inject [] do |results, listing|
+       record = pull_summary_data(site, listing)
+       record = listing_cleanup(site, record)
+       results << record
+     end
+   end
+
+   def page_url(page)
+     site["base_url"] + site["summary"]["url"] + pagination_addon + page.to_s
+   end
+
+   def pull_summary_data(site, record)
+     output = Hash.new
+     site["summary"]["fields"].each do |field|
+       if field["attr"]
+         if record.has_css?(field["path"])
+           output[field["field"]] =
+             record.send(field["method"].to_sym, field["path"])[field["attr"]]
+         end
+       else
+         if record.has_css?(field["path"])
+           output[field["field"]] =
+             record.send(field["method"].to_sym, field["path"]).text
+         end
+       end
+     end; output
+   end
+
+   def listing_cleanup(site, listing)
+     # Add base url if not present
+     unless listing["url"].match(/^http/)
+       listing["url"] = "#{site["base_url"]}#{listing["url"]}"
+     end
+     listing
+   end
+ end
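
page_url above is a plain string concatenation of the site's base_url, the summary url, the pagination add-on, and the page value. With the Stack Overflow Careers config used in the specs further down, the pieces combine like this (the "&pg=" add-on and page 2 are illustrative values):

    # Values taken from the summary_scraper spec below.
    base_url = "http://www.careers.stackoverflow.com"
    summary  = "/jobs/tag/ruby?sort=p"
    add_on   = "&pg="
    base_url + summary + add_on + 2.to_s
    # => "http://www.careers.stackoverflow.com/jobs/tag/ruby?sort=p&pg=2"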
@@ -1,3 +1,3 @@
  class RubyScraper
-   VERSION = "0.3.0"
+   VERSION = "0.9.0"
  end
data/rubyscraper.gemspec CHANGED
@@ -13,13 +13,12 @@ Gem::Specification.new do |s|
    s.homepage = 'https://github.com/ndwhtlssthr/rubyscraper'
    s.executables << 'rubyscraper'

-   s.add_dependency "capybara"
-   s.add_dependency "poltergeist"
-   s.add_dependency "rest-client"
-   s.add_dependency "slop"
+   s.add_dependency "capybara", "~> 2.4"
+   s.add_dependency "poltergeist", "~> 1.6"
+   s.add_dependency "rest-client", "~> 1.8"

    s.add_development_dependency "bundler", "~> 1.9"
    s.add_development_dependency "rake", "~> 10.0"
-   s.add_development_dependency 'rspec', '~> 3.0'
-   s.add_development_dependency 'pry'
+   s.add_development_dependency "rspec", "~> 3.0"
+   s.add_development_dependency "pry", "~> 0.10"
  end
@@ -0,0 +1,83 @@
+ require 'spec_helper'
+
+ describe Paginator do
+   it 'returns defaults if not paginated' do
+     json = '{"summary":{
+       "paginated":"false"
+     }}'
+     site = JSON.parse(json)
+
+     paginator = Paginator.new(site, :all)
+     paginator.define_pagination_params
+     expect(paginator.add_on).to eq ""
+     expect(paginator.steps).to eq [""]
+   end
+
+   it 'returns the correct add_on with pagination' do
+     json = '{"summary":{
+       "paginated":"true",
+       "pagination":{
+         "format":"&pg=NUM",
+         "start":"1",
+         "scale":"1",
+         "records_per_page":"25"
+       }
+     }}'
+     site = JSON.parse(json)
+
+     paginator = Paginator.new(site, 50)
+     paginator.define_pagination_params
+     expect(paginator.add_on).to eq "&pg=NUM"
+   end
+
+   it 'returns the correct pages when given record limit' do
+     json = '{"summary":{
+       "paginated":"true",
+       "pagination":{
+         "format":"&pg=NUM",
+         "start":"1",
+         "scale":"1",
+         "records_per_page":"25"
+       }
+     }}'
+     site = JSON.parse(json)
+
+     paginator = Paginator.new(site, 50)
+     paginator.define_pagination_params
+     expect(paginator.steps).to eq [1, 2]
+   end
+
+   it 'adds an additional page if pages wrap to next page' do
+     json = '{"summary":{
+       "paginated":"true",
+       "pagination":{
+         "format":"&pg=NUM",
+         "start":"1",
+         "scale":"1",
+         "records_per_page":"25"
+       }
+     }}'
+     site = JSON.parse(json)
+
+     paginator = Paginator.new(site, 58)
+     paginator.define_pagination_params
+     expect(paginator.steps).to eq [1, 2, 3]
+   end
+
+   it 'can handle a starting of 0' do
+     json = '{"summary":{
+       "paginated":"true",
+       "pagination":{
+         "format":"&pg=NUM",
+         "start":"0",
+         "scale":"10",
+         "records_per_page":"10"
+       }
+     }}'
+     site = JSON.parse(json)
+
+     paginator = Paginator.new(site, 32)
+     paginator.define_pagination_params
+     expect(paginator.steps).to eq [0, 10, 20, 30]
+   end
+ end
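
Paginator's source is not part of this diff, but the specs above pin down its contract: add_on echoes the pagination "format", and steps expands the record limit into page values starting at "start" and advancing by "scale". A minimal sketch of the arithmetic those expectations imply (an illustration of the behaviour, not the gem's implementation):

    # Assumed reconstruction from the specs above; Paginator's real code is not shown here.
    def steps_for(start, scale, per_page, record_limit)
      pages = (record_limit.to_f / per_page).ceil   # 58 records / 25 per page => 3 pages
      pages.times.map { |i| start + i * scale }
    end

    steps_for(1, 1, 25, 50)   # => [1, 2]
    steps_for(1, 1, 25, 58)   # => [1, 2, 3]
    steps_for(0, 10, 10, 32)  # => [0, 10, 20, 30]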
@@ -1,11 +1,7 @@
  require 'spec_helper'

- describe Rubyscraper do
+ describe RubyScraper do
    it 'has a version number' do
-     expect(Rubyscraper::VERSION).not_to be nil
-   end
-
-   it 'does something useful' do
-     expect(false).to eq(true)
+     expect(RubyScraper::VERSION).not_to be nil
    end
  end
data/spec/spec_helper.rb CHANGED
@@ -1,2 +1,5 @@
  $LOAD_PATH.unshift File.expand_path('../../lib', __FILE__)
  require 'rubyscraper'
+ require 'rubyscraper/paginator'
+ require 'rubyscraper/summary_scraper'
+ require 'rubyscraper/sub_page_scraper'
@@ -0,0 +1,51 @@
+ require 'spec_helper'
+
+ describe SubPageScraper do
+   it 'can pull record subfields from a list of existing jobs' do
+     jobs = [{"url" => "http://careers.stackoverflow.com/jobs/84266/software-developer-qhr-technologies"},
+             {"url" => "http://careers.stackoverflow.com/jobs/81592/service-engineer-bloomberg-lp"}]
+     json = '{
+       "sub_page":{
+         "fields":[
+           {
+             "field":"company",
+             "method":"find",
+             "path":"a.employer"
+           },
+           {
+             "field":"location",
+             "method":"find",
+             "path":"span.location"
+           },
+           {
+             "field":"description",
+             "method":"all",
+             "path":"div.description p",
+             "loop_collect":"text",
+             "join":"\n"
+           },
+           {
+             "field":"tags",
+             "method":"all",
+             "path":"div.tags a.post-tag",
+             "loop_collect":"text",
+             "join":", "
+           }
+         ]
+       }
+     }'
+     site = JSON.parse(json)
+
+     scraper = SubPageScraper.new(site, jobs)
+     results = scraper.call
+     expect(results.length).to eq 2
+     expect(results.first["company"]).to be_a String
+     expect(results.first["company"]).to_not be_empty
+     expect(results.first["location"]).to be_a String
+     expect(results.first["location"]).to_not be_empty
+     expect(results.first["description"]).to be_a String
+     expect(results.first["description"]).to_not be_empty
+     expect(results.first["tags"]).to be_a String
+     expect(results.first["tags"]).to_not be_empty
+   end
+ end
@@ -0,0 +1,125 @@
+ require 'spec_helper'
+
+ describe SummaryScraper do
+   it 'can pull records from first page of paginated site' do
+     paginator = OpenStruct.new(add_on: "&pg=", pages: [1])
+     json = '{
+       "base_url":"http://www.careers.stackoverflow.com",
+       "summary":{
+         "url":"/jobs/tag/ruby?sort=p",
+         "loop":".listResults .-item",
+         "fields":[
+           {
+             "field":"position",
+             "method":"find",
+             "path":"h3.-title a"
+           },
+           {
+             "field":"url",
+             "method":"find",
+             "path":"h3.-title a",
+             "attr":"href"
+           },
+           {
+             "field":"posting_date",
+             "method":"first",
+             "path":"p._muted"
+           }
+         ]
+       }
+     }'
+     site = JSON.parse(json)
+
+     scraper = SummaryScraper.new(site, paginator.add_on, paginator.pages)
+     results = scraper.call
+     expect(results.length).to eq 25
+     expect(results.first["position"]).to be_a String
+     expect(results.first["position"]).to_not be_empty
+     expect(results.first["url"]).to be_a String
+     expect(results.first["url"]).to match(/^http/)
+     expect(results.first["posting_date"]).to be_a String
+     expect(results.first["posting_date"]).to_not be_empty
+   end
+
+   it 'can pull records from multiple pages of paginated site' do
+     paginator = OpenStruct.new(add_on: "&pg=", pages: [1, 2])
+     json = '{
+       "base_url":"http://www.careers.stackoverflow.com",
+       "summary":{
+         "url":"/jobs/tag/ruby?sort=p",
+         "loop":".listResults .-item",
+         "fields":[
+           {
+             "field":"position",
+             "method":"find",
+             "path":"h3.-title a"
+           },
+           {
+             "field":"url",
+             "method":"find",
+             "path":"h3.-title a",
+             "attr":"href"
+           },
+           {
+             "field":"posting_date",
+             "method":"first",
+             "path":"p._muted"
+           }
+         ]
+       }
+     }'
+     site = JSON.parse(json)
+
+     scraper = SummaryScraper.new(site, paginator.add_on, paginator.pages)
+     results = scraper.call
+     expect(results.length).to be > 26
+   end
+
+   it 'can pull records from non-paginated site' do
+     paginator = OpenStruct.new(add_on: "", pages: [""])
+     json = '{
+       "base_url":"https://weworkremotely.com",
+       "summary":{
+         "url":"/categories/2/jobs",
+         "has_sub_pages":"false",
+         "loop":"section.jobs ul li",
+         "fields":[
+           {
+             "field":"position",
+             "method":"find",
+             "path":"span.title"
+           },
+           {
+             "field":"company",
+             "method":"find",
+             "path":"span.company"
+           },
+           {
+             "field":"url",
+             "method":"find",
+             "path":"a",
+             "attr":"href"
+           },
+           {
+             "field":"posting_date",
+             "method":"find",
+             "path":"span.date"
+           }
+         ]
+       }
+     }'
+     site = JSON.parse(json)
+
+     scraper = SummaryScraper.new(site, paginator.add_on, paginator.pages)
+     results = scraper.call
+     expect(results.length).to be > 1
+     expect(results.first["position"]).to be_a String
+     expect(results.first["position"]).to_not be_empty
+     expect(results.first["company"]).to be_a String
+     expect(results.first["company"]).to_not be_empty
+     expect(results.first["url"]).to be_a String
+     expect(results.first["url"]).to match(/^http/)
+     expect(results.first["posting_date"]).to be_a String
+     expect(results.first["posting_date"]).to_not be_empty
+   end
+ end