rubyscraper 0.3.0 → 0.9.0
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -6
- data/README.md +37 -7
- data/lib/rubyscraper.rb +14 -148
- data/lib/rubyscraper/api_dispatcher.rb +31 -0
- data/lib/rubyscraper/binary.rb +9 -6
- data/lib/rubyscraper/option_parser.rb +72 -0
- data/lib/rubyscraper/paginator.rb +59 -0
- data/lib/rubyscraper/processor.rb +47 -0
- data/lib/rubyscraper/sub_page_scraper.rb +53 -0
- data/lib/rubyscraper/summary_scraper.rb +65 -0
- data/lib/rubyscraper/version.rb +1 -1
- data/rubyscraper.gemspec +5 -6
- data/spec/paginator_spec.rb +83 -0
- data/spec/rubyscraper_spec.rb +2 -6
- data/spec/spec_helper.rb +3 -0
- data/spec/sub_page_scraper_spec.rb +51 -0
- data/spec/summary_scraper_spec.rb +125 -0
- metadata +27 -33
- data/lib/assets/scrapes.json +0 -287
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d243d8f44eb571d03e741459af753e87c10fc254
+  data.tar.gz: a49de3dfd374cd2bfcc6fa2d7e2d0f735a4ce567
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 705135c2b75b796ce8aa7db591d435da5a1643c6a7934241529deca24959249720a00b1b980e335fc29b7590991f275d41c32b275302170316ec0ab121459132
+  data.tar.gz: 23313e00a3eb5d907c27881a1e4c3c0640ba6afc25189a86e53494f2c545bd1f7968b46ef130bbcf93f38aca88c6de2ab345c82716c7d8cac039060657688404
data/Gemfile.lock
CHANGED
@@ -1,11 +1,10 @@
 PATH
   remote: .
   specs:
-    rubyscraper (0.
-      capybara
-      poltergeist
-      rest-client
-      slop
+    rubyscraper (0.1.0)
+      capybara (~> 2.4)
+      poltergeist (~> 1.6)
+      rest-client (~> 1.8)
 
 GEM
   remote: https://rubygems.org/
@@ -75,7 +74,7 @@ PLATFORMS
 
 DEPENDENCIES
   bundler (~> 1.9)
-  pry
+  pry (~> 0.10)
   rake (~> 10.0)
   rspec (~> 3.0)
   rubyscraper!
data/README.md
CHANGED
@@ -1,11 +1,24 @@
 # RubyScraper
 
-
-
-TODO: Delete this and the text above, and describe your gem
+RubyScraper is a gem built
 
 ## Installation
+### Dependency
+RubyScraper relies on PhantomJS as its headless web browser. Install this before installing the gem with:
+
+```
+brew install phantomjs
+```
+
+### CLI
+Install RubyScraper by running:
+
+```
+gem install rubyscraper
+```
 
+### Gemfile
+*Work in Progress*
 Add this line to your application's Gemfile:
 
 ```ruby
@@ -22,18 +35,35 @@ Or install it yourself as:
 
 ## Usage
 
-
+```
+Usage: RubyScraper [options]
+
+Specific options:
+
+REQUIRED:
+    -f, --file FILENAME.JSON         Specify the file_name of your RubyScraper config file
 
-
+REQUIRED (if using as service to send results as post requests):
+    -e, --endpoint URL               Enter the api endpoint URL here
+                                     (If using scraper as a service to send post requests to server)
 
-
+OPTIONAL:
+    -r, --record-limit N             Pull N records per site
+                                     (approximate because if there are 25 records per
+                                     page, and 51 is provided, it will go to 3 pages)
+    -d, --delay N                    Delay N seconds before executing
+    -s, --site SITENAME              Scrape a single SITENAME from the config file
 
-
+Common options:
+    -h, --help                       Show this message
+        --version                    Show version
+```
 
 ## Contributing
 
 1. Fork it ( https://github.com/[my-github-username]/rubyscraper/fork )
 2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Write your tests and don't break anything :) *run tests with `rspec`*
 3. Commit your changes (`git commit -am 'Add some feature'`)
 4. Push to the branch (`git push origin my-new-feature`)
 5. Create a new Pull Request
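With PhantomJS installed, a typical invocation of the documented flags might look like the following; the config file name, endpoint, and site name here are hypothetical:

```
rubyscraper -f scrapes.json -e http://localhost:3000/jobs -r 50 -s example_site
```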
data/lib/rubyscraper.rb
CHANGED
@@ -1,153 +1,19 @@
-require 'capybara'
-require 'capybara/poltergeist'
-require 'rest-client'
 require 'rubyscraper/version'
+require 'rubyscraper/processor'
+require 'rubyscraper/api_dispatcher'
 
 class RubyScraper
-
-
-
-
-
-
-
-
-
-
-
-
-    @pages = pages
-    @endpoint = endpoint
-    @scrape_file = File.expand_path('../assets/scrapes.json', __FILE__)
-    @scrape_config = JSON.parse(File.read(@scrape_file))
-  end
-
-  def scrape(single_site=nil)
-    if single_site
-      search_site = scrape_config.select { |site| site["name"] == single_site }
-      if search_site
-        get_data(search_site.first)
-      else
-        raise "Invalid single site name #{single_site}. Not in scrape file."
-      end
-    else
-      scrape_config.each do |site|
-        unless site["skip"] == "true"
-          get_data(site)
-        end
-      end
-    end
-    return scraped_jobs, posted_jobs
-  end
-
-  def get_data(site)
-    get_summaries(site)
-    get_bodies(site)
-    send_to_server
-  end
-
-  def get_summaries(site)
-    if site["summary"]["params"].length > 0 && !site["summary"]["no_pagination?"]
-      site["summary"]["params"][0]["SEARCHTERM"].each do |term|
-        summary_url = "#{site["base_url"]}#{site["summary"]["url"].sub("SEARCHTERM", term)}"
-        pagination_start = site["summary"]["pagination_start"].to_i
-        pagination_end = pagination_start + pages - 1
-        (pagination_start..pagination_end).to_a.each do |page|
-          visit "#{summary_url}#{site["summary"]["pagination_fmt"]}#{page * site["summary"]["pagination_scale"].to_i}"
-          all(site["summary"]["loop"]).each do |listing|
-            job = pull_summary_data(site, listing)
-            job = modify_data(site, job)
-            jobs << job
-          end
-          puts "Pulled #{site["name"]}: #{term} (page: #{page}) job summaries."
-        end
-      end
-    else
-      summary_url = "#{site["base_url"]}#{site["summary"]["url"]}"
-      visit summary_url
-      all(site["summary"]["loop"]).each do |listing|
-        job = pull_summary_data(site, listing)
-        job = modify_data(site, job)
-        jobs << job
-      end
-      puts "Pulled #{site["name"]} job summaries."
-    end
-  end
-
-  def pull_summary_data(site, listing)
-    job = Hash.new
-    site["summary"]["fields"].each do |field|
-      if field["attr"]
-        if listing.has_css?(field["path"])
-          job[field["field"]] =
-            listing.send(field["method"].to_sym, field["path"])[field["attr"]]
-        end
-      else
-        if listing.has_css?(field["path"])
-          job[field["field"]] =
-            listing.send(field["method"].to_sym, field["path"]).text
-        end
-      end
-    end; job
-  end
-
-  def modify_data(site, job)
-    job["url"] = "#{site["base_url"]}#{job["url"]}" unless job["url"].match(/^http/)
-    job
-  end
-
-  def get_bodies(site)
-    jobs.each_with_index do |job, i|
-      sleep 1
-      pull_job_data(site, job)
-      puts "Job #{i+1} pulled."
-    end
-  end
-
-  def pull_job_data(site, job)
-    visit job["url"]
-    site["sub_page"]["fields"].each do |field|
-      if field["method"] == "all"
-        if has_css?(field["path"])
-          values = all(field["path"]).map do |elem|
-            elem.send(field["loop_collect"])
-          end
-          job[field["field"]] = values.join(field["join"])
-        end
-      else
-        if has_css?(field["path"])
-          job[field["field"]] =
-            send(field["method"].to_sym,field["path"]).text
-        end
-      end
-    end
-  end
-
-  def send_to_server
-    @scraped_jobs += jobs.length
-    jobs.each do |job|
-      tags = job["tags"] || ""
-      new_job = {
-        position: job["position"],
-        location: job["location"],
-        description: job["description"],
-        source: job["url"],
-        company: job["company"],
-        tags: tags.split(", ")
-      }
-
-      RestClient.post(endpoint, job: new_job){ |response, request, result, &block|
-        case response.code
-        when 201
-          @posted_jobs += 1
-          puts "Job saved."
-        when 302
-          puts "Job already exists."
-        else
-          puts "Bad request."
-        end
-      }
-    end
-    @jobs = []
+  def self.call(opts)
+    record_limit = opts.record_limit
+    config_file = File.expand_path(opts.config_file, Dir.pwd)
+    single_site = opts.single_site
+    scrape_delay = opts.scrape_delay
+    endpoint = opts.endpoint
+
+    processor = Processor.new(config_file, single_site, record_limit, scrape_delay)
+    results = processor.call
+    num_saved = ApiDispatcher.post(results, endpoint)
+
+    return results.count, num_saved
  end
 end
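The rewritten entry point reduces to option unpacking plus two collaborators. A minimal sketch of driving it directly, without the CLI; the config file name and endpoint are hypothetical, and any object responding to these five readers would do:

```ruby
require 'ostruct'
require 'rubyscraper'

# Hypothetical values; RubyScraper.call only reads record_limit,
# config_file, single_site, scrape_delay, and endpoint.
opts = OpenStruct.new(
  config_file:  "scrapes.json",                # resolved against Dir.pwd
  endpoint:     "http://localhost:3000/jobs",  # hypothetical API endpoint
  record_limit: 50,
  single_site:  "",
  scrape_delay: 1
)

scraped, saved = RubyScraper.call(opts)
puts "Scraped #{scraped}, saved #{saved}"
```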
data/lib/rubyscraper/api_dispatcher.rb
ADDED
@@ -0,0 +1,31 @@
+require 'rest-client'
+
+class ApiDispatcher
+  def self.post(results, endpoint)
+    results.inject 0 do |posted, listing|
+      tags = listing["tags"].split(", ") if listing["tags"]
+      new_listing = {
+        position: listing["position"],
+        location: listing["location"],
+        company: listing["company"],
+        description: listing["description"],
+        source: listing["url"],
+        tags: tags
+      }
+
+      RestClient.post(endpoint, job: new_listing){ |response, request, result, &block|
+        case response.code
+        when 201
+          puts "Job saved."
+          posted += 1
+        when 302
+          puts "Job already exists."
+          posted
+        else
+          puts "Bad request."
+          posted
+        end
+      }
+    end
+  end
+end
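ApiDispatcher threads a running count through `inject`: RestClient's block form returns the block's value, which becomes the next accumulator, so only the 201 branch increments it. A usage sketch with a hypothetical endpoint and one hand-built listing shaped like the scraper's output:

```ruby
results = [{
  "position"    => "Ruby Developer",
  "location"    => "Remote",
  "company"     => "Example Co",          # hypothetical record
  "description" => "Build scrapers.",
  "url"         => "http://example.com/jobs/1",
  "tags"        => "ruby, capybara"       # comma-separated; split before posting
}]

# Each 201 response adds one to the total returned here.
saved = ApiDispatcher.post(results, "http://localhost:3000/jobs")
```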
data/lib/rubyscraper/binary.rb
CHANGED
@@ -1,16 +1,19 @@
 require 'rubyscraper'
+require 'rubyscraper/option_parser'
 
 class RubyScraper
   class Binary
     def self.call(argv, outstream, errstream)
-      outstream.puts "
+      outstream.puts "RubyScraper"
       outstream.puts "---------------------------------------------"
       outstream.puts "Started scraping..."
-
-
-
-
-
+      outstream.puts "---------------------------------------------"
+
+      options = OptparseExample.parse(argv)
+      records_scraped, records_saved = RubyScraper.call(options)
+
+      outstream.puts "---------------------------------------------"
+      outstream.puts "Scraped #{records_scraped} records, succesfully posted #{records_saved} records."
       outstream.puts "---------------------------------------------"
       outstream.puts "Completed!"
     end
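Because Binary.call takes argv and both output streams explicitly, specs can pass StringIO doubles instead of real streams. The gem's bin script is not shown in this diff, but a plausible stub would simply forward the process globals:

```ruby
#!/usr/bin/env ruby
require 'rubyscraper/binary'

# Forward the real process streams; tests can substitute StringIO.
RubyScraper::Binary.call(ARGV, $stdout, $stderr)
```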
data/lib/rubyscraper/option_parser.rb
ADDED
@@ -0,0 +1,72 @@
+require 'rubyscraper/version'
+require 'optparse'
+require 'ostruct'
+
+class OptparseExample
+  def self.parse(args)
+    options = OpenStruct.new
+    options.config_file = ""
+    options.endpoint = ""
+    options.record_limit = 50
+    options.single_site = ""
+    options.scrape_delay = 1
+
+    opt_parser = OptionParser.new do |opts|
+      opts.banner = "Usage: RubyScraper [options]"
+
+      opts.separator ""
+      opts.separator "Specific options:"
+      opts.separator ""
+
+      opts.separator "REQUIRED:"
+      # Mandatory argument
+      opts.on("-f", "--file FILENAME.JSON",
+              "Specify the file_name of your RubyScraper config file") do |file|
+        options.config_file = file
+      end
+
+      opts.separator ""
+      opts.separator "REQUIRED (if using as service to send results as post requests):"
+      # Mandatory argument if sending results to POST endpoint
+      opts.on("-e", "--endpoint URL",
+              "Enter the api endpoint URL here",
+              " (If using scraper as a service to send post requests to server)",) do |url|
+        options.endpoint = url
+      end
+
+      opts.separator ""
+      opts.separator "OPTIONAL:"
+
+      opts.on("-rl", "--record-limit N", Integer,
+              "Pull N records per site",
+              " (approximate because if there are 25 records per",
+              " page, and 51 is provided, it will go to 3 pages)") do |limit|
+        options.record_limit = limit
+      end
+
+      opts.on("-d", "--delay N", Float, "Delay N seconds before executing") do |n|
+        options.delay = n
+      end
+
+      opts.on("-s", "--site SITENAME", "Scrape a single SITENAME from the config file") do |site|
+        options.single_site = site
+      end
+
+      opts.separator ""
+      opts.separator "Common options:"
+
+      opts.on_tail("-h", "--help", "Show this message") do
+        puts opts
+        exit
+      end
+
+      opts.on_tail("--version", "Show version") do
+        puts RubyScraper::VERSION
+        exit
+      end
+    end
+
+    opt_parser.parse!(args)
+    options
+  end
+end
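OptparseExample.parse consumes recognized flags from the given array (`parse!` mutates it) and returns the OpenStruct with unset options left at their defaults. A quick sketch with hypothetical argument values:

```ruby
options = OptparseExample.parse(%w[-f scrapes.json -s example_site])

options.config_file   # => "scrapes.json"
options.single_site   # => "example_site"
options.record_limit  # => 50 (default)
options.endpoint      # => ""  (default; needed only when posting results)
```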
data/lib/rubyscraper/paginator.rb
ADDED
@@ -0,0 +1,59 @@
+class Paginator
+  attr_reader :site, :record_limit, :pagination
+
+  def initialize(site, record_limit)
+    @site = site
+    @pagination = site["summary"]["pagination"]
+    @record_limit = record_limit
+  end
+
+  def define_pagination_params
+    if paginated_site?
+      @steps = url_page_addons
+      @add_on = pagination["format"]
+    else
+      @steps = [""]
+      @add_on = ""
+    end
+  end
+
+  def add_on
+    @add_on
+  end
+
+  def steps
+    @steps
+  end
+
+  private
+
+  def url_page_addons
+    output = []
+    num_pages.times do |i|
+      output << pagination_start + pagination_scale * i
+    end
+    output
+  end
+
+  def num_pages
+    output = record_limit / records_per_page
+    output += 1 if record_limit % records_per_page != 0
+    output
+  end
+
+  def records_per_page
+    pagination["records_per_page"].to_i
+  end
+
+  def pagination_start
+    pagination["start"].to_i
+  end
+
+  def pagination_scale
+    pagination["scale"].to_i
+  end
+
+  def paginated_site?
+    site["summary"]["paginated"] == "true"
+  end
+end
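The page math rounds up: num_pages is the integer quotient of record_limit by records_per_page, plus one when there is a remainder, and each step is start + scale * i. A sketch with hypothetical config values, using the same key names Paginator reads:

```ruby
# Hypothetical site config; key names match what Paginator reads.
site = {
  "summary" => {
    "paginated"  => "true",
    "pagination" => {
      "records_per_page" => "25",
      "start"            => "0",
      "scale"            => "25",
      "format"           => "&start="
    }
  }
}

paginator = Paginator.new(site, 51)   # 51-record limit
paginator.define_pagination_params
paginator.steps   # => [0, 25, 50]  (three pages, matching the README example)
paginator.add_on  # => "&start="
```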