rubyscraper 0.3.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +5 -6
- data/README.md +37 -7
- data/lib/rubyscraper.rb +14 -148
- data/lib/rubyscraper/api_dispatcher.rb +31 -0
- data/lib/rubyscraper/binary.rb +9 -6
- data/lib/rubyscraper/option_parser.rb +72 -0
- data/lib/rubyscraper/paginator.rb +59 -0
- data/lib/rubyscraper/processor.rb +47 -0
- data/lib/rubyscraper/sub_page_scraper.rb +53 -0
- data/lib/rubyscraper/summary_scraper.rb +65 -0
- data/lib/rubyscraper/version.rb +1 -1
- data/rubyscraper.gemspec +5 -6
- data/spec/paginator_spec.rb +83 -0
- data/spec/rubyscraper_spec.rb +2 -6
- data/spec/spec_helper.rb +3 -0
- data/spec/sub_page_scraper_spec.rb +51 -0
- data/spec/summary_scraper_spec.rb +125 -0
- metadata +27 -33
- data/lib/assets/scrapes.json +0 -287
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: d243d8f44eb571d03e741459af753e87c10fc254
+  data.tar.gz: a49de3dfd374cd2bfcc6fa2d7e2d0f735a4ce567
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 705135c2b75b796ce8aa7db591d435da5a1643c6a7934241529deca24959249720a00b1b980e335fc29b7590991f275d41c32b275302170316ec0ab121459132
+  data.tar.gz: 23313e00a3eb5d907c27881a1e4c3c0640ba6afc25189a86e53494f2c545bd1f7968b46ef130bbcf93f38aca88c6de2ab345c82716c7d8cac039060657688404
data/Gemfile.lock
CHANGED
@@ -1,11 +1,10 @@
 PATH
   remote: .
   specs:
-    rubyscraper (0.
-      capybara
-      poltergeist
-      rest-client
-      slop
+    rubyscraper (0.1.0)
+      capybara (~> 2.4)
+      poltergeist (~> 1.6)
+      rest-client (~> 1.8)
 
 GEM
   remote: https://rubygems.org/
@@ -75,7 +74,7 @@ PLATFORMS
 
 DEPENDENCIES
   bundler (~> 1.9)
-  pry
+  pry (~> 0.10)
   rake (~> 10.0)
   rspec (~> 3.0)
   rubyscraper!
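The lockfile change above tightens each runtime dependency to a pessimistic (`~>`) version range and drops the unused `slop` dependency. As a sketch, the equivalent gemspec declarations would look like the following (the actual `rubyscraper.gemspec` changes are summarized in the file list but not shown in this excerpt):

```ruby
# Sketch of the runtime dependency pins implied by the new Gemfile.lock.
# "~> 2.4" permits >= 2.4 and < 3.0; "~> 1.6" permits >= 1.6 and < 2.0.
Gem::Specification.new do |spec|
  spec.add_dependency "capybara",    "~> 2.4"
  spec.add_dependency "poltergeist", "~> 1.6"
  spec.add_dependency "rest-client", "~> 1.8"
end
```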
data/README.md
CHANGED
@@ -1,11 +1,24 @@
 # RubyScraper
 
-
-
-TODO: Delete this and the text above, and describe your gem
+RubyScraper is a gem built
 
 ## Installation
+### Dependency
+RubyScraper relies on PhantomJS as its headless web browser. Install this before installing the gem with:
+
+```
+brew install phantomjs
+```
+
+### CLI
+Install RubyScraper by running:
+
+```
+gem install rubyscraper
+```
 
+### Gemfile
+*Work in Progress*
 Add this line to your application's Gemfile:
 
 ```ruby
@@ -22,18 +35,35 @@ Or install it yourself as:
 
 ## Usage
 
-
+```
+Usage: RubyScraper [options]
+
+Specific options:
+
+REQUIRED:
+    -f, --file FILENAME.JSON         Specify the file_name of your RubyScraper config file
 
-
+REQUIRED (if using as service to send results as post requests):
+    -e, --endpoint URL               Enter the api endpoint URL here
+                                       (If using scraper as a service to send post requests to server)
 
-
+OPTIONAL:
+    -r, --record-limit N             Pull N records per site
+                                       (approximate because if there are 25 records per
+                                        page, and 51 is provided, it will go to 3 pages)
+    -d, --delay N                    Delay N seconds before executing
+    -s, --site SITENAME              Scrape a single SITENAME from the config file
 
-
+Common options:
+    -h, --help                       Show this message
+        --version                    Show version
+```
 
 ## Contributing
 
 1. Fork it ( https://github.com/[my-github-username]/rubyscraper/fork )
 2. Create your feature branch (`git checkout -b my-new-feature`)
+3. Write your tests and don't break anything :) *run tests with `rspec`*
 3. Commit your changes (`git commit -am 'Add some feature'`)
 4. Push to the branch (`git push origin my-new-feature`)
 5. Create a new Pull Request
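The new usage text mirrors the `OptionParser` definition added later in this diff. A representative invocation (config file name, endpoint URL, and site name are all hypothetical) would be:

```
rubyscraper -f scrapes.json -e http://localhost:3000/api/jobs --record-limit 100 -s weworkremotely
```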
data/lib/rubyscraper.rb
CHANGED
@@ -1,153 +1,19 @@
-require 'capybara'
-require 'capybara/poltergeist'
-require 'rest-client'
 require 'rubyscraper/version'
+require 'rubyscraper/processor'
+require 'rubyscraper/api_dispatcher'
 
 class RubyScraper
-
-
-
-
-
-
-
-
-
-
-
-
-    @pages = pages
-    @endpoint = endpoint
-    @scrape_file = File.expand_path('../assets/scrapes.json', __FILE__)
-    @scrape_config = JSON.parse(File.read(@scrape_file))
-  end
-
-  def scrape(single_site=nil)
-    if single_site
-      search_site = scrape_config.select { |site| site["name"] == single_site }
-      if search_site
-        get_data(search_site.first)
-      else
-        raise "Invalid single site name #{single_site}. Not in scrape file."
-      end
-    else
-      scrape_config.each do |site|
-        unless site["skip"] == "true"
-          get_data(site)
-        end
-      end
-    end
-    return scraped_jobs, posted_jobs
-  end
-
-  def get_data(site)
-    get_summaries(site)
-    get_bodies(site)
-    send_to_server
-  end
-
-  def get_summaries(site)
-    if site["summary"]["params"].length > 0 && !site["summary"]["no_pagination?"]
-      site["summary"]["params"][0]["SEARCHTERM"].each do |term|
-        summary_url = "#{site["base_url"]}#{site["summary"]["url"].sub("SEARCHTERM", term)}"
-        pagination_start = site["summary"]["pagination_start"].to_i
-        pagination_end = pagination_start + pages - 1
-        (pagination_start..pagination_end).to_a.each do |page|
-          visit "#{summary_url}#{site["summary"]["pagination_fmt"]}#{page * site["summary"]["pagination_scale"].to_i}"
-          all(site["summary"]["loop"]).each do |listing|
-            job = pull_summary_data(site, listing)
-            job = modify_data(site, job)
-            jobs << job
-          end
-          puts "Pulled #{site["name"]}: #{term} (page: #{page}) job summaries."
-        end
-      end
-    else
-      summary_url = "#{site["base_url"]}#{site["summary"]["url"]}"
-      visit summary_url
-      all(site["summary"]["loop"]).each do |listing|
-        job = pull_summary_data(site, listing)
-        job = modify_data(site, job)
-        jobs << job
-      end
-      puts "Pulled #{site["name"]} job summaries."
-    end
-  end
-
-  def pull_summary_data(site, listing)
-    job = Hash.new
-    site["summary"]["fields"].each do |field|
-      if field["attr"]
-        if listing.has_css?(field["path"])
-          job[field["field"]] =
-            listing.send(field["method"].to_sym, field["path"])[field["attr"]]
-        end
-      else
-        if listing.has_css?(field["path"])
-          job[field["field"]] =
-            listing.send(field["method"].to_sym, field["path"]).text
-        end
-      end
-    end; job
-  end
-
-  def modify_data(site, job)
-    job["url"] = "#{site["base_url"]}#{job["url"]}" unless job["url"].match(/^http/)
-    job
-  end
-
-  def get_bodies(site)
-    jobs.each_with_index do |job, i|
-      sleep 1
-      pull_job_data(site, job)
-      puts "Job #{i+1} pulled."
-    end
-  end
-
-  def pull_job_data(site, job)
-    visit job["url"]
-    site["sub_page"]["fields"].each do |field|
-      if field["method"] == "all"
-        if has_css?(field["path"])
-          values = all(field["path"]).map do |elem|
-            elem.send(field["loop_collect"])
-          end
-          job[field["field"]] = values.join(field["join"])
-        end
-      else
-        if has_css?(field["path"])
-          job[field["field"]] =
-            send(field["method"].to_sym, field["path"]).text
-        end
-      end
-    end
-  end
-
-  def send_to_server
-    @scraped_jobs += jobs.length
-    jobs.each do |job|
-      tags = job["tags"] || ""
-      new_job = {
-        position: job["position"],
-        location: job["location"],
-        description: job["description"],
-        source: job["url"],
-        company: job["company"],
-        tags: tags.split(", ")
-      }
-
-      RestClient.post(endpoint, job: new_job){ |response, request, result, &block|
-        case response.code
-        when 201
-          @posted_jobs += 1
-          puts "Job saved."
-        when 302
-          puts "Job already exists."
-        else
-          puts "Bad request."
-        end
-      }
-    end
-    @jobs = []
+  def self.call(opts)
+    record_limit = opts.record_limit
+    config_file = File.expand_path(opts.config_file, Dir.pwd)
+    single_site = opts.single_site
+    scrape_delay = opts.scrape_delay
+    endpoint = opts.endpoint
+
+    processor = Processor.new(config_file, single_site, record_limit, scrape_delay)
+    results = processor.call
+    num_saved = ApiDispatcher.post(results, endpoint)
+
+    return results.count, num_saved
  end
 end
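The rewrite reduces `rubyscraper.rb` to a thin orchestrator: option values go in, a `Processor` scrapes, an `ApiDispatcher` posts, and the counts come back out. A minimal sketch of driving it directly (all values hypothetical; normally `RubyScraper::Binary` builds the options from the command line):

```ruby
require 'ostruct'
require 'rubyscraper'

# Hypothetical option values; the CLI normally builds these via the option parser.
opts = OpenStruct.new(
  config_file:  "scrapes.json",                    # resolved relative to Dir.pwd
  endpoint:     "http://localhost:3000/api/jobs",  # POST target for results
  record_limit: 50,
  single_site:  "",
  scrape_delay: 1
)

scraped, saved = RubyScraper.call(opts)
puts "Scraped #{scraped}, saved #{saved}"
```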
data/lib/rubyscraper/api_dispatcher.rb
ADDED
@@ -0,0 +1,31 @@
+require 'rest-client'
+
+class ApiDispatcher
+  def self.post(results, endpoint)
+    results.inject 0 do |posted, listing|
+      tags = listing["tags"].split(", ") if listing["tags"]
+      new_listing = {
+        position: listing["position"],
+        location: listing["location"],
+        company: listing["company"],
+        description: listing["description"],
+        source: listing["url"],
+        tags: tags
+      }
+
+      RestClient.post(endpoint, job: new_listing){ |response, request, result, &block|
+        case response.code
+        when 201
+          puts "Job saved."
+          posted += 1
+        when 302
+          puts "Job already exists."
+          posted
+        else
+          puts "Bad request."
+          posted
+        end
+      }
+    end
+  end
+end
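`ApiDispatcher.post` folds over the scraped listings with `inject`, so the accumulator `posted` only advances on a 201 response; 302 (duplicate) and error responses pass the count through unchanged. A minimal sketch of calling it, with hypothetical listing fields shaped to match the keys the dispatcher reads:

```ruby
# Hypothetical scraped result and endpoint, for illustration only.
results = [
  { "position"    => "Ruby Developer",
    "location"    => "Chicago, IL",
    "company"     => "Acme",
    "description" => "Build scrapers all day.",
    "url"         => "http://example.com/jobs/1",
    "tags"        => "ruby, rails" }  # split on ", " into an array before posting
]

saved = ApiDispatcher.post(results, "http://localhost:3000/api/jobs")
puts "#{saved} of #{results.count} listings saved"
```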
data/lib/rubyscraper/binary.rb
CHANGED
@@ -1,16 +1,19 @@
 require 'rubyscraper'
+require 'rubyscraper/option_parser'
 
 class RubyScraper
   class Binary
     def self.call(argv, outstream, errstream)
-      outstream.puts "
+      outstream.puts "RubyScraper"
       outstream.puts "---------------------------------------------"
       outstream.puts "Started scraping..."
-
-
-
-
-
+      outstream.puts "---------------------------------------------"
+
+      options = OptparseExample.parse(argv)
+      records_scraped, records_saved = RubyScraper.call(options)
+
+      outstream.puts "---------------------------------------------"
+      outstream.puts "Scraped #{records_scraped} records, succesfully posted #{records_saved} records."
       outstream.puts "---------------------------------------------"
       outstream.puts "Completed!"
     end
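`Binary.call` takes the output and error streams as parameters rather than writing to `$stdout` directly, which keeps it testable. The gem's executable is not part of this diff, but it would presumably wire things up along these lines:

```ruby
#!/usr/bin/env ruby
# Sketch of a bin/rubyscraper entry point; the actual binary is not shown in this diff.
require 'rubyscraper/binary'

RubyScraper::Binary.call(ARGV, $stdout, $stderr)
```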
data/lib/rubyscraper/option_parser.rb
ADDED
@@ -0,0 +1,72 @@
+require 'rubyscraper/version'
+require 'optparse'
+require 'ostruct'
+
+class OptparseExample
+  def self.parse(args)
+    options = OpenStruct.new
+    options.config_file = ""
+    options.endpoint = ""
+    options.record_limit = 50
+    options.single_site = ""
+    options.scrape_delay = 1
+
+    opt_parser = OptionParser.new do |opts|
+      opts.banner = "Usage: RubyScraper [options]"
+
+      opts.separator ""
+      opts.separator "Specific options:"
+      opts.separator ""
+
+      opts.separator "REQUIRED:"
+      # Mandatory argument
+      opts.on("-f", "--file FILENAME.JSON",
+              "Specify the file_name of your RubyScraper config file") do |file|
+        options.config_file = file
+      end
+
+      opts.separator ""
+      opts.separator "REQUIRED (if using as service to send results as post requests):"
+      # Mandatory argument if sending results to POST endpoint
+      opts.on("-e", "--endpoint URL",
+              "Enter the api endpoint URL here",
+              "  (If using scraper as a service to send post requests to server)",) do |url|
+        options.endpoint = url
+      end
+
+      opts.separator ""
+      opts.separator "OPTIONAL:"
+
+      opts.on("-rl", "--record-limit N", Integer,
+              "Pull N records per site",
+              "  (approximate because if there are 25 records per",
+              "   page, and 51 is provided, it will go to 3 pages)") do |limit|
+        options.record_limit = limit
+      end
+
+      opts.on("-d", "--delay N", Float, "Delay N seconds before executing") do |n|
+        options.delay = n
+      end
+
+      opts.on("-s", "--site SITENAME", "Scrape a single SITENAME from the config file") do |site|
+        options.single_site = site
+      end
+
+      opts.separator ""
+      opts.separator "Common options:"
+
+      opts.on_tail("-h", "--help", "Show this message") do
+        puts opts
+        exit
+      end
+
+      opts.on_tail("--version", "Show version") do
+        puts RubyScraper::VERSION
+        exit
+      end
+    end
+
+    opt_parser.parse!(args)
+    options
+  end
+end
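`OptparseExample.parse` returns an `OpenStruct`, so callers read flags as plain attributes. A quick sketch (argument values hypothetical):

```ruby
options = OptparseExample.parse(
  %w[-f scrapes.json -e http://localhost:3000/api/jobs -s weworkremotely]
)

options.config_file   # => "scrapes.json"
options.endpoint      # => "http://localhost:3000/api/jobs"
options.single_site   # => "weworkremotely"
options.record_limit  # => 50 (default)
```

One apparent inconsistency worth noting: the `-d` handler assigns `options.delay`, while `RubyScraper.call` reads `opts.scrape_delay` (which stays at its default of 1), so the delay flag does not appear to reach the processor in this version.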
data/lib/rubyscraper/paginator.rb
ADDED
@@ -0,0 +1,59 @@
+class Paginator
+  attr_reader :site, :record_limit, :pagination
+
+  def initialize(site, record_limit)
+    @site = site
+    @pagination = site["summary"]["pagination"]
+    @record_limit = record_limit
+  end
+
+  def define_pagination_params
+    if paginated_site?
+      @steps = url_page_addons
+      @add_on = pagination["format"]
+    else
+      @steps = [""]
+      @add_on = ""
+    end
+  end
+
+  def add_on
+    @add_on
+  end
+
+  def steps
+    @steps
+  end
+
+  private
+
+  def url_page_addons
+    output = []
+    num_pages.times do |i|
+      output << pagination_start + pagination_scale * i
+    end
+    output
+  end
+
+  def num_pages
+    output = record_limit / records_per_page
+    output += 1 if record_limit % records_per_page != 0
+    output
+  end
+
+  def records_per_page
+    pagination["records_per_page"].to_i
+  end
+
+  def pagination_start
+    pagination["start"].to_i
+  end
+
+  def pagination_scale
+    pagination["scale"].to_i
+  end
+
+  def paginated_site?
+    site["summary"]["paginated"] == "true"
+  end
+end
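`Paginator` converts a per-site record limit into the list of URL offsets to visit, rounding up to a whole number of pages. A minimal sketch follows; the site hash keys are inferred from the Paginator source above, and the gem's actual scrape-file schema may differ:

```ruby
# Hypothetical site config, shaped to match the keys Paginator reads.
site = {
  "summary" => {
    "paginated"  => "true",
    "pagination" => {
      "format"           => "&start=",  # appended to the URL before each step
      "records_per_page" => "25",
      "start"            => "0",
      "scale"            => "25"
    }
  }
}

paginator = Paginator.new(site, 51)
paginator.define_pagination_params
paginator.add_on  # => "&start="
paginator.steps   # => [0, 25, 50] -- 51 records at 25/page rounds up to 3 pages
```

This matches the README's note that a record limit of 51 on a 25-per-page site visits 3 pages.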