TOSwimScraper 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/bin/scrape +35 -0
- data/lib/scraper.rb +199 -0
- metadata +123 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 74a57bb46c96004254173ffaa32157460ee5f0d6
|
4
|
+
data.tar.gz: df2230bef488a7f8cf0eadbd170b1dfffcd05db2
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 6181c523844d05a449e65f08b6d909537d4e6dd4a0010d40698c2cc742b923d102142a7814cddba775ce6fe8922f1261ff2590d1db21ffbb638e9b8d435b4329
|
7
|
+
data.tar.gz: ecd7f0859e7349378cf2c11c7d6eb4f78948258000d8ce5f597db0e84c3cdb7529334cb0140de14c1ec83360906b5df4e8af724c53e3ac8fdf7910766ba5e6b8
|
data/bin/scrape
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
#!/usr/bin/env ruby
# coding: utf-8

# CLI entry point for the TOSwim scraper.
#
# Usage: scrape UPDATE_TYPE [DISPLAY_TYPE]
#   UPDATE_TYPE:  '-f' full refresh (pool info + swim times + cost status)
#                 '-s' swim schedules only (times + cost status)
#   DISPLAY_TYPE: '-v' verbose, '-c' concise (default)

require_relative '../lib/scraper'

if ARGV.empty?
  puts " Usage: scrape UPDATE_TYPE DISPLAY_TYPE \n\n"
  puts " Update type:\n"
  puts " '-f' Full refresh including regrabbing swim times and pool locations \n"
  puts " '-s' Just update swim schedules \n\n"
  puts " Display type: \n"
  puts " '-v' Verbose output \n"
  puts " '-c' Concise output(default) \n"
else
  display_mode = ARGV[1] || '-c'

  aliases = {
    "-c" => "concise",
    "-v" => "verbose"
  }

  # Unknown display flags fall back to concise output.
  display_mode = aliases[display_mode] || "concise"

  Scraper.display_mode(display_mode)

  if ARGV.include?('-f')
    Scraper.gather_pool_info
    Scraper.gather_pool_swim_times
    Scraper.gather_pool_program_cost_status
  elsif ARGV.include?('-s')
    Scraper.gather_pool_swim_times
    Scraper.gather_pool_program_cost_status
  else
    # Fix: previously an unrecognised update type silently did nothing.
    puts "Unknown update type '#{ARGV[0]}'. Use '-f' or '-s' (run with no arguments for usage)."
  end
end
|
data/lib/scraper.rb
ADDED
@@ -0,0 +1,199 @@
|
|
1
|
+
# Gems
|
2
|
+
require 'nokogiri'
|
3
|
+
require 'open-uri'
|
4
|
+
require 'pry'
|
5
|
+
require 'json'
|
6
|
+
require 'geocoder'
|
7
|
+
|
8
|
+
module Scraper
  class << self

    # Set the output verbosity for subsequent scraping runs.
    # display_mode: "verbose" (per-item logging) or "concise" (progress dots).
    def display_mode(display_mode)
      @display_mode = display_mode
    end

    # faster testing
    # POOL_LIST_URLS = ["http://www1.toronto.ca/parks/prd/facilities/indoor-pools/index.htm"]
    # Full list of City of Toronto pool index pages to scrape.
    POOL_LIST_URLS = ["http://www1.toronto.ca/parks/prd/facilities/indoor-pools/index.htm",
                      "http://www1.toronto.ca/parks/prd/facilities/indoor-pools/2-indoor_pool.htm",
                      "http://www1.toronto.ca/parks/prd/facilities/outdoor-pools/index.htm",
                      "http://www1.toronto.ca/parks/prd/facilities/outdoor-pools/2-outdoor_pool.htm"].freeze

    Geocoder.configure(:timeout => 10)

    # Scrape name, detail-page link and address for every pool listed on
    # POOL_LIST_URLS, geocode each address, and write the combined records
    # to pool_urls.json in the current directory.
    def gather_pool_info
      @pool_urls, pool_names, pool_addresses, pool_links = [], [], [], []

      POOL_LIST_URLS.each do |url|
        # NOTE(review): Kernel#open via open-uri; on Ruby >= 3.0 this would
        # need URI.open — kept as-is for the gem's original Ruby target.
        doc = Nokogiri::HTML(open(url))
        pools = doc.at_css("#pfrBody > div.pfrListing > table > tbody")
        pool_names += pools.css('a').map { |link| link.children.text }
        pool_links += pools.css('a').map { |link| link['href'] }
        pool_addresses += gather_pool_addresses(pools)
      end

      # Geotag pools
      puts "\n--- Scraping pool coordinates ---"
      pool_coordinates = pool_addresses.map { |address| gather_pool_coordinates(address) }

      # Convert parallel arrays into one hash per pool
      pool_names.each_with_index do |_pool, index|
        @pool_urls << {
          name:        pool_names[index],
          url:         pool_links[index],
          address:     pool_addresses[index],
          coordinates: pool_coordinates[index]
        }
      end

      # Persist for later runs (gather_pool_swim_times reads this file)
      File.open("pool_urls.json", "w") do |f|
        f.write(@pool_urls.to_json)
      end
    end

    # Doubles its argument. Appears to be a leftover smoke-test helper;
    # kept for interface compatibility. The name does not match the
    # behavior — NOTE(review): consider removing if nothing calls it.
    def simple_equal(num)
      2 * num
    end

    # Extract the cell texts of the "Lane Swim" row of a weekly schedule
    # table. Returns an array with one entry per table cell; cells with a
    # single text node become a one-element array of the cell's full text,
    # cells with several text nodes become an array of those texts.
    def swim_time_finder(week, lane_swim_row_index)
      week.at_css("tbody").css('tr')[lane_swim_row_index].children
          .map do |cell|
            text_nodes = cell.children.find_all(&:text?)
            if text_nodes.length == 1
              [cell.children.text]
            else
              text_nodes.map(&:text)
            end
          end
    end

    # Build a {date => [times]} hash from a pool detail page, covering the
    # first two weekly drop-in tables. Days with no lane-swim times are
    # removed from the result.
    def build_pool_schedule_array_from_html(doc)
      weeks = {}

      # eventually poll more weeks, possibly 4 of available 7
      (0..1).each do |i|
        week = doc.at_css("#dropin_Swimming_#{i}")
        next if week.nil?

        week_dates = week.at_css('tr').children.map(&:text)

        lane_swim_row_index = week.at_css("tbody").css('tr')
                                  .find_index { |el| el.text =~ /Lane Swim/ }
        next if lane_swim_row_index.nil?

        week_lane_swim_times = swim_time_finder(week, lane_swim_row_index)
        weeks.merge!(week_dates.zip(week_lane_swim_times).to_h)
      end

      # remove days with no swim times
      weeks.delete_if { |_day, time| time == [" "] || time == [] }
    end

    # Pull the address column out of a pool listing table.
    # Address is always the second column; table width varies for
    # indoor vs. outdoor listings, so the stride is computed per table.
    def gather_pool_addresses(pools)
      pool_addresses = []
      address_index_incrementer = pools.css('td').length / pools.css('tr').length
      pools.css('td').each_with_index do |node, index|
        pool_addresses << node.text if index % address_index_incrementer == 1
      end
      pool_addresses
    end

    # Run the given block with $stdout/$stderr redirected to the null
    # device, restoring both streams afterwards (even on error) and
    # returning the block's value.
    # Adapted from https://gist.github.com/moertel/11091573
    def suppress_output
      original_stderr = $stderr.clone
      original_stdout = $stdout.clone
      $stderr.reopen(File.new('/dev/null', 'w'))
      $stdout.reopen(File.new('/dev/null', 'w'))
      yield
    ensure
      # ensure alone is sufficient; the old rescue-Exception branch that
      # also restored the streams was redundant.
      $stdout.reopen(original_stdout)
      $stderr.reopen(original_stderr)
    end

    # Geocode a street address within Toronto.
    # Returns { latitude:, longitude: } — both nil when geocoding fails
    # (previously a failed lookup raised NoMethodError on nil).
    def gather_pool_coordinates(address)
      if @display_mode == "verbose"
        puts "Geocoding: #{address}"
      else
        print "."
      end

      coordinates_arr = suppress_output { Geocoder.coordinates("#{address}, Toronto") }

      # To avoid triggering google API limit of 10 queries per second
      sleep(0.15)

      coordinates_arr ||= [nil, nil]
      { latitude: coordinates_arr[0], longitude: coordinates_arr[1] }
    end

    #####Parse Weekly Leisure Swim Data#####
    # Scrape lane-swim schedules for every pool in pool_urls.json (or the
    # in-memory list from gather_pool_info) and write pools_data.json.
    def gather_pool_swim_times
      begin
        @pool_urls ||= JSON.parse(File.read('pool_urls.json'), symbolize_names: true)
      rescue
        puts "Couldn't open pool_info, run scrape -f or run in path with pool_urls.json file"
        exit
      end

      puts "\n--- Scraping pool swim times ---"
      @pool_urls.each do |pool|
        if @display_mode == "verbose"
          puts "Scraping: " + pool[:name]
        else
          print "."
        end

        url = "http://www1.toronto.ca" + pool[:url]
        doc = Nokogiri::HTML(open(url))
        pool[:times] = build_pool_schedule_array_from_html(doc)
      end

      File.open("pools_data.json", "w") do |f|
        f.write(@pool_urls.to_json)
        puts "\nWriting pools_data.json complete"
      end
    end

    # Mark each pool in pools_data.json with :free_swim => true/false by
    # cross-referencing the city's list of free facilities, then rewrite
    # pools_data.json.
    def gather_pool_program_cost_status
      @pools = JSON.parse(File.read('pools_data.json'), symbolize_names: true)

      page = "http://www1.toronto.ca/wps/portal/contentonly?vgnextoid=aaafdada600f0410VgnVCM10000071d60f89RCRD&vgnextchannel=a96adada600f0410VgnVCM10000071d60f89RCRD"
      doc = Nokogiri::HTML(open(page))
      free_facility_article = doc.at_css("#maincontent")
      links = free_facility_article.css('a')
      all_hrefs = links.map { |link| link.attribute('href').to_s }.uniq.sort.delete_if { |href| href.empty? }

      # Fix: the original pattern was a double-quoted STRING containing \w,
      # which Ruby reduces to a literal "w" — it matched only by accident.
      # Use a real regexp literal instead.
      free_facility_urls_regexed = all_hrefs.keep_if { |href| href.match(%r{/parks/prd/facilities/complex\w*}) }
                                            .map { |url| url.match(%r{/parks/prd/facilities/complex/\d*}).to_s }

      @pools.each do |pool|
        pool_url_regex = pool[:url].match(%r{/parks/prd/facilities/complex/\d*}).to_s
        match = free_facility_urls_regexed.find { |e| pool_url_regex == e }
        pool[:free_swim] = match ? true : false
      end

      File.open("pools_data.json", "w") do |f|
        f.write(@pools.to_json)
        puts "Writing program cost status to pools_data.json complete"
      end
    end

  end
end
|
194
|
+
|
195
|
+
# Todo
|
196
|
+
# add a test suite
|
197
|
+
# remind self how to log name of vars while blown up (smaller method with info passed in probably!)
|
198
|
+
#start displaying, filtering?
|
199
|
+
#maybe transform date save
|
metadata
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: TOSwimScraper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Erich Welz
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2016-04-21 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.3'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.3'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: geocoder
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.2'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.2'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: json
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '1.8'
|
48
|
+
- - ">="
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: 1.8.3
|
51
|
+
type: :runtime
|
52
|
+
prerelease: false
|
53
|
+
version_requirements: !ruby/object:Gem::Requirement
|
54
|
+
requirements:
|
55
|
+
- - "~>"
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
version: '1.8'
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: 1.8.3
|
61
|
+
- !ruby/object:Gem::Dependency
|
62
|
+
name: nokogiri
|
63
|
+
requirement: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - "~>"
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: '1.6'
|
68
|
+
type: :runtime
|
69
|
+
prerelease: false
|
70
|
+
version_requirements: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - "~>"
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '1.6'
|
75
|
+
- !ruby/object:Gem::Dependency
|
76
|
+
name: pry
|
77
|
+
requirement: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - "~>"
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '0.10'
|
82
|
+
type: :runtime
|
83
|
+
prerelease: false
|
84
|
+
version_requirements: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - "~>"
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '0.10'
|
89
|
+
description: Scraper to grab City of Toronto lane swim data creating a JSON file with
|
90
|
+
geotagged pools
|
91
|
+
email: erichwelz@gmail.com
|
92
|
+
executables:
|
93
|
+
- scrape
|
94
|
+
extensions: []
|
95
|
+
extra_rdoc_files: []
|
96
|
+
files:
|
97
|
+
- bin/scrape
|
98
|
+
- lib/scraper.rb
|
99
|
+
homepage: https://github.com/erichwelz/TOSwim
|
100
|
+
licenses: []
|
101
|
+
metadata: {}
|
102
|
+
post_install_message:
|
103
|
+
rdoc_options: []
|
104
|
+
require_paths:
|
105
|
+
- lib
|
106
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
111
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
112
|
+
requirements:
|
113
|
+
- - ">="
|
114
|
+
- !ruby/object:Gem::Version
|
115
|
+
version: '0'
|
116
|
+
requirements: []
|
117
|
+
rubyforge_project:
|
118
|
+
rubygems_version: 2.4.8
|
119
|
+
signing_key:
|
120
|
+
specification_version: 4
|
121
|
+
summary: Scraper to grab City of Toronto lane swim data creating a JSON file with
|
122
|
+
geotagged pools
|
123
|
+
test_files: []
|