TOSwimScraper 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/bin/scrape +35 -0
- data/lib/scraper.rb +199 -0
- metadata +123 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 74a57bb46c96004254173ffaa32157460ee5f0d6
+  data.tar.gz: df2230bef488a7f8cf0eadbd170b1dfffcd05db2
+SHA512:
+  metadata.gz: 6181c523844d05a449e65f08b6d909537d4e6dd4a0010d40698c2cc742b923d102142a7814cddba775ce6fe8922f1261ff2590d1db21ffbb638e9b8d435b4329
+  data.tar.gz: ecd7f0859e7349378cf2c11c7d6eb4f78948258000d8ce5f597db0e84c3cdb7529334cb0140de14c1ec83360906b5df4e8af724c53e3ac8fdf7910766ba5e6b8
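These digests let consumers confirm that a downloaded .gem's inner archives match what the registry published. A minimal verification sketch, assuming the gem's inner metadata.gz has been extracted into the working directory (e.g. via tar -xf TOSwimScraper-0.1.1.gem), using Ruby's standard Digest library:

require 'digest'

# Recompute the digest of the extracted metadata.gz and compare it
# against the published value from checksums.yaml above.
expected = '74a57bb46c96004254173ffaa32157460ee5f0d6'
actual   = Digest::SHA1.file('metadata.gz').hexdigest
puts(actual == expected ? 'metadata.gz checksum OK' : 'checksum MISMATCH')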
data/bin/scrape
ADDED
@@ -0,0 +1,35 @@
+#!/usr/bin/env ruby
+# coding: utf-8
+
+require_relative '../lib/scraper'
+
+if ARGV.empty?
+  puts " Usage: scrape UPDATE_TYPE DISPLAY_TYPE \n\n"
+  puts " Update type:\n"
+  puts "   '-f' Full refresh, including regrabbing swim times and pool locations \n"
+  puts "   '-s' Just update swim schedules \n\n"
+  puts " Display type: \n"
+  puts "   '-v' Verbose output \n"
+  puts "   '-c' Concise output (default) \n"
+else
+
+  display_mode = ARGV[1] || '-c'
+
+  aliases = {
+    "-c" => "concise",
+    "-v" => "verbose"
+  }
+
+  display_mode = aliases[display_mode] || "concise"
+
+  Scraper.display_mode(display_mode)
+
+  if ARGV.include?('-f')
+    Scraper.gather_pool_info
+    Scraper.gather_pool_swim_times
+    Scraper.gather_pool_program_cost_status
+  elsif ARGV.include?('-s')
+    Scraper.gather_pool_swim_times
+    Scraper.gather_pool_program_cost_status
+  end
+end
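Taken together, the executable supports two update modes and two display modes, so typical invocations (assuming the gem's scrape executable is on the PATH) look like:

scrape -f -v   # full refresh: pool info, swim times, cost status; verbose output
scrape -s      # schedules and cost status only; concise output (the default)

Note that the display flag is read from ARGV[1], so it must come second, after the update flag.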
data/lib/scraper.rb
ADDED
@@ -0,0 +1,199 @@
+# Gems
+require 'nokogiri'
+require 'open-uri'
+require 'pry'
+require 'json'
+require 'geocoder'
+
+module Scraper
+  class << self
+
+    def display_mode(display_mode)
+      @display_mode = display_mode
+    end
+
+    # Faster testing:
+    # POOL_LIST_URLS = ["http://www1.toronto.ca/parks/prd/facilities/indoor-pools/index.htm"]
+    # Full list:
+    POOL_LIST_URLS = ["http://www1.toronto.ca/parks/prd/facilities/indoor-pools/index.htm",
+                      "http://www1.toronto.ca/parks/prd/facilities/indoor-pools/2-indoor_pool.htm",
+                      "http://www1.toronto.ca/parks/prd/facilities/outdoor-pools/index.htm",
+                      "http://www1.toronto.ca/parks/prd/facilities/outdoor-pools/2-outdoor_pool.htm"]
+
+    Geocoder.configure(:timeout => 10)
+
+    def gather_pool_info
+      @pool_urls, pool_names, pool_addresses, pool_links, pool_coordinates = [], [], [], [], []
+
+      POOL_LIST_URLS.each do |url|
+        doc = Nokogiri::HTML(open(url))
+        pools = doc.at_css("#pfrBody > div.pfrListing > table > tbody")
+        pool_names += pools.css('a').map { |link| link.children.text }
+        pool_links += pools.css('a').map { |link| link['href'] }
+        pool_addresses += gather_pool_addresses(pools)
+      end
+
+      # Geotag pools
+      puts "\n--- Scraping pool coordinates ---"
+      pool_coordinates = pool_addresses.map { |address| gather_pool_coordinates(address) }
+
+      # Convert pool data to hashes
+      pool_names.each_with_index do |pool, index|
+        current_pool = {}
+        current_pool[:name] = pool_names[index]
+        current_pool[:url] = pool_links[index]
+        current_pool[:address] = pool_addresses[index]
+        current_pool[:coordinates] = pool_coordinates[index]
+        @pool_urls << current_pool
+      end
+
+      # Write hash
+      File.open("pool_urls.json", "w") do |f|
+        f.write(@pool_urls.to_json)
+      end
+    end
+
+    # NOTE: appears unused; doubles its argument
+    def simple_equal(num)
+      2 * num
+    end
+
+    def swim_time_finder(week, lane_swim_row_index)
+      week.at_css("tbody").css('tr')[lane_swim_row_index].children
+          .map do |el|
+            nodes = el.children.find_all(&:text?)
+            if nodes.length == 1
+              nodes = [el.children.text]
+            else
+              nodes.map!(&:text)
+            end
+          end
+    end
+
+    def build_pool_schedule_array_from_html(doc)
+      weeks = {}
+
+      for i in 0..1 # eventually poll more weeks, possibly 4 of the available 7
+        week = doc.at_css("#dropin_Swimming_#{i}")
+        !week.nil? ? week_dates = week.at_css('tr').children.map(&:text) : next
+
+        !week_dates.nil? ? lane_swim_row_index = week.at_css("tbody").css('tr').find_index { |el| el.text =~ /Lane Swim/ } : next
+
+        if !lane_swim_row_index.nil?
+          week_lane_swim_times = swim_time_finder(week, lane_swim_row_index)
+          weeks.merge!(week_dates.zip(week_lane_swim_times).to_h)
+        end
+      end
+
+      # Remove days with no swim times
+      weeks.delete_if { |day, time| time == [" "] || time == [] }
+    end
+
+    def gather_pool_addresses(pools)
+      pool_addresses = []
+      address_index_incrementer = pools.css('td').length / pools.css('tr').length
+      pools.css('td').each_with_index do |node, index|
+        # Address is always the second column; table width varies for indoor vs. outdoor
+        if index % address_index_incrementer == 1
+          pool_addresses << node.text
+        end
+      end
+      pool_addresses
+    end
+
+    # Accepts a block and suppresses stdout/stderr logging while it runs
+    # https://gist.github.com/moertel/11091573
+    def suppress_output
+      begin
+        original_stderr = $stderr.clone
+        original_stdout = $stdout.clone
+        $stderr.reopen(File.new('/dev/null', 'w'))
+        $stdout.reopen(File.new('/dev/null', 'w'))
+        retval = yield
+      rescue Exception => e
+        $stdout.reopen(original_stdout)
+        $stderr.reopen(original_stderr)
+        raise e
+      ensure
+        $stdout.reopen(original_stdout)
+        $stderr.reopen(original_stderr)
+      end
+      retval
+    end
+
+    def gather_pool_coordinates(address)
+      if @display_mode == "verbose"
+        puts "Geocoding: #{address}"
+      else
+        print "."
+      end
+
+      coordinates_arr = suppress_output { Geocoder.coordinates("#{address}, Toronto") }
+
+      # Avoid triggering the Google API limit of 10 queries per second
+      sleep(0.15)
+      return { latitude: coordinates_arr[0], longitude: coordinates_arr[1] }
+    end
+
+    ##### Parse weekly lane swim data #####
+    def gather_pool_swim_times
+      begin
+        @pool_urls ||= JSON.parse(File.read('pool_urls.json'), symbolize_names: true)
+      rescue => e
+        puts "Couldn't open pool_urls.json; run scrape -f, or run from a directory containing pool_urls.json"
+        exit
+      end
+
+      puts "\n--- Scraping pool swim times ---"
+      @pool_urls.each do |pool|
+
+        if @display_mode == "verbose"
+          puts "Scraping: " + pool[:name]
+        else
+          print "."
+        end
+
+        url = "http://www1.toronto.ca" + pool[:url]
+        doc = Nokogiri::HTML(open(url))
+        pool[:times] = build_pool_schedule_array_from_html(doc)
+      end
+
+      File.open("pools_data.json", "w") do |f|
+        f.write(@pool_urls.to_json)
+        puts "\nWriting pools_data.json complete"
+      end
+    end
+
+    def gather_pool_program_cost_status
+      @pools = JSON.parse(File.read('pools_data.json'), symbolize_names: true)
+
+      page = "http://www1.toronto.ca/wps/portal/contentonly?vgnextoid=aaafdada600f0410VgnVCM10000071d60f89RCRD&vgnextchannel=a96adada600f0410VgnVCM10000071d60f89RCRD"
+      doc = Nokogiri::HTML(open(page))
+      free_facility_article = doc.at_css("#maincontent")
+      links = free_facility_article.css('a')
+      all_hrefs = links.map { |link| link.attribute('href').to_s }.uniq.sort.delete_if { |href| href.empty? }
+
+      free_facility_urls_regexed = all_hrefs.keep_if { |href| href.match("\/parks/prd/facilities/complex\w*") }
+                                            .map { |url| url.match(/\/parks\/prd\/facilities\/complex\/\d*/).to_s }
+
+      @pools.each do |pool|
+        pool_url_regex = pool[:url].match(/\/parks\/prd\/facilities\/complex\/\d*/).to_s
+        match = free_facility_urls_regexed.find { |e| pool_url_regex == e }
+        pool[:free_swim] = match ? true : false
+      end
+
+      File.open("pools_data.json", "w") do |f|
+        f.write(@pools.to_json)
+        puts "Writing program cost status to pools_data.json complete"
+      end
+    end
+
+  end
+end
+
+# TODO
+# - add a test suite
+# - remind self how to log names of vars while blown up (smaller method with info passed in, probably!)
+# - start displaying, filtering?
+# - maybe transform date save
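The pipeline above persists its results in two stages: gather_pool_info writes pool_urls.json (names, URLs, addresses, coordinates), then gather_pool_swim_times and gather_pool_program_cost_status enrich that data and write pools_data.json with :times and :free_swim keys. A minimal consumer sketch, assuming pools_data.json was produced by a full scrape run (the file and key names match those written above; the filtering itself is hypothetical):

require 'json'

# Load the scraper's output and list pools offering free lane swims.
pools = JSON.parse(File.read('pools_data.json'), symbolize_names: true)

pools.select { |pool| pool[:free_swim] }.each do |pool|
  puts "#{pool[:name]} (#{pool[:address]})"
  (pool[:times] || {}).each do |day, times|
    puts "  #{day}: #{Array(times).join(', ')}"
  end
end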
metadata
ADDED
@@ -0,0 +1,123 @@
+--- !ruby/object:Gem::Specification
+name: TOSwimScraper
+version: !ruby/object:Gem::Version
+  version: 0.1.1
+platform: ruby
+authors:
+- Erich Welz
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2016-04-21 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.3'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.3'
+- !ruby/object:Gem::Dependency
+  name: geocoder
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.2'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.2'
+- !ruby/object:Gem::Dependency
+  name: json
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.8'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.8.3
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.8'
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 1.8.3
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+- !ruby/object:Gem::Dependency
+  name: pry
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.10'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '0.10'
+description: Scraper to grab City of Toronto lane swim data creating a JSON file with
+  geotagged pools
+email: erichwelz@gmail.com
+executables:
+- scrape
+extensions: []
+extra_rdoc_files: []
+files:
+- bin/scrape
+- lib/scraper.rb
+homepage: https://github.com/erichwelz/TOSwim
+licenses: []
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+requirements: []
+rubyforge_project:
+rubygems_version: 2.4.8
+signing_key:
+specification_version: 4
+summary: Scraper to grab City of Toronto lane swim data creating a JSON file with
+  geotagged pools
+test_files: []
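Per the specification above, the gem declares bundler, geocoder, json, nokogiri, and pry as runtime dependencies, so installing it pulls those in automatically. A hypothetical Gemfile entry pinning this exact release:

# Gemfile
gem 'TOSwimScraper', '0.1.1'

followed by bundle install, or simply gem install TOSwimScraper -v 0.1.1.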