TOSwimScraper 0.1.2 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/scrape +3 -8
- data/lib/scraper.rb +16 -42
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1944dab62289f4921c52d1e38567c2ab2ff1f6cd
|
4
|
+
data.tar.gz: 778e414fa793222f93520bbe3e0e3fced1ff5d81
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 048837648e2d9ac433403ad87ae7ce3e49caba5798ba230657a7338500cbda51f7adf3e076e4df70bc8311f02d2cfa8bb29a92ccdc6492fee5ea13b4becc9f97
|
7
|
+
data.tar.gz: 2d6407539497a44d432cf46130ccc16e125b8afff8521ea96ff4ead43bf0b492132ffd7899e98ea37a4378cb838ca53571b9d01f37272e7c86fa8b4f3e75a74f
|
data/bin/scrape
CHANGED
@@ -24,12 +24,7 @@ else
|
|
24
24
|
|
25
25
|
Scraper.display_mode(display_mode)
|
26
26
|
|
27
|
-
if ARGV.include?('-f')
|
28
|
-
|
29
|
-
|
30
|
-
Scraper.gather_pool_program_cost_status
|
31
|
-
elsif ARGV.include?('-s')
|
32
|
-
Scraper.gather_pool_swim_times
|
33
|
-
Scraper.gather_pool_program_cost_status
|
34
|
-
end
|
27
|
+
Scraper.gather_pool_info if ARGV.include?('-f')
|
28
|
+
Scraper.gather_pool_swim_times
|
29
|
+
Scraper.gather_pool_program_cost_status
|
35
30
|
end
|
data/lib/scraper.rb
CHANGED
@@ -12,14 +12,10 @@ module Scraper
|
|
12
12
|
@display_mode = display_mode
|
13
13
|
end
|
14
14
|
|
15
|
-
|
16
15
|
# faster testing
|
17
|
-
# POOL_LIST_URLS = ["
|
16
|
+
# POOL_LIST_URLS = ["https://web.toronto.ca/data/parks/prd/facilities/indoor-pools/index.html"]
|
18
17
|
# Full list
|
19
|
-
POOL_LIST_URLS = ["
|
20
|
-
"http://www1.toronto.ca/parks/prd/facilities/indoor-pools/2-indoor_pool.htm",
|
21
|
-
"http://www1.toronto.ca/parks/prd/facilities/outdoor-pools/index.htm",
|
22
|
-
"http://www1.toronto.ca/parks/prd/facilities/outdoor-pools/2-outdoor_pool.htm"]
|
18
|
+
POOL_LIST_URLS = [ "https://web.toronto.ca/data/parks/prd/facilities/indoor-pools/index.html","https://web.toronto.ca/data/parks/prd/facilities/outdoor-pools/index.html" ]
|
23
19
|
|
24
20
|
Geocoder.configure(:timeout => 10)
|
25
21
|
|
@@ -29,11 +25,14 @@ module Scraper
|
|
29
25
|
POOL_LIST_URLS.each do |url|
|
30
26
|
doc = Nokogiri::HTML(open(url))
|
31
27
|
pools = doc.at_css("#pfrBody > div.pfrListing > table > tbody")
|
32
|
-
pool_names += pools.css('a').map { |link| link.children.text }
|
33
|
-
pool_links += pools.css('a').map { |link| link['href'] }
|
28
|
+
pool_names += pools.css('a').map { |link| link.children.text unless link.children.text == "" }.compact
|
29
|
+
pool_links += pools.css('a').map { |link| link['href'] if link['href'].match(/parks\/prd\/facilities\/complex/) }.compact
|
34
30
|
pool_addresses += gather_pool_addresses(pools)
|
35
31
|
end
|
36
32
|
|
33
|
+
array_length_equality = pool_names.length == pool_links.length && pool_links.length == pool_addresses.length
|
34
|
+
raise "Pool information lengths are unequal, the website schema has likely changed" unless array_length_equality
|
35
|
+
|
37
36
|
# Geotag pools
|
38
37
|
puts "\n--- Scraping pool coordinates ---"
|
39
38
|
pool_coordinates = pool_addresses.map { |address| gather_pool_coordinates(address) }
|
@@ -52,7 +51,6 @@ module Scraper
|
|
52
51
|
File.open("pool_urls.json","w") do |f|
|
53
52
|
f.write(@pool_urls.to_json)
|
54
53
|
end
|
55
|
-
|
56
54
|
@pool_urls
|
57
55
|
end
|
58
56
|
|
@@ -88,36 +86,14 @@ module Scraper
|
|
88
86
|
end
|
89
87
|
|
90
88
|
def gather_pool_addresses(pools)
|
91
|
-
|
92
|
-
|
93
|
-
pools.css('td').
|
89
|
+
address_index = pools.css('td').length / pools.css('tr').length
|
90
|
+
|
91
|
+
pools.css('td').each_with_object([]).with_index do |(node, pool_addresses), index|
|
94
92
|
# Address is always second column, table width varies for indoor vs. outdoor
|
95
|
-
if index %
|
93
|
+
if index % address_index == 1
|
96
94
|
pool_addresses << node.text
|
97
95
|
end
|
98
96
|
end
|
99
|
-
pool_addresses
|
100
|
-
end
|
101
|
-
|
102
|
-
# Method accepting a block that supresses stdout/console logging
|
103
|
-
# https://gist.github.com/moertel/11091573
|
104
|
-
|
105
|
-
def suppress_output
|
106
|
-
begin
|
107
|
-
original_stderr = $stderr.clone
|
108
|
-
original_stdout = $stdout.clone
|
109
|
-
$stderr.reopen(File.new('/dev/null', 'w'))
|
110
|
-
$stdout.reopen(File.new('/dev/null', 'w'))
|
111
|
-
retval = yield
|
112
|
-
rescue Exception => e
|
113
|
-
$stdout.reopen(original_stdout)
|
114
|
-
$stderr.reopen(original_stderr)
|
115
|
-
raise e
|
116
|
-
ensure
|
117
|
-
$stdout.reopen(original_stdout)
|
118
|
-
$stderr.reopen(original_stderr)
|
119
|
-
end
|
120
|
-
retval
|
121
97
|
end
|
122
98
|
|
123
99
|
def gather_pool_coordinates(address)
|
@@ -127,10 +103,10 @@ module Scraper
|
|
127
103
|
print "."
|
128
104
|
end
|
129
105
|
|
130
|
-
coordinates_arr =
|
106
|
+
coordinates_arr = Geocoder.coordinates("#{address}, Toronto")
|
131
107
|
|
132
|
-
# To avoid triggering google API limit of
|
133
|
-
sleep(0.
|
108
|
+
# To avoid triggering google API limit of 50 queries per second
|
109
|
+
sleep(0.02)
|
134
110
|
return { latitude: coordinates_arr[0], longitude: coordinates_arr[1] }
|
135
111
|
end
|
136
112
|
|
@@ -145,14 +121,12 @@ module Scraper
|
|
145
121
|
|
146
122
|
puts "\n--- Scraping pool swim times ---"
|
147
123
|
@pool_urls.each do |pool|
|
148
|
-
|
149
124
|
if @display_mode == "verbose"
|
150
125
|
puts "Scraping: " + pool[:name]
|
151
126
|
else
|
152
127
|
print "."
|
153
128
|
end
|
154
|
-
|
155
|
-
url = "http://www1.toronto.ca" + pool[:url]
|
129
|
+
url = "https://www.toronto.ca" + pool[:url]
|
156
130
|
doc = Nokogiri::HTML(open(url))
|
157
131
|
pool[:times] = build_pool_schedule_array_from_html(doc)
|
158
132
|
end
|
@@ -168,7 +142,7 @@ module Scraper
|
|
168
142
|
def gather_pool_program_cost_status
|
169
143
|
@pools = JSON.parse(File.read('pools_data.json'), symbolize_names: true)
|
170
144
|
|
171
|
-
page = "
|
145
|
+
page = "https://www1.toronto.ca/wps/portal/contentonly?vgnextoid=aaafdada600f0410VgnVCM10000071d60f89RCRD&vgnextchannel=a96adada600f0410VgnVCM10000071d60f89RCRD"
|
172
146
|
doc = Nokogiri::HTML(open(page))
|
173
147
|
free_facility_article = doc.at_css("#maincontent")
|
174
148
|
links = free_facility_article.css('a')
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: TOSwimScraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erich Welz
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-11-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -115,9 +115,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
115
115
|
version: '0'
|
116
116
|
requirements: []
|
117
117
|
rubyforge_project:
|
118
|
-
rubygems_version: 2.
|
118
|
+
rubygems_version: 2.5.1
|
119
119
|
signing_key:
|
120
120
|
specification_version: 4
|
121
121
|
summary: Scraper to grab City of Toronto lane swim data creating a JSON file with
|
122
122
|
geotagged pools
|
123
123
|
test_files: []
|
124
|
+
has_rdoc:
|