TOSwimScraper 0.1.2 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/scrape +3 -8
- data/lib/scraper.rb +16 -42
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1944dab62289f4921c52d1e38567c2ab2ff1f6cd
|
4
|
+
data.tar.gz: 778e414fa793222f93520bbe3e0e3fced1ff5d81
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 048837648e2d9ac433403ad87ae7ce3e49caba5798ba230657a7338500cbda51f7adf3e076e4df70bc8311f02d2cfa8bb29a92ccdc6492fee5ea13b4becc9f97
|
7
|
+
data.tar.gz: 2d6407539497a44d432cf46130ccc16e125b8afff8521ea96ff4ead43bf0b492132ffd7899e98ea37a4378cb838ca53571b9d01f37272e7c86fa8b4f3e75a74f
|
data/bin/scrape
CHANGED
@@ -24,12 +24,7 @@ else
|
|
24
24
|
|
25
25
|
Scraper.display_mode(display_mode)
|
26
26
|
|
27
|
-
if ARGV.include?('-f')
|
28
|
-
|
29
|
-
|
30
|
-
Scraper.gather_pool_program_cost_status
|
31
|
-
elsif ARGV.include?('-s')
|
32
|
-
Scraper.gather_pool_swim_times
|
33
|
-
Scraper.gather_pool_program_cost_status
|
34
|
-
end
|
27
|
+
Scraper.gather_pool_info if ARGV.include?('-f')
|
28
|
+
Scraper.gather_pool_swim_times
|
29
|
+
Scraper.gather_pool_program_cost_status
|
35
30
|
end
|
data/lib/scraper.rb
CHANGED
@@ -12,14 +12,10 @@ module Scraper
|
|
12
12
|
@display_mode = display_mode
|
13
13
|
end
|
14
14
|
|
15
|
-
|
16
15
|
# faster testing
|
17
|
-
# POOL_LIST_URLS = ["
|
16
|
+
# POOL_LIST_URLS = ["https://web.toronto.ca/data/parks/prd/facilities/indoor-pools/index.html"]
|
18
17
|
# Full list
|
19
|
-
POOL_LIST_URLS = ["
|
20
|
-
"http://www1.toronto.ca/parks/prd/facilities/indoor-pools/2-indoor_pool.htm",
|
21
|
-
"http://www1.toronto.ca/parks/prd/facilities/outdoor-pools/index.htm",
|
22
|
-
"http://www1.toronto.ca/parks/prd/facilities/outdoor-pools/2-outdoor_pool.htm"]
|
18
|
+
POOL_LIST_URLS = [ "https://web.toronto.ca/data/parks/prd/facilities/indoor-pools/index.html","https://web.toronto.ca/data/parks/prd/facilities/outdoor-pools/index.html" ]
|
23
19
|
|
24
20
|
Geocoder.configure(:timeout => 10)
|
25
21
|
|
@@ -29,11 +25,14 @@ module Scraper
|
|
29
25
|
POOL_LIST_URLS.each do |url|
|
30
26
|
doc = Nokogiri::HTML(open(url))
|
31
27
|
pools = doc.at_css("#pfrBody > div.pfrListing > table > tbody")
|
32
|
-
pool_names += pools.css('a').map { |link| link.children.text }
|
33
|
-
pool_links += pools.css('a').map { |link| link['href'] }
|
28
|
+
pool_names += pools.css('a').map { |link| link.children.text unless link.children.text == "" }.compact
|
29
|
+
pool_links += pools.css('a').map { |link| link['href'] if link['href'].match(/parks\/prd\/facilities\/complex/) }.compact
|
34
30
|
pool_addresses += gather_pool_addresses(pools)
|
35
31
|
end
|
36
32
|
|
33
|
+
array_length_equality = pool_names.length == pool_links.length && pool_links.length == pool_addresses.length
|
34
|
+
raise "Pool information lengths are unequal, the website schema has likely changed" unless array_length_equality
|
35
|
+
|
37
36
|
# Geotag pools
|
38
37
|
puts "\n--- Scraping pool coordinates ---"
|
39
38
|
pool_coordinates = pool_addresses.map { |address| gather_pool_coordinates(address) }
|
@@ -52,7 +51,6 @@ module Scraper
|
|
52
51
|
File.open("pool_urls.json","w") do |f|
|
53
52
|
f.write(@pool_urls.to_json)
|
54
53
|
end
|
55
|
-
|
56
54
|
@pool_urls
|
57
55
|
end
|
58
56
|
|
@@ -88,36 +86,14 @@ module Scraper
|
|
88
86
|
end
|
89
87
|
|
90
88
|
def gather_pool_addresses(pools)
|
91
|
-
|
92
|
-
|
93
|
-
pools.css('td').
|
89
|
+
address_index = pools.css('td').length / pools.css('tr').length
|
90
|
+
|
91
|
+
pools.css('td').each_with_object([]).with_index do |(node, pool_addresses), index|
|
94
92
|
# Address is always second column, table width varies for indoor vs. outdoor
|
95
|
-
if index %
|
93
|
+
if index % address_index == 1
|
96
94
|
pool_addresses << node.text
|
97
95
|
end
|
98
96
|
end
|
99
|
-
pool_addresses
|
100
|
-
end
|
101
|
-
|
102
|
-
# Method accepting a block that supresses stdout/console logging
|
103
|
-
# https://gist.github.com/moertel/11091573
|
104
|
-
|
105
|
-
def suppress_output
|
106
|
-
begin
|
107
|
-
original_stderr = $stderr.clone
|
108
|
-
original_stdout = $stdout.clone
|
109
|
-
$stderr.reopen(File.new('/dev/null', 'w'))
|
110
|
-
$stdout.reopen(File.new('/dev/null', 'w'))
|
111
|
-
retval = yield
|
112
|
-
rescue Exception => e
|
113
|
-
$stdout.reopen(original_stdout)
|
114
|
-
$stderr.reopen(original_stderr)
|
115
|
-
raise e
|
116
|
-
ensure
|
117
|
-
$stdout.reopen(original_stdout)
|
118
|
-
$stderr.reopen(original_stderr)
|
119
|
-
end
|
120
|
-
retval
|
121
97
|
end
|
122
98
|
|
123
99
|
def gather_pool_coordinates(address)
|
@@ -127,10 +103,10 @@ module Scraper
|
|
127
103
|
print "."
|
128
104
|
end
|
129
105
|
|
130
|
-
coordinates_arr =
|
106
|
+
coordinates_arr = Geocoder.coordinates("#{address}, Toronto")
|
131
107
|
|
132
|
-
# To avoid triggering google API limit of
|
133
|
-
sleep(0.
|
108
|
+
# To avoid triggering google API limit of 50 queries per second
|
109
|
+
sleep(0.02)
|
134
110
|
return { latitude: coordinates_arr[0], longitude: coordinates_arr[1] }
|
135
111
|
end
|
136
112
|
|
@@ -145,14 +121,12 @@ module Scraper
|
|
145
121
|
|
146
122
|
puts "\n--- Scraping pool swim times ---"
|
147
123
|
@pool_urls.each do |pool|
|
148
|
-
|
149
124
|
if @display_mode == "verbose"
|
150
125
|
puts "Scraping: " + pool[:name]
|
151
126
|
else
|
152
127
|
print "."
|
153
128
|
end
|
154
|
-
|
155
|
-
url = "http://www1.toronto.ca" + pool[:url]
|
129
|
+
url = "https://www.toronto.ca" + pool[:url]
|
156
130
|
doc = Nokogiri::HTML(open(url))
|
157
131
|
pool[:times] = build_pool_schedule_array_from_html(doc)
|
158
132
|
end
|
@@ -168,7 +142,7 @@ module Scraper
|
|
168
142
|
def gather_pool_program_cost_status
|
169
143
|
@pools = JSON.parse(File.read('pools_data.json'), symbolize_names: true)
|
170
144
|
|
171
|
-
page = "
|
145
|
+
page = "https://www1.toronto.ca/wps/portal/contentonly?vgnextoid=aaafdada600f0410VgnVCM10000071d60f89RCRD&vgnextchannel=a96adada600f0410VgnVCM10000071d60f89RCRD"
|
172
146
|
doc = Nokogiri::HTML(open(page))
|
173
147
|
free_facility_article = doc.at_css("#maincontent")
|
174
148
|
links = free_facility_article.css('a')
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: TOSwimScraper
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Erich Welz
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2017-11-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|
@@ -115,9 +115,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
115
115
|
version: '0'
|
116
116
|
requirements: []
|
117
117
|
rubyforge_project:
|
118
|
-
rubygems_version: 2.
|
118
|
+
rubygems_version: 2.5.1
|
119
119
|
signing_key:
|
120
120
|
specification_version: 4
|
121
121
|
summary: Scraper to grab City of Toronto lane swim data creating a JSON file with
|
122
122
|
geotagged pools
|
123
123
|
test_files: []
|
124
|
+
has_rdoc:
|