TOSwimScraper 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. checksums.yaml +7 -0
  2. data/bin/scrape +35 -0
  3. data/lib/scraper.rb +199 -0
  4. metadata +123 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 74a57bb46c96004254173ffaa32157460ee5f0d6
4
+ data.tar.gz: df2230bef488a7f8cf0eadbd170b1dfffcd05db2
5
+ SHA512:
6
+ metadata.gz: 6181c523844d05a449e65f08b6d909537d4e6dd4a0010d40698c2cc742b923d102142a7814cddba775ce6fe8922f1261ff2590d1db21ffbb638e9b8d435b4329
7
+ data.tar.gz: ecd7f0859e7349378cf2c11c7d6eb4f78948258000d8ce5f597db0e84c3cdb7529334cb0140de14c1ec83360906b5df4e8af724c53e3ac8fdf7910766ba5e6b8
data/bin/scrape ADDED
@@ -0,0 +1,35 @@
1
+ #!/usr/bin/env ruby
2
+ # coding: utf-8
3
+
4
+ require_relative '../lib/scraper'
5
+
6
if ARGV.empty?
  # No arguments: print usage and exit without scraping anything.
  [
    " Usage: scrape UPDATE_TYPE DISPLAY_TYPE \n\n",
    " Update type:\n",
    " '-f' Full refresh including regrabbing swim times and pool locations \n",
    " '-s' Just update swim schedules \n\n",
    " Display type: \n",
    " '-v' Verbose output \n",
    " '-c' Concise output(default) \n"
  ].each { |usage_line| puts usage_line }
else
  # Second argument selects output verbosity; anything unrecognized
  # (or absent) falls back to concise.
  verbosity_names = {
    "-c" => "concise",
    "-v" => "verbose"
  }
  chosen_flag = ARGV[1] || '-c'
  Scraper.display_mode(verbosity_names.fetch(chosen_flag, "concise"))

  if ARGV.include?('-f')
    # Full refresh: re-scrape pool locations before schedules and cost status.
    Scraper.gather_pool_info
    Scraper.gather_pool_swim_times
    Scraper.gather_pool_program_cost_status
  elsif ARGV.include?('-s')
    # Schedule-only update: reuse previously scraped pool info.
    Scraper.gather_pool_swim_times
    Scraper.gather_pool_program_cost_status
  end
end
data/lib/scraper.rb ADDED
@@ -0,0 +1,199 @@
1
+ # Gems
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+ require 'pry'
5
+ require 'json'
6
+ require 'geocoder'
7
+
8
+ module Scraper
9
+ class << self
10
+
11
# Record the chosen output verbosity ("concise" or "verbose") so the
# scraping methods can decide how much progress to print.
def display_mode(display_mode)
  @display_mode = display_mode
end
14
+
15
+
16
+ # faster testing
17
+ # POOL_LIST_URLS = ["http://www1.toronto.ca/parks/prd/facilities/indoor-pools/index.htm"]
18
+ # Full list
19
+ POOL_LIST_URLS = ["http://www1.toronto.ca/parks/prd/facilities/indoor-pools/index.htm",
20
+ "http://www1.toronto.ca/parks/prd/facilities/indoor-pools/2-indoor_pool.htm",
21
+ "http://www1.toronto.ca/parks/prd/facilities/outdoor-pools/index.htm",
22
+ "http://www1.toronto.ca/parks/prd/facilities/outdoor-pools/2-outdoor_pool.htm"]
23
+
24
+ Geocoder.configure(:timeout => 10)
25
+
26
# Scrape every pool-list page, geocode each pool's address, and write the
# combined records to pool_urls.json.
# Side effects: populates @pool_urls and creates/overwrites pool_urls.json
# in the current working directory.
def gather_pool_info
  @pool_urls = []
  pool_names, pool_addresses, pool_links = [], [], []

  POOL_LIST_URLS.each do |url|
    # URI.open: Kernel#open on URL strings was removed in Ruby 3.
    doc = Nokogiri::HTML(URI.open(url))
    pools = doc.at_css("#pfrBody > div.pfrListing > table > tbody")
    pool_names += pools.css('a').map { |link| link.children.text }
    pool_links += pools.css('a').map { |link| link['href'] }
    pool_addresses += gather_pool_addresses(pools)
  end

  # Geotag pools (one geocoder call per address).
  puts "\n--- Scraping pool coordinates ---"
  pool_coordinates = pool_addresses.map { |address| gather_pool_coordinates(address) }

  # Zip the parallel arrays into one hash per pool.
  pool_names.each_index do |index|
    @pool_urls << {
      name:        pool_names[index],
      url:         pool_links[index],
      address:     pool_addresses[index],
      coordinates: pool_coordinates[index]
    }
  end

  # Persist for later runs (gather_pool_swim_times reads this file).
  File.open("pool_urls.json", "w") do |f|
    f.write(@pool_urls.to_json)
  end
end
56
+
57
# Return twice the given number.
# NOTE(review): the name suggests an equality check, but this simply
# doubles its argument — looks like leftover scratch/test code.
def simple_equal(num)
  doubled = 2 * num
  doubled
end
60
+
61
# Extract the cell-by-cell text of the "Lane Swim" row in a weekly
# schedule table. Each cell becomes an array of its text fragments
# (or a single-element array when the cell has exactly one text node).
def swim_time_finder(week, lane_swim_row_index)
  lane_row = week.at_css("tbody").css('tr')[lane_swim_row_index]
  lane_row.children.map do |cell|
    text_nodes = cell.children.find_all(&:text?)
    if text_nodes.length == 1
      [cell.children.text]
    else
      text_nodes.map(&:text)
    end
  end
end
72
+
73
# Build a { date_string => [swim time strings] } hash from a pool's
# drop-in schedule page. Looks at the first two weekly tables
# (#dropin_Swimming_0 / _1) and keeps only the "Lane Swim" row.
# Returns the hash with blank/empty days removed.
def build_pool_schedule_array_from_html(doc)
  weeks = {}

  # Eventually poll more weeks, possibly 4 of the available 7.
  2.times do |i|
    week = doc.at_css("#dropin_Swimming_#{i}")
    next if week.nil?

    # The first table row holds the dates for that week.
    week_dates = week.at_css('tr').children.map(&:text)

    lane_swim_row_index = week.at_css("tbody").css('tr').find_index { |el| el.text =~ /Lane Swim/ }
    next if lane_swim_row_index.nil?

    week_lane_swim_times = swim_time_finder(week, lane_swim_row_index)
    weeks.merge!(week_dates.zip(week_lane_swim_times).to_h)
  end

  # Remove days with no swim times.
  weeks.delete_if { |_day, time| time == [" "] || time == [] }
end
91
+
92
# Pull the address out of every row of a pool listing table.
# The address is always the second column, but the column count differs
# between indoor and outdoor listings, so it is derived from the table.
def gather_pool_addresses(pools)
  cells = pools.css('td')
  columns_per_row = cells.length / pools.css('tr').length
  address_cells = cells.select.with_index { |_cell, index| index % columns_per_row == 1 }
  address_cells.map(&:text)
end
103
+
104
+ # Method accepting a block that supresses stdout/console logging
105
+ # https://gist.github.com/moertel/11091573
106
+
107
# Run the given block with $stdout/$stderr redirected to /dev/null,
# restoring both streams afterwards (even when the block raises) and
# returning the block's value.
# Adapted from https://gist.github.com/moertel/11091573
def suppress_output
  original_stderr = $stderr.clone
  original_stdout = $stdout.clone
  $stderr.reopen(File.new('/dev/null', 'w'))
  $stdout.reopen(File.new('/dev/null', 'w'))
  yield
ensure
  # A single ensure restores the streams on both the normal and the
  # error path; the original rescue-Exception + ensure restored them
  # twice and needlessly rescued non-StandardError exceptions.
  $stdout.reopen(original_stdout)
  $stderr.reopen(original_stderr)
end
124
+
125
# Geocode a street address (assumed to be in Toronto) to a coordinate hash.
# Prints progress according to @display_mode and throttles requests to stay
# under the Google API limit of ~10 queries per second.
# Returns { latitude:, longitude: } — both nil when geocoding fails.
def gather_pool_coordinates(address)
  if @display_mode == "verbose"
    puts "Geocoding: #{address}"
  else
    print "."
  end

  coordinates_arr = suppress_output { Geocoder.coordinates("#{address}, Toronto") }

  # To avoid triggering google API limit of 10 queries per second
  sleep(0.15)

  # Geocoder.coordinates returns nil on failure; previously this crashed
  # with NoMethodError when indexing into nil.
  coordinates_arr ||= [nil, nil]
  { latitude: coordinates_arr[0], longitude: coordinates_arr[1] }
end
138
+
139
+ #####Parse Weekly Leisure Swim Data#####
140
##### Parse Weekly Lane Swim Data #####
# Scrape the lane-swim schedule for every pool listed in pool_urls.json
# and write the augmented records to pools_data.json.
# Exits the process with a hint when pool_urls.json is missing/unreadable.
def gather_pool_swim_times
  begin
    @pool_urls ||= JSON.parse(File.read('pool_urls.json'), symbolize_names: true)
  rescue
    puts "Couldn't open pool_info, run scrape -f or run in path with pool_urls.json file"
    exit
  end

  puts "\n--- Scraping pool swim times ---"
  @pool_urls.each do |pool|
    if @display_mode == "verbose"
      puts "Scraping: " + pool[:name]
    else
      print "."
    end

    # Pool links are site-relative. URI.open because Kernel#open on URL
    # strings was removed in Ruby 3.
    url = "http://www1.toronto.ca" + pool[:url]
    doc = Nokogiri::HTML(URI.open(url))
    pool[:times] = build_pool_schedule_array_from_html(doc)
  end

  File.open("pools_data.json", "w") do |f|
    f.write(@pool_urls.to_json)
    puts "\nWriting pools_data.json complete"
  end
end
167
+
168
# Flag each pool in pools_data.json as free (true/false) by cross-checking
# its URL against the city's free-facility listing page, then rewrite
# pools_data.json with the :free_swim field added.
def gather_pool_program_cost_status
  @pools = JSON.parse(File.read('pools_data.json'), symbolize_names: true)

  page = "http://www1.toronto.ca/wps/portal/contentonly?vgnextoid=aaafdada600f0410VgnVCM10000071d60f89RCRD&vgnextchannel=a96adada600f0410VgnVCM10000071d60f89RCRD"
  doc = Nokogiri::HTML(URI.open(page))
  free_facility_article = doc.at_css("#maincontent")
  links = free_facility_article.css('a')
  all_hrefs = links.map { |link| link.attribute('href').to_s }.uniq.sort.delete_if { |href| href.empty? }

  # Shared stem that identifies a facility-complex URL.
  # NOTE: the previous filter passed a *string* to #match in which "\w"
  # collapsed to a literal "w"; it only worked because "w*" may match
  # nothing. Regex literals express the intent unambiguously.
  complex_stem = %r{/parks/prd/facilities/complex/\d*}
  free_facility_urls_regexed = all_hrefs.keep_if { |href| href.match(%r{/parks/prd/facilities/complex\w*}) }
                                        .map { |url| url.match(complex_stem).to_s }

  @pools.each do |pool|
    pool_url_stem = pool[:url].match(complex_stem).to_s
    pool[:free_swim] = free_facility_urls_regexed.include?(pool_url_stem)
  end

  File.open("pools_data.json", "w") do |f|
    f.write(@pools.to_json)
    puts "Writing program cost status to pools_data.json complete"
  end
end
191
+
192
+ end
193
+ end
194
+
195
+ # Todo
196
+ # add a test suite
197
+ # remind self how to log name of vars while blown up (smaller method with info passed in probably!)
198
+ #start displaying, filtering?
199
+ #maybe transform date save
metadata ADDED
@@ -0,0 +1,123 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: TOSwimScraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Erich Welz
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-04-21 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: geocoder
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.2'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.2'
41
+ - !ruby/object:Gem::Dependency
42
+ name: json
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.8'
48
+ - - ">="
49
+ - !ruby/object:Gem::Version
50
+ version: 1.8.3
51
+ type: :runtime
52
+ prerelease: false
53
+ version_requirements: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - "~>"
56
+ - !ruby/object:Gem::Version
57
+ version: '1.8'
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: 1.8.3
61
+ - !ruby/object:Gem::Dependency
62
+ name: nokogiri
63
+ requirement: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '1.6'
68
+ type: :runtime
69
+ prerelease: false
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '1.6'
75
+ - !ruby/object:Gem::Dependency
76
+ name: pry
77
+ requirement: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '0.10'
82
+ type: :runtime
83
+ prerelease: false
84
+ version_requirements: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - "~>"
87
+ - !ruby/object:Gem::Version
88
+ version: '0.10'
89
+ description: Scraper to grab City of Toronto lane swim data creating a JSON file with
90
+ geotagged pools
91
+ email: erichwelz@gmail.com
92
+ executables:
93
+ - scrape
94
+ extensions: []
95
+ extra_rdoc_files: []
96
+ files:
97
+ - bin/scrape
98
+ - lib/scraper.rb
99
+ homepage: https://github.com/erichwelz/TOSwim
100
+ licenses: []
101
+ metadata: {}
102
+ post_install_message:
103
+ rdoc_options: []
104
+ require_paths:
105
+ - lib
106
+ required_ruby_version: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ required_rubygems_version: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ version: '0'
116
+ requirements: []
117
+ rubyforge_project:
118
+ rubygems_version: 2.4.8
119
+ signing_key:
120
+ specification_version: 4
121
+ summary: Scraper to grab City of Toronto lane swim data creating a JSON file with
122
+ geotagged pools
123
+ test_files: []