TOSwimScraper 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4) hide show
  1. checksums.yaml +7 -0
  2. data/bin/scrape +35 -0
  3. data/lib/scraper.rb +199 -0
  4. metadata +123 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 74a57bb46c96004254173ffaa32157460ee5f0d6
4
+ data.tar.gz: df2230bef488a7f8cf0eadbd170b1dfffcd05db2
5
+ SHA512:
6
+ metadata.gz: 6181c523844d05a449e65f08b6d909537d4e6dd4a0010d40698c2cc742b923d102142a7814cddba775ce6fe8922f1261ff2590d1db21ffbb638e9b8d435b4329
7
+ data.tar.gz: ecd7f0859e7349378cf2c11c7d6eb4f78948258000d8ce5f597db0e84c3cdb7529334cb0140de14c1ec83360906b5df4e8af724c53e3ac8fdf7910766ba5e6b8
data/bin/scrape ADDED
@@ -0,0 +1,35 @@
1
#!/usr/bin/env ruby
# coding: utf-8

# Command-line entry point for the TOSwim scraper.
#
#   scrape UPDATE_TYPE [DISPLAY_TYPE]
#
# UPDATE_TYPE:  -f full refresh (pool info, swim times, cost status)
#               -s update swim schedules and cost status only
# DISPLAY_TYPE: -v verbose, -c concise (default)

require_relative '../lib/scraper'

if ARGV.empty?
  puts " Usage: scrape UPDATE_TYPE DISPLAY_TYPE \n\n"
  puts " Update type:\n"
  puts " '-f' Full refresh including regrabbing swim times and pool locations \n"
  puts " '-s' Just update swim schedules \n\n"
  puts " Display type: \n"
  puts " '-v' Verbose output \n"
  puts " '-c' Concise output(default) \n"
else
  # Map the optional second argument to a display mode; anything
  # unrecognised (or missing) falls back to concise output.
  aliases = {
    "-c" => "concise",
    "-v" => "verbose"
  }
  Scraper.display_mode(aliases[ARGV[1]] || "concise")

  if ARGV.include?('-f')
    Scraper.gather_pool_info
    Scraper.gather_pool_swim_times
    Scraper.gather_pool_program_cost_status
  elsif ARGV.include?('-s')
    Scraper.gather_pool_swim_times
    Scraper.gather_pool_program_cost_status
  else
    # Previously an unknown update type silently did nothing.
    puts "Unknown update type '#{ARGV[0]}'; expected '-f' or '-s'"
  end
end
data/lib/scraper.rb ADDED
@@ -0,0 +1,199 @@
1
+ # Gems
2
+ require 'nokogiri'
3
+ require 'open-uri'
4
+ require 'pry'
5
+ require 'json'
6
+ require 'geocoder'
7
+
8
# Scrapes City of Toronto pool listing pages into JSON files containing
# geotagged pools and their lane-swim schedules.
module Scraper
  class << self

    # Set the output verbosity ("verbose" or "concise"); read by the
    # gather_* methods when reporting progress.
    def display_mode(mode)
      @display_mode = mode
    end

    # Indoor and outdoor pool listing pages to scrape.
    # For faster testing, trim this to just the first URL.
    POOL_LIST_URLS = ["http://www1.toronto.ca/parks/prd/facilities/indoor-pools/index.htm",
                      "http://www1.toronto.ca/parks/prd/facilities/indoor-pools/2-indoor_pool.htm",
                      "http://www1.toronto.ca/parks/prd/facilities/outdoor-pools/index.htm",
                      "http://www1.toronto.ca/parks/prd/facilities/outdoor-pools/2-outdoor_pool.htm"].freeze

    Geocoder.configure(:timeout => 10)

    # Scrape every pool's name, detail-page link and street address, geocode
    # each address, and write the combined records to pool_urls.json in the
    # current working directory.
    def gather_pool_info
      @pool_urls = []
      pool_names, pool_addresses, pool_links = [], [], []

      POOL_LIST_URLS.each do |url|
        pools = fetch_document(url).at_css("#pfrBody > div.pfrListing > table > tbody")
        pool_names += pools.css('a').map { |link| link.children.text }
        pool_links += pools.css('a').map { |link| link['href'] }
        pool_addresses += gather_pool_addresses(pools)
      end

      # Geotag pools
      puts "\n--- Scraping pool coordinates ---"
      pool_coordinates = pool_addresses.map { |address| gather_pool_coordinates(address) }

      # Combine the parallel arrays into one hash per pool.
      pool_names.each_index do |index|
        @pool_urls << { name: pool_names[index],
                        url: pool_links[index],
                        address: pool_addresses[index],
                        coordinates: pool_coordinates[index] }
      end

      File.open("pool_urls.json", "w") do |f|
        f.write(@pool_urls.to_json)
      end
    end

    # Leftover sanity-check helper; kept for interface compatibility.
    def simple_equal(num)
      2 * num
    end

    # Extract the cell texts of the "Lane Swim" row of a weekly schedule
    # table. Each cell becomes an array of time strings (a cell with several
    # text nodes yields one entry per node).
    def swim_time_finder(week, lane_swim_row_index)
      week.at_css("tbody").css('tr')[lane_swim_row_index].children
          .map do |el|
            nodes = el.children.find_all(&:text?)
            if nodes.length == 1
              nodes = [el.children.text]
            else
              nodes.map!(&:text)
            end
          end
    end

    # Build a {date => [times]} hash from a pool detail page, covering the
    # first two posted weeks. Days without lane-swim times are dropped.
    def build_pool_schedule_array_from_html(doc)
      weeks = {}

      # Eventually poll more weeks, possibly 4 of the available 7.
      (0..1).each do |i|
        week = doc.at_css("#dropin_Swimming_#{i}")
        next if week.nil?

        week_dates = week.at_css('tr').children.map(&:text)
        lane_swim_row_index = week.at_css("tbody").css('tr').find_index { |el| el.text =~ /Lane Swim/ }
        next if lane_swim_row_index.nil?

        week_lane_swim_times = swim_time_finder(week, lane_swim_row_index)
        weeks.merge!(week_dates.zip(week_lane_swim_times).to_h)
      end

      # Remove days with no swim times.
      weeks.delete_if { |_day, time| time == [" "] || time == [] }
    end

    # Pull the street address out of each row of a pool listing table.
    # The address is always the second column; table width varies for
    # indoor vs. outdoor pools, so derive columns-per-row from the table.
    def gather_pool_addresses(pools)
      addresses = []
      columns_per_row = pools.css('td').length / pools.css('tr').length
      pools.css('td').each_with_index do |node, index|
        addresses << node.text if index % columns_per_row == 1
      end
      addresses
    end

    # Run the given block with $stdout/$stderr silenced, restoring both
    # streams afterwards even if the block raises. Returns the block's value.
    # Adapted from https://gist.github.com/moertel/11091573
    def suppress_output
      original_stderr = $stderr.clone
      original_stdout = $stdout.clone
      # File::NULL is portable (/dev/null on Unix, NUL on Windows).
      $stderr.reopen(File.new(File::NULL, 'w'))
      $stdout.reopen(File.new(File::NULL, 'w'))
      yield
    ensure
      $stdout.reopen(original_stdout)
      $stderr.reopen(original_stderr)
    end

    # Geocode a street address, returning { latitude:, longitude: }.
    # Both values are nil when the geocoder cannot resolve the address.
    def gather_pool_coordinates(address)
      if @display_mode == "verbose"
        puts "Geocoding: #{address}"
      else
        print "."
      end

      coordinates_arr = suppress_output { Geocoder.coordinates("#{address}, Toronto") }

      # To avoid triggering google API limit of 10 queries per second
      sleep(0.15)

      # Geocoder.coordinates returns nil for an unresolvable address;
      # indexing into it blindly raised NoMethodError before.
      return { latitude: nil, longitude: nil } if coordinates_arr.nil?

      { latitude: coordinates_arr[0], longitude: coordinates_arr[1] }
    end

    ##### Parse Weekly Lane Swim Data #####

    # Scrape the lane-swim schedule for every pool recorded in
    # pool_urls.json and write the merged data to pools_data.json.
    def gather_pool_swim_times
      begin
        @pool_urls ||= JSON.parse(File.read('pool_urls.json'), symbolize_names: true)
      rescue
        puts "Couldn't open pool_info, run scrape -f or run in path with pool_urls.json file"
        exit
      end

      puts "\n--- Scraping pool swim times ---"
      @pool_urls.each do |pool|
        if @display_mode == "verbose"
          puts "Scraping: " + pool[:name]
        else
          print "."
        end

        doc = fetch_document("http://www1.toronto.ca" + pool[:url])
        pool[:times] = build_pool_schedule_array_from_html(doc)
      end

      File.open("pools_data.json", "w") do |f|
        f.write(@pool_urls.to_json)
        puts "\nWriting pools_data.json complete"
      end
    end

    # Mark each pool in pools_data.json with pool[:free_swim] — true when the
    # pool's facility page appears in the city's free-facility listing.
    def gather_pool_program_cost_status
      @pools = JSON.parse(File.read('pools_data.json'), symbolize_names: true)

      page = "http://www1.toronto.ca/wps/portal/contentonly?vgnextoid=aaafdada600f0410VgnVCM10000071d60f89RCRD&vgnextchannel=a96adada600f0410VgnVCM10000071d60f89RCRD"
      links = fetch_document(page).at_css("#maincontent").css('a')
      all_hrefs = links.map { |link| link.attribute('href').to_s }.uniq.sort.delete_if { |href| href.empty? }

      # NOTE: the original passed a double-quoted string to #match, so the
      # intended \w was silently the literal "w"; use real regexps instead.
      facility_path = %r{/parks/prd/facilities/complex/\d*}
      free_facility_urls_regexed = all_hrefs.keep_if { |href| href.match(%r{/parks/prd/facilities/complex\w*}) }
                                            .map { |url| url.match(facility_path).to_s }

      @pools.each do |pool|
        pool_path = pool[:url].match(facility_path).to_s
        pool[:free_swim] = free_facility_urls_regexed.include?(pool_path)
      end

      File.open("pools_data.json", "w") do |f|
        f.write(@pools.to_json)
        puts "Writing program cost status to pools_data.json complete"
      end
    end

    private

    # Fetch a URL and parse it as HTML. Kernel#open on URLs was removed in
    # Ruby 3.0; prefer URI.open when open-uri provides it.
    def fetch_document(url)
      html = URI.respond_to?(:open) ? URI.open(url) : open(url)
      Nokogiri::HTML(html)
    end

  end
end
194
+
195
+ # TODO:
+ #   add a test suite
+ #   remind myself how to inspect variable names while debugging (smaller method with info passed in, probably)
+ #   start displaying / filtering the data?
+ #   maybe transform dates before saving
metadata ADDED
@@ -0,0 +1,123 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: TOSwimScraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Erich Welz
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-04-21 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: geocoder
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: '1.2'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: '1.2'
41
+ - !ruby/object:Gem::Dependency
42
+ name: json
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - "~>"
46
+ - !ruby/object:Gem::Version
47
+ version: '1.8'
48
+ - - ">="
49
+ - !ruby/object:Gem::Version
50
+ version: 1.8.3
51
+ type: :runtime
52
+ prerelease: false
53
+ version_requirements: !ruby/object:Gem::Requirement
54
+ requirements:
55
+ - - "~>"
56
+ - !ruby/object:Gem::Version
57
+ version: '1.8'
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: 1.8.3
61
+ - !ruby/object:Gem::Dependency
62
+ name: nokogiri
63
+ requirement: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - "~>"
66
+ - !ruby/object:Gem::Version
67
+ version: '1.6'
68
+ type: :runtime
69
+ prerelease: false
70
+ version_requirements: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - "~>"
73
+ - !ruby/object:Gem::Version
74
+ version: '1.6'
75
+ - !ruby/object:Gem::Dependency
76
+ name: pry
77
+ requirement: !ruby/object:Gem::Requirement
78
+ requirements:
79
+ - - "~>"
80
+ - !ruby/object:Gem::Version
81
+ version: '0.10'
82
+ type: :runtime
83
+ prerelease: false
84
+ version_requirements: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - "~>"
87
+ - !ruby/object:Gem::Version
88
+ version: '0.10'
89
+ description: Scraper to grab City of Toronto lane swim data creating a JSON file with
90
+ geotagged pools
91
+ email: erichwelz@gmail.com
92
+ executables:
93
+ - scrape
94
+ extensions: []
95
+ extra_rdoc_files: []
96
+ files:
97
+ - bin/scrape
98
+ - lib/scraper.rb
99
+ homepage: https://github.com/erichwelz/TOSwim
100
+ licenses: []
101
+ metadata: {}
102
+ post_install_message:
103
+ rdoc_options: []
104
+ require_paths:
105
+ - lib
106
+ required_ruby_version: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ required_rubygems_version: !ruby/object:Gem::Requirement
112
+ requirements:
113
+ - - ">="
114
+ - !ruby/object:Gem::Version
115
+ version: '0'
116
+ requirements: []
117
+ rubyforge_project:
118
+ rubygems_version: 2.4.8
119
+ signing_key:
120
+ specification_version: 4
121
+ summary: Scraper to grab City of Toronto lane swim data creating a JSON file with
122
+ geotagged pools
123
+ test_files: []