TOSwimScraper 0.1.2 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/bin/scrape +3 -8
  3. data/lib/scraper.rb +16 -42
  4. metadata +4 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8f558aee6fa59fd495364ee0fff61f2a62bda173
4
- data.tar.gz: 199647dd4bb4b78eaf29e343e3dd59ebde3f1ff7
3
+ metadata.gz: 1944dab62289f4921c52d1e38567c2ab2ff1f6cd
4
+ data.tar.gz: 778e414fa793222f93520bbe3e0e3fced1ff5d81
5
5
  SHA512:
6
- metadata.gz: 2bba02a899fd0579880a33c45720f83b48fe121690f7f0fb545d74e7a23004d68ac598035f2c83dd4c074d0582f6db78c990a32ecae9bd3cdd3ad0cb83a07ee0
7
- data.tar.gz: d0b48204133a4fb41b643c79f1d858a6f086d65de20fe2a1092db3f25edce6f04ad33cd0a5e0f5ddb0c3f397cc86c8835ec2412c482f6f6bf1f8e70d788b3f2c
6
+ metadata.gz: 048837648e2d9ac433403ad87ae7ce3e49caba5798ba230657a7338500cbda51f7adf3e076e4df70bc8311f02d2cfa8bb29a92ccdc6492fee5ea13b4becc9f97
7
+ data.tar.gz: 2d6407539497a44d432cf46130ccc16e125b8afff8521ea96ff4ead43bf0b492132ffd7899e98ea37a4378cb838ca53571b9d01f37272e7c86fa8b4f3e75a74f
data/bin/scrape CHANGED
@@ -24,12 +24,7 @@ else
24
24
 
25
25
  Scraper.display_mode(display_mode)
26
26
 
27
- if ARGV.include?('-f')
28
- Scraper.gather_pool_info
29
- Scraper.gather_pool_swim_times
30
- Scraper.gather_pool_program_cost_status
31
- elsif ARGV.include?('-s')
32
- Scraper.gather_pool_swim_times
33
- Scraper.gather_pool_program_cost_status
34
- end
27
+ Scraper.gather_pool_info if ARGV.include?('-f')
28
+ Scraper.gather_pool_swim_times
29
+ Scraper.gather_pool_program_cost_status
35
30
  end
data/lib/scraper.rb CHANGED
@@ -12,14 +12,10 @@ module Scraper
12
12
  @display_mode = display_mode
13
13
  end
14
14
 
15
-
16
15
  # faster testing
17
- # POOL_LIST_URLS = ["http://www1.toronto.ca/parks/prd/facilities/indoor-pools/index.htm"]
16
+ # POOL_LIST_URLS = ["https://web.toronto.ca/data/parks/prd/facilities/indoor-pools/index.html"]
18
17
  # Full list
19
- POOL_LIST_URLS = ["http://www1.toronto.ca/parks/prd/facilities/indoor-pools/index.htm",
20
- "http://www1.toronto.ca/parks/prd/facilities/indoor-pools/2-indoor_pool.htm",
21
- "http://www1.toronto.ca/parks/prd/facilities/outdoor-pools/index.htm",
22
- "http://www1.toronto.ca/parks/prd/facilities/outdoor-pools/2-outdoor_pool.htm"]
18
+ POOL_LIST_URLS = [ "https://web.toronto.ca/data/parks/prd/facilities/indoor-pools/index.html","https://web.toronto.ca/data/parks/prd/facilities/outdoor-pools/index.html" ]
23
19
 
24
20
  Geocoder.configure(:timeout => 10)
25
21
 
@@ -29,11 +25,14 @@ module Scraper
29
25
  POOL_LIST_URLS.each do |url|
30
26
  doc = Nokogiri::HTML(open(url))
31
27
  pools = doc.at_css("#pfrBody > div.pfrListing > table > tbody")
32
- pool_names += pools.css('a').map { |link| link.children.text }
33
- pool_links += pools.css('a').map { |link| link['href'] }
28
+ pool_names += pools.css('a').map { |link| link.children.text unless link.children.text == "" }.compact
29
+ pool_links += pools.css('a').map { |link| link['href'] if link['href'].match(/parks\/prd\/facilities\/complex/) }.compact
34
30
  pool_addresses += gather_pool_addresses(pools)
35
31
  end
36
32
 
33
+ array_length_equality = pool_names.length == pool_links.length && pool_links.length == pool_addresses.length
34
+ raise "Pool information lengths are unequal, the website schema has likely changed" unless array_length_equality
35
+
37
36
  # Geotag pools
38
37
  puts "\n--- Scraping pool coordinates ---"
39
38
  pool_coordinates = pool_addresses.map { |address| gather_pool_coordinates(address) }
@@ -52,7 +51,6 @@ module Scraper
52
51
  File.open("pool_urls.json","w") do |f|
53
52
  f.write(@pool_urls.to_json)
54
53
  end
55
-
56
54
  @pool_urls
57
55
  end
58
56
 
@@ -88,36 +86,14 @@ module Scraper
88
86
  end
89
87
 
90
88
  def gather_pool_addresses(pools)
91
- pool_addresses = []
92
- address_index_incrementer = pools.css('td').length / pools.css('tr').length
93
- pools.css('td').each_with_index do |node, index|
89
+ address_index = pools.css('td').length / pools.css('tr').length
90
+
91
+ pools.css('td').each_with_object([]).with_index do |(node, pool_addresses), index|
94
92
  # Address is always second column, table width varies for indoor vs. outdoor
95
- if index % address_index_incrementer == 1
93
+ if index % address_index == 1
96
94
  pool_addresses << node.text
97
95
  end
98
96
  end
99
- pool_addresses
100
- end
101
-
102
- # Method accepting a block that suppresses stdout/console logging
103
- # https://gist.github.com/moertel/11091573
104
-
105
- def suppress_output
106
- begin
107
- original_stderr = $stderr.clone
108
- original_stdout = $stdout.clone
109
- $stderr.reopen(File.new('/dev/null', 'w'))
110
- $stdout.reopen(File.new('/dev/null', 'w'))
111
- retval = yield
112
- rescue Exception => e
113
- $stdout.reopen(original_stdout)
114
- $stderr.reopen(original_stderr)
115
- raise e
116
- ensure
117
- $stdout.reopen(original_stdout)
118
- $stderr.reopen(original_stderr)
119
- end
120
- retval
121
97
  end
122
98
 
123
99
  def gather_pool_coordinates(address)
@@ -127,10 +103,10 @@ module Scraper
127
103
  print "."
128
104
  end
129
105
 
130
- coordinates_arr = suppress_output{ Geocoder.coordinates("#{address}, Toronto") }
106
+ coordinates_arr = Geocoder.coordinates("#{address}, Toronto")
131
107
 
132
- # To avoid triggering google API limit of 10 queries per second
133
- sleep(0.15)
108
+ # To avoid triggering google API limit of 50 queries per second
109
+ sleep(0.02)
134
110
  return { latitude: coordinates_arr[0], longitude: coordinates_arr[1] }
135
111
  end
136
112
 
@@ -145,14 +121,12 @@ module Scraper
145
121
 
146
122
  puts "\n--- Scraping pool swim times ---"
147
123
  @pool_urls.each do |pool|
148
-
149
124
  if @display_mode == "verbose"
150
125
  puts "Scraping: " + pool[:name]
151
126
  else
152
127
  print "."
153
128
  end
154
-
155
- url = "http://www1.toronto.ca" + pool[:url]
129
+ url = "https://www.toronto.ca" + pool[:url]
156
130
  doc = Nokogiri::HTML(open(url))
157
131
  pool[:times] = build_pool_schedule_array_from_html(doc)
158
132
  end
@@ -168,7 +142,7 @@ module Scraper
168
142
  def gather_pool_program_cost_status
169
143
  @pools = JSON.parse(File.read('pools_data.json'), symbolize_names: true)
170
144
 
171
- page = "http://www1.toronto.ca/wps/portal/contentonly?vgnextoid=aaafdada600f0410VgnVCM10000071d60f89RCRD&vgnextchannel=a96adada600f0410VgnVCM10000071d60f89RCRD"
145
+ page = "https://www1.toronto.ca/wps/portal/contentonly?vgnextoid=aaafdada600f0410VgnVCM10000071d60f89RCRD&vgnextchannel=a96adada600f0410VgnVCM10000071d60f89RCRD"
172
146
  doc = Nokogiri::HTML(open(page))
173
147
  free_facility_article = doc.at_css("#maincontent")
174
148
  links = free_facility_article.css('a')
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: TOSwimScraper
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Erich Welz
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-04-22 00:00:00.000000000 Z
11
+ date: 2017-11-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -115,9 +115,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
115
115
  version: '0'
116
116
  requirements: []
117
117
  rubyforge_project:
118
- rubygems_version: 2.4.8
118
+ rubygems_version: 2.5.1
119
119
  signing_key:
120
120
  specification_version: 4
121
121
  summary: Scraper to grab City of Toronto lane swim data creating a JSON file with
122
122
  geotagged pools
123
123
  test_files: []
124
+ has_rdoc: