TOSwimScraper 0.1.2 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4)
  1. checksums.yaml +4 -4
  2. data/bin/scrape +3 -8
  3. data/lib/scraper.rb +16 -42
  4. metadata +4 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 8f558aee6fa59fd495364ee0fff61f2a62bda173
-  data.tar.gz: 199647dd4bb4b78eaf29e343e3dd59ebde3f1ff7
+  metadata.gz: 1944dab62289f4921c52d1e38567c2ab2ff1f6cd
+  data.tar.gz: 778e414fa793222f93520bbe3e0e3fced1ff5d81
 SHA512:
-  metadata.gz: 2bba02a899fd0579880a33c45720f83b48fe121690f7f0fb545d74e7a23004d68ac598035f2c83dd4c074d0582f6db78c990a32ecae9bd3cdd3ad0cb83a07ee0
-  data.tar.gz: d0b48204133a4fb41b643c79f1d858a6f086d65de20fe2a1092db3f25edce6f04ad33cd0a5e0f5ddb0c3f397cc86c8835ec2412c482f6f6bf1f8e70d788b3f2c
+  metadata.gz: 048837648e2d9ac433403ad87ae7ce3e49caba5798ba230657a7338500cbda51f7adf3e076e4df70bc8311f02d2cfa8bb29a92ccdc6492fee5ea13b4becc9f97
+  data.tar.gz: 2d6407539497a44d432cf46130ccc16e125b8afff8521ea96ff4ead43bf0b492132ffd7899e98ea37a4378cb838ca53571b9d01f37272e7c86fa8b4f3e75a74f
data/bin/scrape CHANGED
@@ -24,12 +24,7 @@ else
 
   Scraper.display_mode(display_mode)
 
-  if ARGV.include?('-f')
-    Scraper.gather_pool_info
-    Scraper.gather_pool_swim_times
-    Scraper.gather_pool_program_cost_status
-  elsif ARGV.include?('-s')
-    Scraper.gather_pool_swim_times
-    Scraper.gather_pool_program_cost_status
-  end
+  Scraper.gather_pool_info if ARGV.include?('-f')
+  Scraper.gather_pool_swim_times
+  Scraper.gather_pool_program_cost_status
 end
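
Net effect of this change: gather_pool_swim_times and gather_pool_program_cost_status now run on every invocation, -f only adds the slow pool-list rebuild, and the old -s flag is simply ignored, since its behaviour became the default. A sketch of the new dispatch with the implied behaviour spelled out; the comments are editorial, not from the source:

    # Inside the `else` branch of bin/scrape, post-0.1.5:
    Scraper.gather_pool_info if ARGV.include?('-f')  # full rebuild: pool list + geocoding
    Scraper.gather_pool_swim_times                   # always: schedule for each saved pool
    Scraper.gather_pool_program_cost_status          # always: free/paid status per pool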
data/lib/scraper.rb CHANGED
@@ -12,14 +12,10 @@ module Scraper
     @display_mode = display_mode
   end
 
-
   # faster testing
-  # POOL_LIST_URLS = ["http://www1.toronto.ca/parks/prd/facilities/indoor-pools/index.htm"]
+  # POOL_LIST_URLS = ["https://web.toronto.ca/data/parks/prd/facilities/indoor-pools/index.html"]
   # Full list
-  POOL_LIST_URLS = ["http://www1.toronto.ca/parks/prd/facilities/indoor-pools/index.htm",
-                    "http://www1.toronto.ca/parks/prd/facilities/indoor-pools/2-indoor_pool.htm",
-                    "http://www1.toronto.ca/parks/prd/facilities/outdoor-pools/index.htm",
-                    "http://www1.toronto.ca/parks/prd/facilities/outdoor-pools/2-outdoor_pool.htm"]
+  POOL_LIST_URLS = [ "https://web.toronto.ca/data/parks/prd/facilities/indoor-pools/index.html","https://web.toronto.ca/data/parks/prd/facilities/outdoor-pools/index.html" ]
 
   Geocoder.configure(:timeout => 10)
 
@@ -29,11 +25,14 @@ module Scraper
     POOL_LIST_URLS.each do |url|
       doc = Nokogiri::HTML(open(url))
       pools = doc.at_css("#pfrBody > div.pfrListing > table > tbody")
-      pool_names += pools.css('a').map { |link| link.children.text }
-      pool_links += pools.css('a').map { |link| link['href'] }
+      pool_names += pools.css('a').map { |link| link.children.text unless link.children.text == "" }.compact
+      pool_links += pools.css('a').map { |link| link['href'] if link['href'].match(/parks\/prd\/facilities\/complex/) }.compact
       pool_addresses += gather_pool_addresses(pools)
     end
 
+    array_length_equality = pool_names.length == pool_links.length && pool_links.length == pool_addresses.length
+    raise "Pool information lengths are unequal, the website schema has likely changed" unless array_length_equality
+
     # Geotag pools
     puts "\n--- Scraping pool coordinates ---"
     pool_coordinates = pool_addresses.map { |address| gather_pool_coordinates(address) }
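
Two defensive changes land in this hunk. The map { ... if/unless ... }.compact idiom filters while it maps: the block returns nil for anchors that fail the guard (blank link text, hrefs outside parks/prd/facilities/complex), and compact drops the nils so stray anchors can no longer desynchronize the arrays. The new length check then fails fast when the page layout drifts anyway. A standalone sketch of the filtering idiom, with made-up data:

    hrefs = ["/parks/prd/facilities/complex/189", "/some/other/page", nil]

    facility_links = hrefs.map { |h| h if h.to_s.match(/parks\/prd\/facilities\/complex/) }.compact
    # => ["/parks/prd/facilities/complex/189"]

    # Ruby 2.7+ can express the same pass in one call:
    # hrefs.filter_map { |h| h if h.to_s.match(/parks\/prd\/facilities\/complex/) }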
@@ -52,7 +51,6 @@ module Scraper
     File.open("pool_urls.json","w") do |f|
       f.write(@pool_urls.to_json)
     end
-
     @pool_urls
   end
 
@@ -88,36 +86,14 @@ module Scraper
   end
 
   def gather_pool_addresses(pools)
-    pool_addresses = []
-    address_index_incrementer = pools.css('td').length / pools.css('tr').length
-    pools.css('td').each_with_index do |node, index|
+    address_index = pools.css('td').length / pools.css('tr').length
+
+    pools.css('td').each_with_object([]).with_index do |(node, pool_addresses), index|
       # Address is always second column, table width varies for indoor vs. outdoor
-      if index % address_index_incrementer == 1
+      if index % address_index == 1
         pool_addresses << node.text
       end
     end
-    pool_addresses
-  end
-
-  # Method accepting a block that supresses stdout/console logging
-  # https://gist.github.com/moertel/11091573
-
-  def suppress_output
-    begin
-      original_stderr = $stderr.clone
-      original_stdout = $stdout.clone
-      $stderr.reopen(File.new('/dev/null', 'w'))
-      $stdout.reopen(File.new('/dev/null', 'w'))
-      retval = yield
-    rescue Exception => e
-      $stdout.reopen(original_stdout)
-      $stderr.reopen(original_stderr)
-      raise e
-    ensure
-      $stdout.reopen(original_stdout)
-      $stderr.reopen(original_stderr)
-    end
-    retval
   end
 
   def gather_pool_coordinates(address)
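
The rewritten gather_pool_addresses leans on a subtle Enumerator chain: each_with_object([]) called without a block returns an enumerator yielding [element, memo] pairs, .with_index adds a counter (so the block receives ((element, memo), index)), and the whole expression still evaluates to the memo array, which is why the explicit accumulator and trailing return line could be deleted. An isolated illustration against a fake three-column table:

    # Six <td> cells from two rows of a 3-column table (fabricated data):
    cells = ["High Park", "1873 Bloor St W", "6am-9pm",
             "Regent Park", "640 Dundas St E", "9am-5pm"]

    addresses = cells.each_with_object([]).with_index do |(cell, memo), index|
      memo << cell if index % 3 == 1   # keep the second column of each row
    end
    addresses  # => ["1873 Bloor St W", "640 Dundas St E"]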
@@ -127,10 +103,10 @@ module Scraper
       print "."
     end
 
-    coordinates_arr = suppress_output{ Geocoder.coordinates("#{address}, Toronto") }
+    coordinates_arr = Geocoder.coordinates("#{address}, Toronto")
 
-    # To avoid triggering google API limit of 10 queries per second
-    sleep(0.15)
+    # To avoid triggering google API limit of 50 queries per second
+    sleep(0.02)
     return { latitude: coordinates_arr[0], longitude: coordinates_arr[1] }
   end
 
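
The new sleep value is quota arithmetic: at 50 queries per second the minimum spacing is 1/50 = 0.02 s per request, and the old 10 qps comment explains the previous 0.15 s, which included margin. A generic sketch of the same client-side throttle, assuming the geocoder gem (which the diff already uses) and a sample address list:

    require 'geocoder'

    QPS_LIMIT    = 50                 # quota asserted in the diff's comment
    MIN_INTERVAL = 1.0 / QPS_LIMIT    # => 0.02, the new sleep value

    addresses = ["1873 Bloor St W", "640 Dundas St E"]  # hypothetical sample input
    addresses.each do |address|
      Geocoder.coordinates("#{address}, Toronto")
      sleep(MIN_INTERVAL)             # naive throttle: at most QPS_LIMIT calls per second
    end

Since each request's own latency adds to the spacing, sleeping the full interval is a conservative way to stay under the quota.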
@@ -145,14 +121,12 @@ module Scraper
 
     puts "\n--- Scraping pool swim times ---"
     @pool_urls.each do |pool|
-
       if @display_mode == "verbose"
         puts "Scraping: " + pool[:name]
       else
         print "."
       end
-
-      url = "http://www1.toronto.ca" + pool[:url]
+      url = "https://www.toronto.ca" + pool[:url]
       doc = Nokogiri::HTML(open(url))
       pool[:times] = build_pool_schedule_array_from_html(doc)
     end
@@ -168,7 +142,7 @@ module Scraper
   def gather_pool_program_cost_status
     @pools = JSON.parse(File.read('pools_data.json'), symbolize_names: true)
 
-    page = "http://www1.toronto.ca/wps/portal/contentonly?vgnextoid=aaafdada600f0410VgnVCM10000071d60f89RCRD&vgnextchannel=a96adada600f0410VgnVCM10000071d60f89RCRD"
+    page = "https://www1.toronto.ca/wps/portal/contentonly?vgnextoid=aaafdada600f0410VgnVCM10000071d60f89RCRD&vgnextchannel=a96adada600f0410VgnVCM10000071d60f89RCRD"
     doc = Nokogiri::HTML(open(page))
     free_facility_article = doc.at_css("#maincontent")
     links = free_facility_article.css('a')
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: TOSwimScraper
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.5
 platform: ruby
 authors:
 - Erich Welz
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-04-22 00:00:00.000000000 Z
+date: 2017-11-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -115,9 +115,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.8
+rubygems_version: 2.5.1
 signing_key:
 specification_version: 4
 summary: Scraper to grab City of Toronto lane swim data creating a JSON file with
   geotagged pools
 test_files: []
+has_rdoc: