RubyGems - TOSwimScraper - Versions diffs - 0.1.2 → 0.1.5 - Mend

TOSwimScraper 0.1.2 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4)

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: 8f558aee6fa59fd495364ee0fff61f2a62bda173
-  data.tar.gz: 199647dd4bb4b78eaf29e343e3dd59ebde3f1ff7
+  metadata.gz: 1944dab62289f4921c52d1e38567c2ab2ff1f6cd
+  data.tar.gz: 778e414fa793222f93520bbe3e0e3fced1ff5d81
 SHA512:
-  metadata.gz: 2bba02a899fd0579880a33c45720f83b48fe121690f7f0fb545d74e7a23004d68ac598035f2c83dd4c074d0582f6db78c990a32ecae9bd3cdd3ad0cb83a07ee0
-  data.tar.gz: d0b48204133a4fb41b643c79f1d858a6f086d65de20fe2a1092db3f25edce6f04ad33cd0a5e0f5ddb0c3f397cc86c8835ec2412c482f6f6bf1f8e70d788b3f2c
+  metadata.gz: 048837648e2d9ac433403ad87ae7ce3e49caba5798ba230657a7338500cbda51f7adf3e076e4df70bc8311f02d2cfa8bb29a92ccdc6492fee5ea13b4becc9f97
+  data.tar.gz: 2d6407539497a44d432cf46130ccc16e125b8afff8521ea96ff4ead43bf0b492132ffd7899e98ea37a4378cb838ca53571b9d01f37272e7c86fa8b4f3e75a74f

data/bin/scrape CHANGED Viewed

@@ -24,12 +24,7 @@ else
   Scraper.display_mode(display_mode)
-  if ARGV.include?('-f')
-    Scraper.gather_pool_info
-    Scraper.gather_pool_swim_times
-    Scraper.gather_pool_program_cost_status
-  elsif ARGV.include?('-s')
-    Scraper.gather_pool_swim_times
-    Scraper.gather_pool_program_cost_status
-  end
+  Scraper.gather_pool_info if ARGV.include?('-f')
+  Scraper.gather_pool_swim_times
+  Scraper.gather_pool_program_cost_status
 end

data/lib/scraper.rb CHANGED Viewed

@@ -12,14 +12,10 @@ module Scraper
       @display_mode = display_mode
     end
     # faster testing
-    # POOL_LIST_URLS = ["http://www1.toronto.ca/parks/prd/facilities/indoor-pools/index.htm"]
+    # POOL_LIST_URLS = ["https://web.toronto.ca/data/parks/prd/facilities/indoor-pools/index.html"]
     # Full list
-    POOL_LIST_URLS = ["http://www1.toronto.ca/parks/prd/facilities/indoor-pools/index.htm",
-            "http://www1.toronto.ca/parks/prd/facilities/indoor-pools/2-indoor_pool.htm",
-            "http://www1.toronto.ca/parks/prd/facilities/outdoor-pools/index.htm",
-            "http://www1.toronto.ca/parks/prd/facilities/outdoor-pools/2-outdoor_pool.htm"]
+    POOL_LIST_URLS = [ "https://web.toronto.ca/data/parks/prd/facilities/indoor-pools/index.html","https://web.toronto.ca/data/parks/prd/facilities/outdoor-pools/index.html" ]
     Geocoder.configure(:timeout => 10)
@@ -29,11 +25,14 @@ module Scraper
       POOL_LIST_URLS.each do |url|
         doc = Nokogiri::HTML(open(url))
         pools = doc.at_css("#pfrBody > div.pfrListing > table > tbody")
-        pool_names += pools.css('a').map { |link| link.children.text }
-        pool_links += pools.css('a').map { |link| link['href'] }
+        pool_names += pools.css('a').map { |link| link.children.text unless link.children.text == "" }.compact
+        pool_links += pools.css('a').map { |link| link['href'] if link['href'].match(/parks\/prd\/facilities\/complex/) }.compact
         pool_addresses += gather_pool_addresses(pools)
       end
+      array_length_equality = pool_names.length == pool_links.length && pool_links.length == pool_addresses.length
+      raise "Pool information lengths are unequal, the website schema has likely changed" unless array_length_equality
       # Geotag pools
       puts "\n--- Scraping pool coordinates ---"
       pool_coordinates = pool_addresses.map { |address| gather_pool_coordinates(address) }
@@ -52,7 +51,6 @@ module Scraper
       File.open("pool_urls.json","w") do |f|
         f.write(@pool_urls.to_json)
       end
       @pool_urls
     end
@@ -88,36 +86,14 @@ module Scraper
     end
     def gather_pool_addresses(pools)
-      pool_addresses = []
-      address_index_incrementer = pools.css('td').length / pools.css('tr').length
-      pools.css('td').each_with_index do |node, index|
+      address_index = pools.css('td').length / pools.css('tr').length
+      pools.css('td').each_with_object([]).with_index do |(node, pool_addresses), index|
         # Address is always second column, table width varies for indoor vs. outdoor
-        if index % address_index_incrementer == 1
+        if index % address_index == 1
           pool_addresses << node.text
         end
       end
-      pool_addresses
-    end
-    # Method accepting a block that supresses stdout/console logging
-    #  https://gist.github.com/moertel/11091573
-    def suppress_output
-      begin
-        original_stderr = $stderr.clone
-        original_stdout = $stdout.clone
-        $stderr.reopen(File.new('/dev/null', 'w'))
-        $stdout.reopen(File.new('/dev/null', 'w'))
-        retval = yield
-      rescue Exception => e
-        $stdout.reopen(original_stdout)
-        $stderr.reopen(original_stderr)
-        raise e
-      ensure
-        $stdout.reopen(original_stdout)
-        $stderr.reopen(original_stderr)
-      end
-      retval
     end
     def gather_pool_coordinates(address)
@@ -127,10 +103,10 @@ module Scraper
         print "."
       end
-      coordinates_arr = suppress_output{ Geocoder.coordinates("#{address}, Toronto") }
+      coordinates_arr = Geocoder.coordinates("#{address}, Toronto")
-      # To avoid triggering google API limit of 10 queries per second
-      sleep(0.15)
+      # To avoid triggering google API limit of 50 queries per second
+      sleep(0.02)
       return { latitude: coordinates_arr[0], longitude: coordinates_arr[1] }
     end
@@ -145,14 +121,12 @@ module Scraper
       puts "\n--- Scraping pool swim times ---"
       @pool_urls.each do |pool|
         if @display_mode == "verbose"
           puts "Scraping: " + pool[:name]
         else
           print "."
         end
-        url = "http://www1.toronto.ca" + pool[:url]
+        url = "https://www.toronto.ca" + pool[:url]
         doc = Nokogiri::HTML(open(url))
         pool[:times] = build_pool_schedule_array_from_html(doc)
       end
@@ -168,7 +142,7 @@ module Scraper
     def gather_pool_program_cost_status
       @pools = JSON.parse(File.read('pools_data.json'), symbolize_names: true)
-      page = "http://www1.toronto.ca/wps/portal/contentonly?vgnextoid=aaafdada600f0410VgnVCM10000071d60f89RCRD&vgnextchannel=a96adada600f0410VgnVCM10000071d60f89RCRD"
+      page = "https://www1.toronto.ca/wps/portal/contentonly?vgnextoid=aaafdada600f0410VgnVCM10000071d60f89RCRD&vgnextchannel=a96adada600f0410VgnVCM10000071d60f89RCRD"
       doc = Nokogiri::HTML(open(page))
       free_facility_article = doc.at_css("#maincontent")
       links = free_facility_article.css('a')

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: TOSwimScraper
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.5
 platform: ruby
 authors:
 - Erich Welz
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-04-22 00:00:00.000000000 Z
+date: 2017-11-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: bundler
@@ -115,9 +115,10 @@ required_rubygems_version: !ruby/object:Gem::Requirement
       version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 2.4.8
+rubygems_version: 2.5.1
 signing_key:
 specification_version: 4
 summary: Scraper to grab City of Toronto lane swim data creating a JSON file with
   geotagged pools
 test_files: []
+has_rdoc: