RubyGems - nz_pol_scrapers - Versions diffs - 0.1.0 - Mend

nz_pol_scrapers 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

checksums.yaml +7 -0
data/lib/nz_pol_scrapers.rb +2 -0
data/lib/scrapers/electorate_results_scraper.rb +142 -0
data/lib/scrapers/electorate_scraper.rb +57 -0
metadata +60 -0

checksums.yaml ADDED

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: 1c59b700dd438b7052ef8e78e9a9258aa6dc1b8d
+  data.tar.gz: 6943e11f8dbb7c1d1399687122aeb4b13ba13a00
+SHA512:
+  metadata.gz: 9fefa50f0b553f33ef41cd2e458c9448847b2fbe7c2d887fe461d54f16947a6f832e263ca0e9d9dda74910f4f381326db1dd8bbe72b25cdcf76c336629530798
+  data.tar.gz: 25e6ee879973348baa784dfaf1ac99a76e5acea9979eb9a540a6c9869c8c1d9bccd3d5898fef47e7781283e66b6f8807243424255be0891098d5a15181fb36d1

data/lib/nz_pol_scrapers.rb ADDED

	@@ -0,0 +1,2 @@
1	+ # require every Ruby file in the scrapers dir, sorted so the load order does not depend on the filesystem
2	+ Dir.glob(File.join(File.dirname(__FILE__), 'scrapers', '*.rb')).sort.each { |f| require f }

data/lib/scrapers/electorate_results_scraper.rb ADDED

@@ -0,0 +1,142 @@
+# encoding: utf-8
+module NZPolScrapers
+  class ElectorateResultsScraper
+    require 'nokogiri'
+    require 'open-uri'
+    require 'yaml'
+    def self.scrape_to_files(directory)
+      scrape do |parsed_rows, electorate|
+        year = parsed_rows.first[:candidacy][:year]
+        file = File.new("#{directory}/#{electorate[:name]} #{year}.yml", "w")
+        file.puts parsed_rows.to_yaml
+        file.close
+      end
+      return
+    end
+    def self.scrape_to_hash
+      result = {}
+      scrape do |parsed_rows, electorate|
+        result[:"#{electorate[:name]}"] = { :"#{parsed_rows.first[:candidacy][:year]}" => parsed_rows }
+      end
+      result
+    end
+    private
+    def self.scrape(&block)
+      # loop through each of the electorates
+      # we load them from wikipedia too see: electorate_scraper.rb
+      NZPolScrapers::ElectorateScraper.scrape_to_array.each do |electorate|
+        election_result_tables = election_result_tables_for(electorate)
+        # within each table get yer parse on
+        election_result_tables.each do |election_result_table|
+          parsed_rows = parsed_rows_from_election_result_table(election_result_table, electorate)
+          yield parsed_rows, electorate
+        end
+      end
+    end
+    def self.election_result_tables_for(electorate)
+      # load up the main content of the wikipedia page
+      doc = Nokogiri::HTML(open(electorate[:url])).at_css('#mw-content-text')
+      # find all the tables representing election results
+      election_result_tables = doc.css('table[border="1"]')
+    end
+    def self.parsed_rows_from_election_result_table(election_result_table, electorate)
+      year = election_result_table.children.first.children.first.children.first.attributes['title'].value.split(' ').last
+      puts "Scraping results for #{electorate[:name]}, #{year}"
+      table_title = election_result_table.children.first.children.text
+      election_type = table_title =~ /by\-election/ ? 'by-election' : 'general'
+      # A candidacy is an instance of a candidate seeking election in an electorate
+      candidacy_rows = election_result_table.css('tr.vcard')
+      parsed_rows = []
+      candidacy_rows.each do |candidacy_row|
+        # don't parse anything if the fourth child is all blank text, this means there is no candidate name or vote information
+        unless candidacy_row.children[4].children.text.empty?
+          parsed_row = { party: party_for_candidacy_row(candidacy_row),
+                        candidate: candidate_for_candidacy_row(candidacy_row),
+                        candidacy: candidacy_for_candidacy_row(candidacy_row, year, electorate[:name], election_type)
+                      }
+          parsed_rows << parsed_row
+          puts "...parsed #{parsed_row[:candidate][:name]} of #{parsed_row[:party][:name]}"
+        end
+      end
+      parsed_rows
+    end
+    def self.party_for_candidacy_row(candidacy_row)
+      party = {}
+      party[:colour] = candidacy_row.children.first.attributes['style'].value.split(';').first.split(' ').last
+      party[:name] = party_name_for_candidacy_row(candidacy_row)
+      party[:short_name] = short_party_name_for_candidacy_row(candidacy_row)
+      party
+    end
+    def self.candidate_for_candidacy_row(candidacy_row)
+      candidate = {}
+      candidacy_row.children[4].css('img').remove # remove any image tags from within the name cell
+      candidacy_row.children[4].css('sup').remove # remove any superscript tags from within the name cell
+      name = candidacy_row.children[4].children.text
+      cut_bullshit_from(name)
+      candidate[:name] = name
+      split_name = candidate[:name].split(' ')
+      candidate[:last_name] = split_name.pop
+      candidate[:first_name] = split_name.join(' ')
+      candidate
+    end
+    def self.candidacy_for_candidacy_row(candidacy_row, year, electorate, election_type)
+      candidacy = {}
+      candidacy[:votes] = candidacy_row.children[6].text.delete(',').to_i
+      candidacy[:percent] = candidacy_row.children[8].text.to_f
+      candidacy[:electorate] = electorate
+      candidacy[:year] = year
+      candidacy[:election_type] = election_type
+      candidacy
+    end
+    def self.party_name_for_candidacy_row(candidacy_row)
+      # some cells contain links, others don't, so we need to get different values depending on what's there
+      unless candidacy_row.children[2].children.first.attributes['title'].nil?
+        name = candidacy_row.children[2].children.first.attributes['title'].value
+      else
+        name = candidacy_row.children[2].children.first.text
+      end
+      cut_bullshit_from(name)
+    end
+    def self.short_party_name_for_candidacy_row(candidacy_row)
+      # likewise for the regular party name, in some cases there is only one name so we must fall back
+      unless candidacy_row.children[2].children.last.children.text == ''
+        name = candidacy_row.children[2].children.last.children.text
+      else
+        name = candidacy_row.children[2].children.last.text
+      end
+      cut_bullshit_from(name)
+    end
+    def self.cut_bullshit_from(name)
+      bullshits = [' (New Zealand)',
+                   ' (New Zealand political party)',
+                   ' (politician)',
+                   ' (page does not exist)',
+                   ' (political party)',
+                   'Y ',
+                   'N ',
+                   '[note 1]']
+      bullshits.each {|bullshit| name.slice! bullshit }
+      name
+    end
+  end
+end

data/lib/scrapers/electorate_scraper.rb ADDED

@@ -0,0 +1,57 @@
+# encoding: utf-8
+module NZPolScrapers
+  class ElectorateScraper
+    require 'nokogiri'
+    require 'open-uri'
+    require 'yaml'
+    def self.scrape_to_files(directory)
+      result = scrape_to_hash
+      file = File.new("#{directory}/NZ Electorates.yml", "w")
+      file.puts result.to_yaml
+      file.close
+    end
+    def self.scrape_to_hash
+      result = {}
+      scrape do |name, url|
+        result[:"#{name}"] = { name: name, url: url }
+      end
+      result
+    end
+    def self.scrape_to_array
+      result = []
+      scrape do |name, url|
+        result << { name: name, url: url }
+      end
+      result
+    end
+    private
+    def self.scrape(&block)
+      # visit the nz electorates index on wikipedia
+      url = 'http://en.wikipedia.org/wiki/Category:New_Zealand_electorates'
+      doc = Nokogiri::HTML(open(url)).at_css('#mw-pages')
+      # select all the category links
+      links = doc.css('li a')
+      # load up the electorate page addresses and names into an array of hashes
+      electorates = []
+      links.each do |link|
+        name = clean_electorate_name(link.attributes['title'].value)
+        url = "http://en.wikipedia.org#{link.attributes['href'].value}"
+        yield name, url unless name == 'New Zealand electorates'
+      end
+    end
+    def self.clean_electorate_name(name)
+      name.slice! ' (New Zealand electorate)'
+      name
+    end
+  end
+end

metadata ADDED

@@ -0,0 +1,60 @@
+--- !ruby/object:Gem::Specification
+name: nz_pol_scrapers
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+platform: ruby
+authors:
+- Giles Thompson
+autorequire:
+bindir: bin
+cert_chain: []
+date: 2014-03-18 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: nokogiri
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.6'
+description: Wikipedia scrapers which parse information about NZ politics
+email:
+- iam@gilesthompson.co.nz
+executables: []
+extensions: []
+extra_rdoc_files: []
+files:
+- lib/nz_pol_scrapers.rb
+- lib/scrapers/electorate_results_scraper.rb
+- lib/scrapers/electorate_scraper.rb
+homepage: http://github.com/gilest/nz_pol_scrapers
+licenses: []
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 1.9.2
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: 1.3.2
+requirements: []
+rubyforge_project:
+rubygems_version: 2.2.2
+signing_key:
+specification_version: 4
+summary: Wikipedia scrapers which parse information about NZ politics
+test_files: []