nz_pol_scrapers 0.1.0

checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 1c59b700dd438b7052ef8e78e9a9258aa6dc1b8d
+   data.tar.gz: 6943e11f8dbb7c1d1399687122aeb4b13ba13a00
+ SHA512:
+   metadata.gz: 9fefa50f0b553f33ef41cd2e458c9448847b2fbe7c2d887fe461d54f16947a6f832e263ca0e9d9dda74910f4f381326db1dd8bbe72b25cdcf76c336629530798
+   data.tar.gz: 25e6ee879973348baa784dfaf1ac99a76e5acea9979eb9a540a6c9869c8c1d9bccd3d5898fef47e7781283e66b6f8807243424255be0891098d5a15181fb36d1
lib/nz_pol_scrapers.rb ADDED
@@ -0,0 +1,2 @@
+ # require all files in the scrapers dir
+ Dir.glob("#{File.dirname(__FILE__)}/scrapers/*").each { |f| require f }
lib/scrapers/electorate_results_scraper.rb ADDED
@@ -0,0 +1,142 @@
+ # encoding: utf-8
+
+ module NZPolScrapers
+   class ElectorateResultsScraper
+     require 'nokogiri'
+     require 'open-uri'
+     require 'yaml'
+
+     def self.scrape_to_files(directory)
+       scrape do |parsed_rows, electorate|
+         year = parsed_rows.first[:candidacy][:year]
+         file = File.new("#{directory}/#{electorate[:name]} #{year}.yml", "w")
+         file.puts parsed_rows.to_yaml
+         file.close
+       end
+       return
+     end
+
+     def self.scrape_to_hash
+       result = {}
+       scrape do |parsed_rows, electorate|
+         (result[:"#{electorate[:name]}"] ||= {})[:"#{parsed_rows.first[:candidacy][:year]}"] = parsed_rows
+       end
+       result
+     end
+
+     private
+
+     def self.scrape(&block)
+       # loop through each of the electorates
+       # we load them from wikipedia too (see electorate_scraper.rb)
+       NZPolScrapers::ElectorateScraper.scrape_to_array.each do |electorate|
+         election_result_tables = election_result_tables_for(electorate)
+         # within each table get yer parse on
+         election_result_tables.each do |election_result_table|
+           parsed_rows = parsed_rows_from_election_result_table(election_result_table, electorate)
+           yield parsed_rows, electorate
+         end
+       end
+     end
+
+     def self.election_result_tables_for(electorate)
+       # load up the main content of the wikipedia page
+       doc = Nokogiri::HTML(open(electorate[:url])).at_css('#mw-content-text')
+       # find all the tables representing election results
+       doc.css('table[border="1"]')
+     end
+
+     def self.parsed_rows_from_election_result_table(election_result_table, electorate)
+       year = election_result_table.children.first.children.first.children.first.attributes['title'].value.split(' ').last
+
+       puts "Scraping results for #{electorate[:name]}, #{year}"
+
+       table_title = election_result_table.children.first.children.text
+       election_type = table_title =~ /by\-election/ ? 'by-election' : 'general'
+
+       # A candidacy is an instance of a candidate seeking election in an electorate
+       candidacy_rows = election_result_table.css('tr.vcard')
+
+       parsed_rows = []
+
+       candidacy_rows.each do |candidacy_row|
+         # don't parse anything if the fourth child is all blank text; this means there is no candidate name or vote information
+         unless candidacy_row.children[4].children.text.empty?
+           parsed_row = { party: party_for_candidacy_row(candidacy_row),
+                          candidate: candidate_for_candidacy_row(candidacy_row),
+                          candidacy: candidacy_for_candidacy_row(candidacy_row, year, electorate[:name], election_type)
+                        }
+           parsed_rows << parsed_row
+           puts "...parsed #{parsed_row[:candidate][:name]} of #{parsed_row[:party][:name]}"
+         end
+       end
+
+       parsed_rows
+     end
+
+     def self.party_for_candidacy_row(candidacy_row)
+       party = {}
+       party[:colour] = candidacy_row.children.first.attributes['style'].value.split(';').first.split(' ').last
+       party[:name] = party_name_for_candidacy_row(candidacy_row)
+       party[:short_name] = short_party_name_for_candidacy_row(candidacy_row)
+       party
+     end
+
+     def self.candidate_for_candidacy_row(candidacy_row)
+       candidate = {}
+       candidacy_row.children[4].css('img').remove # remove any image tags from within the name cell
+       candidacy_row.children[4].css('sup').remove # remove any superscript tags from within the name cell
+       name = candidacy_row.children[4].children.text
+       cut_bullshit_from(name)
+       candidate[:name] = name
+       split_name = candidate[:name].split(' ')
+       candidate[:last_name] = split_name.pop
+       candidate[:first_name] = split_name.join(' ')
+       candidate
+     end
+
+     def self.candidacy_for_candidacy_row(candidacy_row, year, electorate, election_type)
+       candidacy = {}
+       candidacy[:votes] = candidacy_row.children[6].text.delete(',').to_i
+       candidacy[:percent] = candidacy_row.children[8].text.to_f
+       candidacy[:electorate] = electorate
+       candidacy[:year] = year
+       candidacy[:election_type] = election_type
+       candidacy
+     end
+
+     def self.party_name_for_candidacy_row(candidacy_row)
+       # some cells contain links, others don't, so we need to get different values depending on what's there
+       unless candidacy_row.children[2].children.first.attributes['title'].nil?
+         name = candidacy_row.children[2].children.first.attributes['title'].value
+       else
+         name = candidacy_row.children[2].children.first.text
+       end
+       cut_bullshit_from(name)
+     end
+
+     def self.short_party_name_for_candidacy_row(candidacy_row)
+       # likewise for the regular party name; in some cases there is only one name, so we must fall back
+       unless candidacy_row.children[2].children.last.children.text == ''
+         name = candidacy_row.children[2].children.last.children.text
+       else
+         name = candidacy_row.children[2].children.last.text
+       end
+       cut_bullshit_from(name)
+     end
+
+     def self.cut_bullshit_from(name)
+       bullshits = [' (New Zealand)',
+                    ' (New Zealand political party)',
+                    ' (politician)',
+                    ' (page does not exist)',
+                    ' (political party)',
+                    'Y ',
+                    'N ',
+                    '[note 1]']
+       bullshits.each {|bullshit| name.slice! bullshit }
+       name
+     end
+
+   end
+ end
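A hedged sketch of how the results scraper above might be driven; the output directory is a hypothetical example, and the row shape is taken from parsed_rows_from_election_result_table. Both calls fetch pages from Wikipedia over HTTP, so they need network access:

require 'nz_pol_scrapers'

# Write one "<electorate name> <year>.yml" file per election-result table found.
NZPolScrapers::ElectorateResultsScraper.scrape_to_files('/tmp/nz_results')

# Or collect everything in memory: a hash keyed by electorate name, then by year,
# where each entry is an array of rows with :party, :candidate and :candidacy hashes.
results = NZPolScrapers::ElectorateResultsScraper.scrape_to_hash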
lib/scrapers/electorate_scraper.rb ADDED
@@ -0,0 +1,56 @@
+ # encoding: utf-8
+
+ module NZPolScrapers
+   class ElectorateScraper
+     require 'nokogiri'
+     require 'open-uri'
+     require 'yaml'
+
+     def self.scrape_to_files(directory)
+       result = scrape_to_hash
+       file = File.new("#{directory}/NZ Electorates.yml", "w")
+       file.puts result.to_yaml
+       file.close
+     end
+
+     def self.scrape_to_hash
+       result = {}
+       scrape do |name, url|
+         result[:"#{name}"] = { name: name, url: url }
+       end
+       result
+     end
+
+     def self.scrape_to_array
+       result = []
+       scrape do |name, url|
+         result << { name: name, url: url }
+       end
+       result
+     end
+
+     private
+
+     def self.scrape(&block)
+       # visit the nz electorates index on wikipedia
+       url = 'http://en.wikipedia.org/wiki/Category:New_Zealand_electorates'
+       doc = Nokogiri::HTML(open(url)).at_css('#mw-pages')
+
+       # select all the category links
+       links = doc.css('li a')
+
+       # yield each electorate's name and page address
+       links.each do |link|
+         name = clean_electorate_name(link.attributes['title'].value)
+         url = "http://en.wikipedia.org#{link.attributes['href'].value}"
+         yield name, url unless name == 'New Zealand electorates'
+       end
+     end
+
+     def self.clean_electorate_name(name)
+       name.slice! ' (New Zealand electorate)'
+       name
+     end
+
+   end
+ end
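The electorate scraper can also be used on its own; a small sketch, with return shapes taken from the methods above:

require 'nz_pol_scrapers'

# Array form: one { name: ..., url: ... } hash per electorate page in the Wikipedia category.
electorates = NZPolScrapers::ElectorateScraper.scrape_to_array

# Hash form: the same data keyed by electorate name (as a symbol).
by_name = NZPolScrapers::ElectorateScraper.scrape_to_hash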
metadata ADDED
@@ -0,0 +1,60 @@
+ --- !ruby/object:Gem::Specification
+ name: nz_pol_scrapers
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Giles Thompson
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-03-18 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+ description: Wikipedia scrapers which parse information about NZ politics
+ email:
+ - iam@gilesthompson.co.nz
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/nz_pol_scrapers.rb
+ - lib/scrapers/electorate_results_scraper.rb
+ - lib/scrapers/electorate_scraper.rb
+ homepage: http://github.com/gilest/nz_pol_scrapers
+ licenses: []
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 1.9.2
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 1.3.2
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: Wikipedia scrapers which parse information about NZ politics
+ test_files: []
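To depend on this release from another project, the gemspec above translates into a single Gemfile line; Bundler will also pull in nokogiri ~> 1.6, the gem's only runtime dependency:

# Gemfile
gem 'nz_pol_scrapers', '0.1.0'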