nz_pol_scrapers 0.1.0

checksums.yaml.gz ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: 1c59b700dd438b7052ef8e78e9a9258aa6dc1b8d
+   data.tar.gz: 6943e11f8dbb7c1d1399687122aeb4b13ba13a00
+ SHA512:
+   metadata.gz: 9fefa50f0b553f33ef41cd2e458c9448847b2fbe7c2d887fe461d54f16947a6f832e263ca0e9d9dda74910f4f381326db1dd8bbe72b25cdcf76c336629530798
+   data.tar.gz: 25e6ee879973348baa784dfaf1ac99a76e5acea9979eb9a540a6c9869c8c1d9bccd3d5898fef47e7781283e66b6f8807243424255be0891098d5a15181fb36d1
lib/nz_pol_scrapers.rb ADDED
@@ -0,0 +1,2 @@
+ # require all files in the scrapers dir
+ Dir.glob("#{File.dirname(__FILE__)}/scrapers/*").each { |f| require f }
lib/scrapers/electorate_results_scraper.rb ADDED
@@ -0,0 +1,142 @@
+ # encoding: utf-8
+
+ module NZPolScrapers
+   class ElectorateResultsScraper
+     require 'nokogiri'
+     require 'open-uri'
+     require 'yaml'
+
+     def self.scrape_to_files(directory)
+       scrape do |parsed_rows, electorate|
+         year = parsed_rows.first[:candidacy][:year]
+         file = File.new("#{directory}/#{electorate[:name]} #{year}.yml", "w")
+         file.puts parsed_rows.to_yaml
+         file.close
+       end
+       return
+     end
+
+     def self.scrape_to_hash
+       result = {}
+       scrape do |parsed_rows, electorate|
+         (result[:"#{electorate[:name]}"] ||= {})[:"#{parsed_rows.first[:candidacy][:year]}"] = parsed_rows
+       end
+       result
+     end
+
+     private
+
+     def self.scrape(&block)
+       # loop through each of the electorates
+       # we load them from wikipedia too (see electorate_scraper.rb)
+       NZPolScrapers::ElectorateScraper.scrape_to_array.each do |electorate|
+         election_result_tables = election_result_tables_for(electorate)
+         # within each table get yer parse on
+         election_result_tables.each do |election_result_table|
+           parsed_rows = parsed_rows_from_election_result_table(election_result_table, electorate)
+           yield parsed_rows, electorate
+         end
+       end
+     end
+
+     def self.election_result_tables_for(electorate)
+       # load up the main content of the wikipedia page
+       doc = Nokogiri::HTML(open(electorate[:url])).at_css('#mw-content-text')
+       # find all the tables representing election results
+       doc.css('table[border="1"]')
+     end
+
+     def self.parsed_rows_from_election_result_table(election_result_table, electorate)
+       year = election_result_table.children.first.children.first.children.first.attributes['title'].value.split(' ').last
+
+       puts "Scraping results for #{electorate[:name]}, #{year}"
+
+       table_title = election_result_table.children.first.children.text
+       election_type = table_title =~ /by\-election/ ? 'by-election' : 'general'
+
+       # A candidacy is an instance of a candidate seeking election in an electorate
+       candidacy_rows = election_result_table.css('tr.vcard')
+
+       parsed_rows = []
+
+       candidacy_rows.each do |candidacy_row|
+         # don't parse anything if the fourth child is all blank text; this means there is no candidate name or vote information
+         unless candidacy_row.children[4].children.text.empty?
+           parsed_row = { party: party_for_candidacy_row(candidacy_row),
+                          candidate: candidate_for_candidacy_row(candidacy_row),
+                          candidacy: candidacy_for_candidacy_row(candidacy_row, year, electorate[:name], election_type)
+                        }
+           parsed_rows << parsed_row
+           puts "...parsed #{parsed_row[:candidate][:name]} of #{parsed_row[:party][:name]}"
+         end
+       end
+
+       parsed_rows
+     end
+
+     def self.party_for_candidacy_row(candidacy_row)
+       party = {}
+       party[:colour] = candidacy_row.children.first.attributes['style'].value.split(';').first.split(' ').last
+       party[:name] = party_name_for_candidacy_row(candidacy_row)
+       party[:short_name] = short_party_name_for_candidacy_row(candidacy_row)
+       party
+     end
+
+     def self.candidate_for_candidacy_row(candidacy_row)
+       candidate = {}
+       candidacy_row.children[4].css('img').remove # remove any image tags from within the name cell
+       candidacy_row.children[4].css('sup').remove # remove any superscript tags from within the name cell
+       name = candidacy_row.children[4].children.text
+       cut_bullshit_from(name)
+       candidate[:name] = name
+       split_name = candidate[:name].split(' ')
+       candidate[:last_name] = split_name.pop
+       candidate[:first_name] = split_name.join(' ')
+       candidate
+     end
+
+     def self.candidacy_for_candidacy_row(candidacy_row, year, electorate, election_type)
+       candidacy = {}
+       candidacy[:votes] = candidacy_row.children[6].text.delete(',').to_i
+       candidacy[:percent] = candidacy_row.children[8].text.to_f
+       candidacy[:electorate] = electorate
+       candidacy[:year] = year
+       candidacy[:election_type] = election_type
+       candidacy
+     end
+
+     def self.party_name_for_candidacy_row(candidacy_row)
+       # some cells contain links, others don't, so we need to get different values depending on what's there
+       unless candidacy_row.children[2].children.first.attributes['title'].nil?
+         name = candidacy_row.children[2].children.first.attributes['title'].value
+       else
+         name = candidacy_row.children[2].children.first.text
+       end
+       cut_bullshit_from(name)
+     end
+
+     def self.short_party_name_for_candidacy_row(candidacy_row)
+       # likewise for the regular party name; in some cases there is only one name, so we must fall back
+       unless candidacy_row.children[2].children.last.children.text == ''
+         name = candidacy_row.children[2].children.last.children.text
+       else
+         name = candidacy_row.children[2].children.last.text
+       end
+       cut_bullshit_from(name)
+     end
+
+     def self.cut_bullshit_from(name)
+       bullshits = [' (New Zealand)',
+                    ' (New Zealand political party)',
+                    ' (politician)',
+                    ' (page does not exist)',
+                    ' (political party)',
+                    'Y ',
+                    'N ',
+                    '[note 1]']
+       bullshits.each {|bullshit| name.slice! bullshit }
+       name
+     end
+
+   end
+ end
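A hedged sketch of how the results scraper above might be driven; the output directory is a hypothetical example, and the row shape is taken from parsed_rows_from_election_result_table. Both calls fetch pages from Wikipedia over HTTP, so they need network access:

require 'nz_pol_scrapers'

# Write one "<electorate name> <year>.yml" file per election-result table found.
NZPolScrapers::ElectorateResultsScraper.scrape_to_files('/tmp/nz_results')

# Or collect everything in memory: a hash keyed by electorate name, then by year,
# where each entry is an array of rows with :party, :candidate and :candidacy hashes.
results = NZPolScrapers::ElectorateResultsScraper.scrape_to_hash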
lib/scrapers/electorate_scraper.rb ADDED
@@ -0,0 +1,56 @@
+ # encoding: utf-8
+
+ module NZPolScrapers
+   class ElectorateScraper
+     require 'nokogiri'
+     require 'open-uri'
+     require 'yaml'
+
+     def self.scrape_to_files(directory)
+       result = scrape_to_hash
+       file = File.new("#{directory}/NZ Electorates.yml", "w")
+       file.puts result.to_yaml
+       file.close
+     end
+
+     def self.scrape_to_hash
+       result = {}
+       scrape do |name, url|
+         result[:"#{name}"] = { name: name, url: url }
+       end
+       result
+     end
+
+     def self.scrape_to_array
+       result = []
+       scrape do |name, url|
+         result << { name: name, url: url }
+       end
+       result
+     end
+
+     private
+
+     def self.scrape(&block)
+       # visit the nz electorates index on wikipedia
+       url = 'http://en.wikipedia.org/wiki/Category:New_Zealand_electorates'
+       doc = Nokogiri::HTML(open(url)).at_css('#mw-pages')
+
+       # select all the category links
+       links = doc.css('li a')
+
+       # yield each electorate's name and page address
+       links.each do |link|
+         name = clean_electorate_name(link.attributes['title'].value)
+         url = "http://en.wikipedia.org#{link.attributes['href'].value}"
+         yield name, url unless name == 'New Zealand electorates'
+       end
+     end
+
+     def self.clean_electorate_name(name)
+       name.slice! ' (New Zealand electorate)'
+       name
+     end
+
+   end
+ end
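The electorate scraper can also be used on its own; a small sketch, with return shapes taken from the methods above:

require 'nz_pol_scrapers'

# Array form: one { name: ..., url: ... } hash per electorate page in the Wikipedia category.
electorates = NZPolScrapers::ElectorateScraper.scrape_to_array

# Hash form: the same data keyed by electorate name (as a symbol).
by_name = NZPolScrapers::ElectorateScraper.scrape_to_hash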
metadata ADDED
@@ -0,0 +1,60 @@
+ --- !ruby/object:Gem::Specification
+ name: nz_pol_scrapers
+ version: !ruby/object:Gem::Version
+   version: 0.1.0
+ platform: ruby
+ authors:
+ - Giles Thompson
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2014-03-18 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: nokogiri
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '1.6'
+ description: Wikipedia scrapers which parse information about NZ politics
+ email:
+ - iam@gilesthompson.co.nz
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - lib/nz_pol_scrapers.rb
+ - lib/scrapers/electorate_results_scraper.rb
+ - lib/scrapers/electorate_scraper.rb
+ homepage: http://github.com/gilest/nz_pol_scrapers
+ licenses: []
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 1.9.2
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: 1.3.2
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.2.2
+ signing_key:
+ specification_version: 4
+ summary: Wikipedia scrapers which parse information about NZ politics
+ test_files: []
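To depend on this release from another project, the gemspec above translates into a single Gemfile line; Bundler will also pull in nokogiri ~> 1.6, the gem's only runtime dependency:

# Gemfile
gem 'nz_pol_scrapers', '0.1.0'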