nz_pol_scrapers 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/lib/nz_pol_scrapers.rb +2 -0
- data/lib/scrapers/electorate_results_scraper.rb +142 -0
- data/lib/scrapers/electorate_scraper.rb +57 -0
- metadata +60 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 1c59b700dd438b7052ef8e78e9a9258aa6dc1b8d
|
4
|
+
data.tar.gz: 6943e11f8dbb7c1d1399687122aeb4b13ba13a00
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 9fefa50f0b553f33ef41cd2e458c9448847b2fbe7c2d887fe461d54f16947a6f832e263ca0e9d9dda74910f4f381326db1dd8bbe72b25cdcf76c336629530798
|
7
|
+
data.tar.gz: 25e6ee879973348baa784dfaf1ac99a76e5acea9979eb9a540a6c9869c8c1d9bccd3d5898fef47e7781283e66b6f8807243424255be0891098d5a15181fb36d1
|
@@ -0,0 +1,142 @@
|
|
1
|
+
# encoding: utf-8

module NZPolScrapers
  # Scrapes candidate-level election results for every NZ electorate from
  # each electorate's Wikipedia page. Results can be written out as one
  # YAML file per election, or collected into a nested hash.
  class ElectorateResultsScraper
    require 'nokogiri'
    require 'open-uri'
    require 'yaml'

    # Wikipedia artefacts stripped from scraped party and candidate names
    # (disambiguation suffixes, winner/loser tick marks, footnote refs).
    NAME_ARTEFACTS = [' (New Zealand)',
                      ' (New Zealand political party)',
                      ' (politician)',
                      ' (page does not exist)',
                      ' (political party)',
                      'Y ',
                      'N ',
                      '[note 1]'].freeze

    # Scrapes every electorate and writes one YAML file per results table
    # into +directory+, named "<electorate name> <year>.yml".
    # NOTE(review): a by-election in the same year as a general election
    # produces the same filename and overwrites it -- confirm acceptable.
    # Returns nil.
    def self.scrape_to_files(directory)
      scrape do |parsed_rows, electorate|
        year = parsed_rows.first[:candidacy][:year]
        # Block form guarantees the file handle is closed even on error.
        File.open("#{directory}/#{electorate[:name]} #{year}.yml", "w") do |file|
          file.puts parsed_rows.to_yaml
        end
      end
      nil
    end

    # Returns a hash keyed by electorate name (symbol); each value is a
    # hash keyed by election year (symbol) whose values are the arrays of
    # parsed candidacy rows for that election.
    def self.scrape_to_hash
      result = {}
      scrape do |parsed_rows, electorate|
        electorate_key = :"#{electorate[:name]}"
        year_key = :"#{parsed_rows.first[:candidacy][:year]}"
        # Merge into any existing entry: an electorate yields one table
        # per election, and plain assignment would keep only the last
        # year's results for that electorate.
        (result[electorate_key] ||= {})[year_key] = parsed_rows
      end
      result
    end

    # Iterates every results table of every electorate, yielding
    # (parsed_rows, electorate) for each table that produced rows.
    def self.scrape
      # Electorate names/URLs are themselves scraped from Wikipedia;
      # see electorate_scraper.rb
      NZPolScrapers::ElectorateScraper.scrape_to_array.each do |electorate|
        election_result_tables_for(electorate).each do |election_result_table|
          parsed_rows = parsed_rows_from_election_result_table(election_result_table, electorate)
          # Skip empty tables so callers may safely read parsed_rows.first
          yield parsed_rows, electorate unless parsed_rows.empty?
        end
      end
    end

    # Fetches the electorate's Wikipedia page and returns the Nokogiri
    # node set of tables that hold election results.
    def self.election_result_tables_for(electorate)
      # NOTE(review): Kernel#open via open-uri (URLs come from our own
      # Wikipedia scrape); on Ruby >= 2.5 prefer URI.open -- Kernel#open
      # with a URL is removed in Ruby 3.0.
      doc = Nokogiri::HTML(open(electorate[:url])).at_css('#mw-content-text')
      # Election-result tables are the ones rendered with border="1"
      doc.css('table[border="1"]')
    end

    # Parses one results table into an array of row hashes, each with
    # :party, :candidate and :candidacy keys.
    def self.parsed_rows_from_election_result_table(election_result_table, electorate)
      # The table caption links to the election article (e.g. "New
      # Zealand general election, 2011"); the year is its last word.
      year = election_result_table.children.first.children.first.children.first.attributes['title'].value.split(' ').last

      puts "Scraping results for #{electorate[:name]}, #{year}"

      table_title = election_result_table.children.first.children.text
      election_type = table_title =~ /by\-election/ ? 'by-election' : 'general'

      # A candidacy is an instance of a candidate seeking election in an electorate
      candidacy_rows = election_result_table.css('tr.vcard')

      parsed_rows = []

      candidacy_rows.each do |candidacy_row|
        # Skip rows whose fourth child is all blank text: there is no
        # candidate name or vote information to parse.
        next if candidacy_row.children[4].children.text.empty?

        parsed_row = { party: party_for_candidacy_row(candidacy_row),
                       candidate: candidate_for_candidacy_row(candidacy_row),
                       candidacy: candidacy_for_candidacy_row(candidacy_row, year, electorate[:name], election_type)
                     }
        parsed_rows << parsed_row
        puts "...parsed #{parsed_row[:candidate][:name]} of #{parsed_row[:party][:name]}"
      end

      parsed_rows
    end

    # Extracts the party's colour (from the leading colour-swatch cell's
    # inline style) and its long/short names from a candidacy row.
    def self.party_for_candidacy_row(candidacy_row)
      { colour: candidacy_row.children.first.attributes['style'].value.split(';').first.split(' ').last,
        name: party_name_for_candidacy_row(candidacy_row),
        short_name: short_party_name_for_candidacy_row(candidacy_row) }
    end

    # Extracts the candidate's full, first and last names from the name
    # cell of a candidacy row.
    def self.candidate_for_candidacy_row(candidacy_row)
      name_cell = candidacy_row.children[4]
      name_cell.css('img').remove # remove any image tags from within the name cell
      name_cell.css('sup').remove # remove any superscript tags from within the name cell
      name = cut_bullshit_from(name_cell.children.text)
      split_name = name.split(' ')
      # Last word is the surname; everything before it is the given name
      # (empty string for single-word names).
      last_name = split_name.pop
      { name: name, last_name: last_name, first_name: split_name.join(' ') }
    end

    # Builds the candidacy hash: votes, percent, electorate name, year
    # and election type ('general' or 'by-election').
    def self.candidacy_for_candidacy_row(candidacy_row, year, electorate, election_type)
      { votes: candidacy_row.children[6].text.delete(',').to_i, # strip thousands separators
        percent: candidacy_row.children[8].text.to_f,
        electorate: electorate,
        year: year,
        election_type: election_type }
    end

    # Some party cells contain links (name in the title attribute),
    # others plain text, so fall back accordingly.
    def self.party_name_for_candidacy_row(candidacy_row)
      link = candidacy_row.children[2].children.first
      name = if link.attributes['title']
               link.attributes['title'].value
             else
               link.text
             end
      cut_bullshit_from(name)
    end

    # Likewise for the short party name: some cells carry only one name,
    # so fall back to the cell's own text when the nested text is empty.
    def self.short_party_name_for_candidacy_row(candidacy_row)
      cell = candidacy_row.children[2].children.last
      name = cell.children.text
      name = cell.text if name == ''
      cut_bullshit_from(name)
    end

    # Strips known Wikipedia artefacts from a scraped name.
    # Mutates +name+ in place and returns it.
    def self.cut_bullshit_from(name)
      NAME_ARTEFACTS.each { |artefact| name.slice!(artefact) }
      name
    end

    # A bare `private` has no effect on `def self.` methods, so the
    # original helpers were accidentally public; make them genuinely
    # internal.
    private_class_method :scrape,
                         :election_result_tables_for,
                         :parsed_rows_from_election_result_table,
                         :party_for_candidacy_row,
                         :candidate_for_candidacy_row,
                         :candidacy_for_candidacy_row,
                         :party_name_for_candidacy_row,
                         :short_party_name_for_candidacy_row,
                         :cut_bullshit_from
  end
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# encoding: utf-8

module NZPolScrapers
  # Scrapes the list of NZ electorates (name + Wikipedia page URL) from
  # the "New Zealand electorates" category index on Wikipedia.
  class ElectorateScraper
    require 'nokogiri'
    require 'open-uri'
    require 'yaml'

    # Wikipedia category index listing every NZ electorate page.
    INDEX_URL = 'http://en.wikipedia.org/wiki/Category:New_Zealand_electorates'.freeze

    # Scrapes the electorate list and writes it as YAML to
    # "<directory>/NZ Electorates.yml".
    def self.scrape_to_files(directory)
      # Block form guarantees the file handle is closed even on error.
      File.open("#{directory}/NZ Electorates.yml", "w") do |file|
        file.puts scrape_to_hash.to_yaml
      end
    end

    # Returns a hash keyed by electorate name (symbol) with
    # { name:, url: } hashes as values.
    def self.scrape_to_hash
      result = {}
      scrape do |name, url|
        result[:"#{name}"] = { name: name, url: url }
      end
      result
    end

    # Returns an array of { name:, url: } hashes, one per electorate.
    def self.scrape_to_array
      result = []
      scrape do |name, url|
        result << { name: name, url: url }
      end
      result
    end

    # Visits the category index and yields (name, url) for each
    # electorate link, skipping the category's self-referential entry.
    def self.scrape
      # NOTE(review): Kernel#open via open-uri; on Ruby >= 2.5 prefer
      # URI.open -- Kernel#open with a URL is removed in Ruby 3.0.
      doc = Nokogiri::HTML(open(INDEX_URL)).at_css('#mw-pages')

      # Each list item in the index is a link to an electorate page.
      doc.css('li a').each do |link|
        name = clean_electorate_name(link.attributes['title'].value)
        # Distinct name from the index URL above: hrefs are site-relative
        page_url = "http://en.wikipedia.org#{link.attributes['href'].value}"
        yield name, page_url unless name == 'New Zealand electorates'
      end
    end

    # Removes Wikipedia's disambiguation suffix from an electorate name.
    # Mutates +name+ in place and returns it.
    def self.clean_electorate_name(name)
      name.slice! ' (New Zealand electorate)'
      name
    end

    # A bare `private` has no effect on `def self.` methods, so the
    # original helpers were accidentally public; make them genuinely
    # internal.
    private_class_method :scrape, :clean_electorate_name
  end
end
|
metadata
ADDED
@@ -0,0 +1,60 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: nz_pol_scrapers
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Giles Thompson
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-03-18 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.6'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.6'
|
27
|
+
description: Wikipedia scrapers which parse information about NZ politics
|
28
|
+
email:
|
29
|
+
- iam@gilesthompson.co.nz
|
30
|
+
executables: []
|
31
|
+
extensions: []
|
32
|
+
extra_rdoc_files: []
|
33
|
+
files:
|
34
|
+
- lib/nz_pol_scrapers.rb
|
35
|
+
- lib/scrapers/electorate_results_scraper.rb
|
36
|
+
- lib/scrapers/electorate_scraper.rb
|
37
|
+
homepage: http://github.com/gilest/nz_pol_scrapers
|
38
|
+
licenses: []
|
39
|
+
metadata: {}
|
40
|
+
post_install_message:
|
41
|
+
rdoc_options: []
|
42
|
+
require_paths:
|
43
|
+
- lib
|
44
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: 1.9.2
|
49
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: 1.3.2
|
54
|
+
requirements: []
|
55
|
+
rubyforge_project:
|
56
|
+
rubygems_version: 2.2.2
|
57
|
+
signing_key:
|
58
|
+
specification_version: 4
|
59
|
+
summary: Wikipedia scrapers which parse information about NZ politics
|
60
|
+
test_files: []
|