nz_pol_scrapers 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 1c59b700dd438b7052ef8e78e9a9258aa6dc1b8d
4
+ data.tar.gz: 6943e11f8dbb7c1d1399687122aeb4b13ba13a00
5
+ SHA512:
6
+ metadata.gz: 9fefa50f0b553f33ef41cd2e458c9448847b2fbe7c2d887fe461d54f16947a6f832e263ca0e9d9dda74910f4f381326db1dd8bbe72b25cdcf76c336629530798
7
+ data.tar.gz: 25e6ee879973348baa784dfaf1ac99a76e5acea9979eb9a540a6c9869c8c1d9bccd3d5898fef47e7781283e66b6f8807243424255be0891098d5a15181fb36d1
@@ -0,0 +1,2 @@
# Require every Ruby file in the scrapers directory that sits next to this file.
#
# The pattern is expanded to an absolute path so that loading works even when
# __FILE__ is a relative path (a relative path handed to `require` is looked up
# on $LOAD_PATH rather than relative to the working directory). Only *.rb
# files are matched, and the list is sorted so the load order is deterministic.
Dir.glob(File.expand_path('../scrapers/*.rb', __FILE__)).sort.each { |f| require f }
@@ -0,0 +1,142 @@
# encoding: utf-8

module NZPolScrapers
  # Scrapes election result tables from the Wikipedia page of every electorate
  # returned by NZPolScrapers::ElectorateScraper.scrape_to_array.
  #
  # Every parsed table row becomes a hash of the form
  #   { party: { colour:, name:, short_name: },
  #     candidate: { name:, last_name:, first_name: },
  #     candidacy: { votes:, percent:, electorate:, year:, election_type: } }
  class ElectorateResultsScraper
    require 'nokogiri'
    require 'open-uri'
    require 'yaml'

    # Fragments stripped from party and candidate names. Each fragment is
    # removed at most once (its first occurrence), in the order listed.
    # NOTE(review): 'Y ' and 'N ' are removed wherever they first occur in the
    # string, not only at the start - confirm that is intended.
    NAME_NOISE = [' (New Zealand)',
                  ' (New Zealand political party)',
                  ' (politician)',
                  ' (page does not exist)',
                  ' (political party)',
                  'Y ',
                  'N ',
                  '[note 1]'].freeze

    # Writes one YAML file per scraped result table into +directory+, named
    # "<electorate name> <year>.yml".
    #
    # @param directory [String] existing directory to write into
    # @return [nil]
    def self.scrape_to_files(directory)
      scrape do |parsed_rows, electorate|
        year = parsed_rows.first[:candidacy][:year]
        path = File.join(directory, "#{electorate[:name]} #{year}.yml")
        # Block form closes the file even if serialisation raises.
        File.open(path, 'w') { |file| file.puts parsed_rows.to_yaml }
      end
      nil
    end

    # Collects every scraped result table into a nested hash:
    #   { :"<electorate name>" => { :"<year>" => [parsed rows] } }
    #
    # An electorate page may contain several result tables, so the per-year
    # entries are merged into the electorate's hash instead of replacing it.
    #
    # @return [Hash]
    def self.scrape_to_hash
      result = {}
      scrape do |parsed_rows, electorate|
        year = parsed_rows.first[:candidacy][:year]
        results_by_year = (result[:"#{electorate[:name]}"] ||= {})
        results_by_year[:"#{year}"] = parsed_rows
      end
      result
    end

    # Yields [parsed_rows, electorate] for each result table found on each
    # electorate page. The electorates themselves are loaded from Wikipedia
    # too; see electorate_scraper.rb.
    #
    # Tables that produce no parsed rows are skipped, because callers read the
    # year from the first row.
    def self.scrape
      NZPolScrapers::ElectorateScraper.scrape_to_array.each do |electorate|
        election_result_tables_for(electorate).each do |election_result_table|
          parsed_rows = parsed_rows_from_election_result_table(election_result_table, electorate)
          yield parsed_rows, electorate unless parsed_rows.empty?
        end
      end
    end

    # Fetches the electorate's page and returns the tables that represent
    # election results (tables with border="1" inside #mw-content-text).
    #
    # @param electorate [Hash] must contain the page address under :url
    # @return [Enumerable] the matching table nodes; empty when the page has
    #   no #mw-content-text element
    def self.election_result_tables_for(electorate)
      # URI#open (provided by open-uri) is used instead of Kernel#open, which
      # no longer accepts URLs on Ruby 3+. The block form closes the handle.
      page = URI.parse(electorate[:url]).open { |io| Nokogiri::HTML(io) }
      content = page.at_css('#mw-content-text')
      return [] if content.nil?

      content.css('table[border="1"]')
    end

    # Parses every candidacy row (tr.vcard) of one result table.
    # Prints progress to standard output while parsing.
    #
    # @param election_result_table [Nokogiri::XML::Node] a result table
    # @param electorate [Hash] must contain the electorate name under :name
    # @return [Array<Hash>] one hash per row that has a non-blank name cell
    def self.parsed_rows_from_election_result_table(election_result_table, electorate)
      title_node = election_result_table.children.first
      # The year is the last word of the title attribute found on the first
      # node nested two levels below the table's first child.
      year = title_node.children.first.children.first.attributes['title'].value.split(' ').last

      puts "Scraping results for #{electorate[:name]}, #{year}"

      election_type = title_node.children.text =~ /by\-election/ ? 'by-election' : 'general'

      # A candidacy is an instance of a candidate seeking election in an electorate
      election_result_table.css('tr.vcard').each_with_object([]) do |candidacy_row, parsed_rows|
        # A row whose name cell is missing or blank carries no candidate name
        # or vote information, so there is nothing to parse.
        name_cell = candidacy_row.children[4]
        next if name_cell.nil? || name_cell.children.text.empty?

        parsed_row = {
          party: party_for_candidacy_row(candidacy_row),
          candidate: candidate_for_candidacy_row(candidacy_row),
          candidacy: candidacy_for_candidacy_row(candidacy_row, year, electorate[:name], election_type)
        }
        parsed_rows << parsed_row
        puts "...parsed #{parsed_row[:candidate][:name]} of #{parsed_row[:party][:name]}"
      end
    end

    # Builds the party hash for a row.
    #
    # @return [Hash] with :colour, :name and :short_name
    def self.party_for_candidacy_row(candidacy_row)
      # The colour is the last word of the first declaration in the style
      # attribute of the row's first child.
      colour = candidacy_row.children.first.attributes['style'].value.split(';').first.split(' ').last
      { colour: colour,
        name: party_name_for_candidacy_row(candidacy_row),
        short_name: short_party_name_for_candidacy_row(candidacy_row) }
    end

    # Builds the candidate hash for a row. The last word of the cleaned name
    # is taken as the last name; everything before it is the first name.
    #
    # Side effect: removes img and sup elements from the row's name cell.
    #
    # @return [Hash] with :name, :last_name and :first_name
    def self.candidate_for_candidacy_row(candidacy_row)
      name_cell = candidacy_row.children[4]
      name_cell.css('img').remove # remove any image tags from within the name cell
      name_cell.css('sup').remove # remove any superscript tags from within the name cell

      name = cut_bullshit_from(name_cell.children.text)
      name_parts = name.split(' ')
      last_name = name_parts.pop
      { name: name, last_name: last_name, first_name: name_parts.join(' ') }
    end

    # Builds the candidacy hash for a row. Votes are read from child 6 (with
    # thousands separators removed) and the percentage from child 8.
    #
    # @return [Hash] with :votes, :percent, :electorate, :year, :election_type
    def self.candidacy_for_candidacy_row(candidacy_row, year, electorate, election_type)
      { votes: candidacy_row.children[6].text.delete(',').to_i,
        percent: candidacy_row.children[8].text.to_f,
        electorate: electorate,
        year: year,
        election_type: election_type }
    end

    # Returns the cleaned party name from the first node of the party cell.
    # Some cells contain links, others don't: the title attribute is used
    # when present, otherwise the node's text.
    def self.party_name_for_candidacy_row(candidacy_row)
      party_node = candidacy_row.children[2].children.first
      title = party_node.attributes['title']
      cut_bullshit_from(title.nil? ? party_node.text : title.value)
    end

    # Returns the cleaned short party name from the last node of the party
    # cell. The text of that node's children is used when it is not empty;
    # otherwise the node's own text is the fallback.
    def self.short_party_name_for_candidacy_row(candidacy_row)
      party_node = candidacy_row.children[2].children.last
      nested_text = party_node.children.text
      cut_bullshit_from(nested_text == '' ? party_node.text : nested_text)
    end

    # Returns a copy of +name+ with the first occurrence of each NAME_NOISE
    # fragment removed. The argument itself is left untouched.
    #
    # @param name [String]
    # @return [String]
    def self.cut_bullshit_from(name)
      NAME_NOISE.inject(name) { |cleaned, noise| cleaned.sub(noise, '') }
    end

    # A bare `private` does not apply to methods defined with `def self.`,
    # so the internal helpers are hidden explicitly.
    private_class_method :scrape,
                         :election_result_tables_for,
                         :parsed_rows_from_election_result_table,
                         :party_for_candidacy_row,
                         :candidate_for_candidacy_row,
                         :candidacy_for_candidacy_row,
                         :party_name_for_candidacy_row,
                         :short_party_name_for_candidacy_row,
                         :cut_bullshit_from
  end
end
@@ -0,0 +1,57 @@
# encoding: utf-8

module NZPolScrapers
  # Scrapes the names and page addresses of New Zealand electorates from the
  # Wikipedia category index.
  class ElectorateScraper
    require 'nokogiri'
    require 'open-uri'
    require 'yaml'

    # The NZ electorates index on Wikipedia.
    INDEX_URL = 'http://en.wikipedia.org/wiki/Category:New_Zealand_electorates'.freeze
    # Prefix prepended to each link's href to form the electorate page address.
    WIKIPEDIA_HOST = 'http://en.wikipedia.org'.freeze

    # Writes the result of scrape_to_hash as YAML to
    # "<directory>/NZ Electorates.yml".
    #
    # @param directory [String] existing directory to write into
    # @return [nil]
    def self.scrape_to_files(directory)
      # Scrape before opening the file so a failed scrape leaves no empty file.
      result = scrape_to_hash
      path = File.join(directory, 'NZ Electorates.yml')
      # Block form closes the file even if writing raises.
      File.open(path, 'w') { |file| file.puts result.to_yaml }
    end

    # @return [Hash] { :"<name>" => { name: <name>, url: <url> } }
    def self.scrape_to_hash
      result = {}
      scrape { |name, url| result[:"#{name}"] = { name: name, url: url } }
      result
    end

    # @return [Array<Hash>] one { name: <name>, url: <url> } hash per electorate
    def self.scrape_to_array
      result = []
      scrape { |name, url| result << { name: name, url: url } }
      result
    end

    # Visits the electorates index and yields [name, url] for every link
    # inside list items of the #mw-pages element, except the entry named
    # 'New Zealand electorates'.
    #
    # Links without a title or href attribute are skipped, and nothing is
    # yielded when the page has no #mw-pages element.
    def self.scrape
      # URI#open (provided by open-uri) is used instead of Kernel#open, which
      # no longer accepts URLs on Ruby 3+. The block form closes the handle.
      page = URI.parse(INDEX_URL).open { |io| Nokogiri::HTML(io) }
      index = page.at_css('#mw-pages')
      return if index.nil?

      index.css('li a').each do |link|
        title = link['title']
        href = link['href']
        next if title.nil? || href.nil?

        name = clean_electorate_name(title)
        next if name == 'New Zealand electorates'

        yield name, "#{WIKIPEDIA_HOST}#{href}"
      end
    end

    # Returns a copy of +name+ without the first ' (New Zealand electorate)'
    # fragment. The argument itself is left untouched.
    #
    # @param name [String]
    # @return [String]
    def self.clean_electorate_name(name)
      name.sub(' (New Zealand electorate)', '')
    end

    # A bare `private` does not apply to methods defined with `def self.`,
    # so the internal helpers are hidden explicitly.
    private_class_method :scrape, :clean_electorate_name
  end
end
metadata ADDED
@@ -0,0 +1,60 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: nz_pol_scrapers
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Giles Thompson
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-03-18 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: nokogiri
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.6'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: '1.6'
27
+ description: Wikipedia scrapers which parse information about NZ politics
28
+ email:
29
+ - iam@gilesthompson.co.nz
30
+ executables: []
31
+ extensions: []
32
+ extra_rdoc_files: []
33
+ files:
34
+ - lib/nz_pol_scrapers.rb
35
+ - lib/scrapers/electorate_results_scraper.rb
36
+ - lib/scrapers/electorate_scraper.rb
37
+ homepage: http://github.com/gilest/nz_pol_scrapers
38
+ licenses: []
39
+ metadata: {}
40
+ post_install_message:
41
+ rdoc_options: []
42
+ require_paths:
43
+ - lib
44
+ required_ruby_version: !ruby/object:Gem::Requirement
45
+ requirements:
46
+ - - ">="
47
+ - !ruby/object:Gem::Version
48
+ version: 1.9.2
49
+ required_rubygems_version: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: 1.3.2
54
+ requirements: []
55
+ rubyforge_project:
56
+ rubygems_version: 2.2.2
57
+ signing_key:
58
+ specification_version: 4
59
+ summary: Wikipedia scrapers which parse information about NZ politics
60
+ test_files: []