uk_parliament 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,20 @@
1
module UkParliament
  # Class representing the House of Lords.
  #
  # Creating an instance populates #members by delegating to
  # HouseMembersManager with this house's identifier.
  class Lords
    include UkParliament

    # Unique identifier for House of Lords. Frozen so the shared constant
    # cannot be mutated by accident.
    HOUSE_ID = 'lords'.freeze
    # URL of where to look for the list of Lords members.
    MEMBER_LIST_URL = 'http://www.parliament.uk/mps-lords-and-offices/lords/'.freeze

    # Member data as returned by HouseMembersManager#members.
    attr_reader :members

    # Initialise the class populating the Lords member data.
    #
    # @param load_from_file [Boolean] handed to HouseMembersManager unchanged;
    #   presumably selects previously saved data over a fresh scrape -
    #   confirm against HouseMembersManager.
    def initialize(load_from_file = false)
      @members = HouseMembersManager.new(HOUSE_ID, load_from_file).members
    end
  end
end
@@ -0,0 +1,103 @@
1
module UkParliament
  # Pipeline that turns a scraped member list document into member records.
  #
  # Each record is a Hash with string keys; records are appended to the list
  # handed to #house_member_list.
  class MemberListDocPipeline < DocPipeline
    # Set up the pipeline; both arguments go to DocPipeline unchanged.
    def initialize(house_id, document)
      super(house_id, document)
    end

    # Run the pipeline, appending every member found in the document to the
    # supplied list. Returns whatever DocPipeline#execute returns.
    def house_member_list(members)
      @members = members
      execute
    end

    private

    # Task names run for a House of Commons member list.
    def define_commons_tasks
      @commons_tasks = ['commons_members']
    end

    # Task names run for a House of Lords member list.
    def define_lords_tasks
      @lords_tasks = ['lords_members']
    end

    # Walk every table row that links to a Commons biography and append one
    # record per row (name, profile URL, id, party, constituency).
    def commons_members
      rows = @document.xpath("//tr[descendant::a[starts-with(@href, 'http://www.parliament.uk/biographies/commons/')]]")

      rows.each do |row|
        link = row.at_xpath('./td[1]/a')
        cell_text = row.xpath('./td[1]//text()')
        seat = row.at_xpath('./td[2]')

        record = {}
        member_name(record, link)
        member_profile_url(record, link)
        member_id(record, link)
        commons_party(record, cell_text)
        commons_constituency(record, seat)

        @members << record
      end
    end

    # Walk every table row that links to a Lords biography and append one
    # record per row (name, profile URL, id, party or group).
    def lords_members
      rows = @document.xpath("//tr[descendant::a[starts-with(@href, 'http://www.parliament.uk/biographies/lords/')]]")

      rows.each do |row|
        link = row.at_xpath('./td[1]/a')
        affiliation = row.at_xpath('./td[2]')

        record = {}
        member_name(record, link)
        member_profile_url(record, link)
        member_id(record, link)
        lords_party(record, affiliation)

        @members << record
      end
    end

    # Store the link text as the member's alphabetical name.
    def member_name(member, node)
      text = node.content
      member['alphabetical_name'] = text
    end

    # Store the link target as the member's profile URL.
    def member_profile_url(member, node)
      href = node['href']
      member['url'] = href
    end

    # Store the last path segment of the link target, as an integer, as the
    # member's ID.
    def member_id(member, node)
      last_segment = node['href'].split('/').last
      member['id'] = last_segment.to_i
    end

    # Store the Commons party: the last text node of the first cell with its
    # first and last characters dropped (presumably the surrounding
    # parentheses - confirm against the page markup).
    def commons_party(member, nodeset)
      label = nodeset.last.to_s.strip
      member['party'] = label[1..-2]
    end

    # Store the cell content as the Commons member's constituency.
    def commons_constituency(member, node)
      text = node.content
      member['constituency'] = text
    end

    # Store the cell content as the Lords member's party or group.
    def lords_party(member, node)
      text = node.content
      member['party_or_group'] = text
    end
  end
end
@@ -0,0 +1,215 @@
1
module UkParliament
  # Class defining the pipeline process of a scraped member summary document.
  #
  # Every task reads from the parsed summary page (@document) and writes the
  # values it finds into the member Hash given to #enrich_member_data. A value
  # that is absent from the page is skipped rather than raising.
  class MemberSummaryDocPipeline < DocPipeline
    # Titles recognised at the start of a Commons member's forenames.
    TITLES = %w(Mr Mrs Ms Dr Sir Dame Lady Lord).freeze

    # Initialise the class, calling the parent class init, with provided args.
    def initialize(house_id, document)
      super
    end

    # Produce the member summary.
    #
    # @param member [Hash] record to enrich in place; 'alphabetical_name' is
    #   read by the Commons name task.
    # @return whatever DocPipeline#execute returns.
    def enrich_member_data(member)
      @member = member

      execute
    end

    private

    # Define the tasks that will be performed for the commons member summary
    # pipeline.
    def define_commons_tasks
      @commons_tasks = %w(parliamentary_details departmental_details constituency_details digital_details commons_member_name)
    end

    # Define the tasks that will be performed for the lords member summary
    # pipeline.
    def define_lords_tasks
      @lords_tasks = %w(parliamentary_details departmental_details external_office_details digital_details lords_member_name)
    end

    # Extract the parliamentary contact details for a member.
    def parliamentary_details
      extract_contact_section('parliamentary', 'parliamentary_contact')
    end

    # Extract the constituency contact details for a member.
    def constituency_details
      extract_contact_section('constituency', 'constituency_contact')
    end

    # Extract the external office contact details for a member.
    def external_office_details
      extract_contact_section('externalprivate-office', 'external_contact')
    end

    # Extract the departmental office contact details for a member.
    def departmental_details
      extract_contact_section('departmental', 'departmental_contact')
    end

    # Find the 'contact-detail' div(s) that also carry css_class and, when any
    # exist, store their address/phone/email under @member[section_id].
    # Nothing is written when the page has no such section.
    def extract_contact_section(css_class, section_id)
      nodeset = @document.xpath("//div[contains(@class, 'contact-detail') and contains(@class, '#{css_class}')]")
      return if nodeset.empty?

      @member[section_id] = {}
      section_contact_details(section_id, nodeset)
    end

    # Pull the address, telephone and email nodes out of a contact section and
    # store whichever of them are present.
    def section_contact_details(section_id, nodeset)
      address_node = nodeset.at_xpath(".//*[@data-generic-id = 'address']")
      phone_node = nodeset.at_xpath(".//*[@data-generic-id = 'telephone']")
      email_node = nodeset.at_xpath(".//*[@data-generic-id = 'email-address']/a/span[@class = '__cf_email__']")

      address(address_node, section_id)
      phone_fax(phone_node, section_id)
      email(email_node, section_id)
    end

    # Extract the address value from a document node.
    def address(node, section_id)
      return if node.nil?

      @member[section_id]['address'] = node.content.strip
    end

    # Extract the phone/fax value(s) from a document node.
    #
    # Some telephone values include a 'Fax' number label/value as well as a
    # 'Tel' number label/value. The fax label is detected case-insensitively
    # so that it agrees with the case-insensitive split below.
    def phone_fax(node, section_id)
      return if node.nil?

      text = node.content.strip.gsub(/\s+/, ' ')

      if text =~ /fax/i
        parts = text.split(/fax:*\s*/i)
        # parts is empty when the text is nothing but the fax label, hence
        # the to_s before stripping the 'Tel' label.
        @member[section_id]['telephone'] = parts[0].to_s.gsub(/tel:*\s*/i, '').strip
        @member[section_id]['fax'] = parts[1] unless parts[1].nil?
      else
        @member[section_id]['telephone'] = text.sub(/tel:*\s*/i, '')
      end
    end

    # Extract email value from a document node. Skipped when the node lacks
    # the encoded 'data-cfemail' attribute.
    def email(node, section_id)
      return if node.nil?

      code = node['data-cfemail']
      return if code.nil? || code.empty?

      @member[section_id]['email'] = decode_email(code)
    end

    # Decode the Cloudflare encoded email address.
    #
    # The first two hex digits are the key; each following pair of hex digits
    # is XORed with the key to give one character.
    def decode_email(code)
      code = code.to_s
      key = code[0, 2].hex

      2.step(code.size - 1, 2).map { |i| (code[i, 2].hex ^ key).chr }.join
    end

    # Extract the digital contact details for a member.
    def digital_details
      nodeset = @document.xpath("//div[@id = 'web-social-media']")

      web_links = nodeset.xpath(".//*[@data-generic-id = 'website']/a")
      twitter_link = nodeset.at_xpath(".//*[@data-generic-id = 'twitter']/a")
      facebook_link = nodeset.at_xpath(".//*[@data-generic-id = 'facebook']/a")

      web(web_links)
      twitter(twitter_link)
      facebook(facebook_link)
    end

    # Extract web address value(s) from a document node set.
    def web(nodeset)
      return if nodeset.nil? || nodeset.empty?

      @member['web'] = nodeset.map { |node| node['href'] }
    end

    # Extract Twitter account values from a document node. The username is
    # the content of the link's first child, or nil for an empty link.
    def twitter(node)
      return if node.nil?

      first_child = node.child
      @member['twitter'] = {
        'profile' => node['href'],
        'username' => first_child.nil? ? nil : first_child.content
      }
    end

    # Extract Facebook link value from a document node.
    def facebook(node)
      return if node.nil?

      @member['facebook'] = node['href']
    end

    # Build the Commons member name from the list-style name already stored
    # in @member['alphabetical_name'].
    def commons_member_name
      section_id = 'name'
      @member[section_id] = {}

      # String: "Abbot, Ms Diane"
      components = @member['alphabetical_name'].to_s.split(',')
      # Array: |Abbot| Ms Diane|
      surname = components.shift
      # Array: | Ms Diane|
      forenames = components.join.split(' ')
      # Array: |Ms|Diane|
      if TITLES.include?(forenames.first)
        @member[section_id]['title'] = forenames.shift
      end

      # Array: |Diane| - the surname is passed separately so that a name with
      # no forenames is still recorded as a surname, not as a given name.
      store_name_parts(section_id, forenames, surname)
    end

    # Build the Lords member name from the summary page itself.
    def lords_member_name
      section_id = 'name'
      @member[section_id] = {}

      table = @document.xpath("//table[@class = 'personal-details-container']")

      # NOTE(review): these paths start with '//', so they look absolute
      # rather than scoped to the table - confirm whether './/' was intended.
      full_title = table.at_xpath("//div[@id = 'lords-fulltitle']")
      unless full_title.nil?
        @member[section_id]['full_title'] = full_title.content.strip
      end

      name = table.at_xpath("//div[@id = 'lords-name']")
      return if name.nil?

      words = name.content.strip.split(' ')
      # The last word is the surname only when there is more than one word; a
      # single word is kept as the given name.
      surname = words.size > 1 ? words.pop : nil
      store_name_parts(section_id, words, surname)
    end

    # Store full, given, sur- and middle names under @member[section_id].
    # forenames is the ordered list of given/middle names; surname may be nil.
    # 'middle_names' is only written when there are any.
    def store_name_parts(section_id, forenames, surname)
      details = @member[section_id]
      middle_names = forenames.drop(1)

      details['full_name'] = (forenames + [surname]).compact.join(' ')
      details['given_name'] = forenames.first
      details['surname'] = surname
      details['middle_names'] = middle_names unless middle_names.empty?
    end
  end
end
@@ -0,0 +1,104 @@
1
+ require 'filequeue'
2
+
3
module UkParliament
  # Class to create/manage a queue for a set of items that will be scraped.
  #
  # Two file-backed queues are kept per house: a main work queue and an error
  # queue. When the error queue is non-empty at construction time it becomes
  # the active queue and #enqueue refills the main queue from it.
  class QueueManager
    include UkParliament

    # Unique identifier for the main work queue.
    QUEUE_MAIN = 'main'.freeze
    # Unique identifier for the error queue.
    QUEUE_ERROR = 'error'.freeze

    # Instance data accessor(s).
    attr_reader :main_queue, :error_queue, :active_queue

    # Set up queue states.
    #
    # @param name [String] base name of the queue files, which are created
    #   under configuration[:queue_file_path].
    def initialize(name = 'commons')
      queue_dir = configuration[:queue_file_path]

      @main_queue = FileQueue.new(File.join(queue_dir, "#{name}.queue"))
      @error_queue = FileQueue.new(File.join(queue_dir, "#{name}.error.queue"))

      reset_main_queue
      set_active_queue
    end

    # Identify if there were errors from the last scrape.
    #
    # @return [Boolean] true when the error queue is the active queue.
    def scrape_errors?
      @active_queue == QUEUE_ERROR
    end

    # Return the current size of the error queue.
    #
    # This is a bit of a work around FileQueue.
    # https://github.com/pezra/filequeue/pull/4
    # The queue is only asked for its length once its backing file exists.
    def error_queue_size
      queue_file?(@error_queue) ? @error_queue.length : 0
    end

    # Set up the queue, either with provided items, or from the error queue.
    #
    # @param members [Array<Hash>] records whose 'id' values are queued; not
    #   used when the error queue is the active queue.
    def enqueue(members)
      if scrape_errors?
        populate_from_error_queue
      else
        populate(members, 'id')
      end

      log.info("Populated queue with #{@main_queue.length} items...")
    end

    private

    # Whether the file backing the given queue is present on disk.
    # File.exist? is used because File.exists? was removed in Ruby 3.2.
    def queue_file?(queue)
      File.exist?(queue.file_name)
    end

    # Empty the main queue for a house.
    def reset_main_queue
      @main_queue.clear if queue_file?(@main_queue)
    end

    # Identify the currently active queue, main or error.
    def set_active_queue
      has_errors = queue_file?(@error_queue) && !@error_queue.empty?
      @active_queue = has_errors ? QUEUE_ERROR : QUEUE_MAIN
    end

    # Populate the main queue with the string form of each item's key value.
    def populate(items, key)
      items.each { |member| @main_queue.push(member[key].to_s) }
    end

    # Populate the main queue with items from the error queue.
    def populate_from_error_queue
      log.info('Populating queue from error queue...')

      # Could prevent potentially a lot of disk IO by just overwriting the
      # file directly and clear() the error queue...
      @main_queue.push(@error_queue.pop) until @error_queue.empty?
    end
  end
end
@@ -0,0 +1,3 @@
1
module UkParliament
  # Release version of the uk_parliament gem. Frozen so the shared constant
  # cannot be mutated in place.
  VERSION = '0.1.0'.freeze
end