RubyGems - uk_parliament - Versions diffs - 0.1.0 - Mend

uk_parliament 0.1.0

Files changed (25) hide show

checksums.yaml +7 -0
data/.gitignore +10 -0
data/.rspec +2 -0
data/.travis.yml +5 -0
data/Gemfile +4 -0
data/LICENSE +674 -0
data/README.md +88 -0
data/Rakefile +15 -0
data/bin/console +14 -0
data/bin/setup +8 -0
data/lib/uk_parliament.rb +99 -0
data/lib/uk_parliament/commons.rb +20 -0
data/lib/uk_parliament/doc_pipeline.rb +44 -0
data/lib/uk_parliament/file_house_members.rb +11 -0
data/lib/uk_parliament/house_members.rb +53 -0
data/lib/uk_parliament/house_members_manager.rb +19 -0
data/lib/uk_parliament/house_members_source_factory.rb +18 -0
data/lib/uk_parliament/http_house_members.rb +103 -0
data/lib/uk_parliament/lords.rb +20 -0
data/lib/uk_parliament/member_list_doc_pipeline.rb +103 -0
data/lib/uk_parliament/member_summary_doc_pipeline.rb +215 -0
data/lib/uk_parliament/queue_manager.rb +104 -0
data/lib/uk_parliament/version.rb +3 -0
data/uk_parliament.gemspec +29 -0
metadata +137 -0

data/lib/uk_parliament/lords.rb ADDED Viewed

@@ -0,0 +1,20 @@
module UkParliament
  # Class representing the House of Lords.
  #
  # On construction it asks HouseMembersManager for the member data of the
  # house identified by HOUSE_ID and exposes the result through #members.
  class Lords
    include UkParliament

    # Unique identifier for the House of Lords (frozen so the shared constant
    # cannot be mutated by accident).
    HOUSE_ID = 'lords'.freeze
    # URL of where to look for the list of Lords members.
    MEMBER_LIST_URL = 'http://www.parliament.uk/mps-lords-and-offices/lords/'.freeze

    # Member data returned by HouseMembersManager#members.
    attr_reader :members

    # Initialise the class, populating the Lords member data.
    #
    # load_from_file - flag handed unchanged to HouseMembersManager.new as its
    #                  second argument (default: false).
    def initialize(load_from_file = false)
      @members = HouseMembersManager.new(HOUSE_ID, load_from_file).members
    end
  end
end

data/lib/uk_parliament/member_list_doc_pipeline.rb ADDED Viewed

@@ -0,0 +1,103 @@
module UkParliament
  # Pipeline that reads a scraped member list document and appends one basic
  # record (a Hash) per member to a caller-supplied list.
  class MemberListDocPipeline < DocPipeline
    # Set up the pipeline by passing both arguments on to DocPipeline.
    def initialize(house_id, document)
      super(house_id, document)
    end

    # Remember the list that records are appended to, then run the pipeline
    # via the inherited #execute.
    def house_member_list(members)
      @members = members
      execute
    end

    private

    # Name the task performed for a House of Commons member list.
    def define_commons_tasks
      @commons_tasks = ['commons_members']
    end

    # Name the task performed for a House of Lords member list.
    def define_lords_tasks
      @lords_tasks = ['lords_members']
    end

    # Walk every table row that links to a Commons biography and append the
    # member's name, profile URL, id, party and constituency to the list.
    def commons_members
      rows = @document.xpath("//tr[descendant::a[starts-with(@href, 'http://www.parliament.uk/biographies/commons/')]]")
      rows.each do |row|
        anchor = row.at_xpath('./td[1]/a')
        cell_text_nodes = row.xpath('./td[1]//text()')
        constituency_cell = row.at_xpath('./td[2]')
        record = basic_member(anchor)
        commons_party(record, cell_text_nodes)
        commons_constituency(record, constituency_cell)
        @members << record
      end
    end

    # Walk every table row that links to a Lords biography and append the
    # member's name, profile URL, id and party/group to the list.
    def lords_members
      rows = @document.xpath("//tr[descendant::a[starts-with(@href, 'http://www.parliament.uk/biographies/lords/')]]")
      rows.each do |row|
        anchor = row.at_xpath('./td[1]/a')
        party_cell = row.at_xpath('./td[2]')
        record = basic_member(anchor)
        lords_party(record, party_cell)
        @members << record
      end
    end

    # Build a new record holding the fields both houses share (name, URL and
    # id), all taken from the member's link node.
    def basic_member(anchor)
      record = {}
      member_name(record, anchor)
      member_profile_url(record, anchor)
      member_id(record, anchor)
      record
    end

    # Store the link text as the member's alphabetical name.
    def member_name(member, node)
      link_text = node.content
      member['alphabetical_name'] = link_text
    end

    # Store the link target as the member's summary URL.
    def member_profile_url(member, node)
      target = node['href']
      member['url'] = target
    end

    # Store the integer value of the last path segment of the link target as
    # the member's id.
    def member_id(member, node)
      last_segment = node['href'].split('/').last
      member['id'] = last_segment.to_i
    end

    # Store the Commons member's party: the last text node of the cell with
    # its first and last characters removed.
    def commons_party(member, nodeset)
      label = nodeset.last.to_s.strip
      member['party'] = label[1..-2]
    end

    # Store the cell text as the Commons member's constituency.
    def commons_constituency(member, node)
      cell_text = node.content
      member['constituency'] = cell_text
    end

    # Store the cell text as the Lords member's party or group.
    def lords_party(member, node)
      cell_text = node.content
      member['party_or_group'] = cell_text
    end
  end
end

data/lib/uk_parliament/member_summary_doc_pipeline.rb ADDED Viewed

@@ -0,0 +1,215 @@
module UkParliament
  # Class defining the pipeline process of a scraped member summary document.
  #
  # Each task reads part of @document and writes what it finds into the
  # @member hash supplied to #enrich_member_data.
  class MemberSummaryDocPipeline < DocPipeline
    # Initialise the class, calling the parent class init, with provided args.
    def initialize(house_id, document)
      super
    end

    # Produce the member summary: remember the member hash to enrich, then run
    # the pipeline via the inherited #execute.
    def enrich_member_data(member)
      @member = member
      execute
    end

    private

    # Define the tasks that will be performed for the commons member summary
    # pipeline.
    def define_commons_tasks
      @commons_tasks = %w(parliamentary_details departmental_details constituency_details digital_details commons_member_name)
    end

    # Define the tasks that will be performed for the lords member summary
    # pipeline.
    def define_lords_tasks
      @lords_tasks = %w(parliamentary_details departmental_details external_office_details digital_details lords_member_name)
    end

    # Extract the parliamentary contact details for a member.
    def parliamentary_details
      contact_section('parliamentary', 'parliamentary_contact')
    end

    # Extract the constituency contact details for a member.
    def constituency_details
      contact_section('constituency', 'constituency_contact')
    end

    # Extract the external office contact details for a member.
    def external_office_details
      contact_section('externalprivate-office', 'external_contact')
    end

    # Extract the departmental office contact details for a member.
    def departmental_details
      contact_section('departmental', 'departmental_contact')
    end

    # Shared implementation of the four contact-detail tasks.
    #
    # css_class  - class name that, together with 'contact-detail', identifies
    #              the section's div.
    # section_id - key under which the section's hash is stored in @member.
    #
    # Nothing is stored when the document has no matching div.
    def contact_section(css_class, section_id)
      nodeset = @document.xpath("//div[contains(@class, 'contact-detail') and contains(@class, '#{css_class}')]")
      return if nodeset.empty?

      @member[section_id] = {}
      section_contact_details(section_id, nodeset)
    end

    # Fill a section's hash with whichever of address, telephone/fax and email
    # are present in the section's nodes.
    def section_contact_details(section_id, nodeset)
      address_node = nodeset.at_xpath(".//*[@data-generic-id = 'address']")
      phone_node = nodeset.at_xpath(".//*[@data-generic-id = 'telephone']")
      email_node = nodeset.at_xpath(".//*[@data-generic-id = 'email-address']/a/span[@class = '__cf_email__']")
      address(address_node, section_id)
      phone_fax(phone_node, section_id)
      email(email_node, section_id)
    end

    # Extract the address value from a document node (no-op for a nil node).
    def address(node, section_id)
      return if node.nil?

      @member[section_id]['address'] = node.content.strip
    end

    # Extract the phone/fax value(s) from a document node (no-op for a nil
    # node).
    #
    # Some telephone values include a 'Fax' number label/value as well as a
    # 'Tel' number label/value. The fax label is detected case-insensitively
    # so that detection agrees with the case-insensitive split below.
    def phone_fax(node, section_id)
      return if node.nil?

      text = node.content.strip.gsub(/\s+/, ' ')
      if text =~ /fax/i
        parts = text.split(/fax:*\s*/i)
        # parts is empty when the value is nothing but a fax label; to_s keeps
        # that case from raising.
        @member[section_id]['telephone'] = parts[0].to_s.gsub(/tel:*\s*/i, '').strip
        @member[section_id]['fax'] = parts[1]
      else
        @member[section_id]['telephone'] = text.sub(/tel:*\s*/i, '')
      end
    end

    # Extract email value from a document node (no-op for a nil node).
    def email(node, section_id)
      return if node.nil?

      @member[section_id]['email'] = decode_email(node['data-cfemail'])
    end

    # Decode the Cloudflare encoded email address.
    #
    # code is a string of hex digit pairs: the first pair is the key and every
    # following pair is one character XORed with that key.
    def decode_email(code)
      key = code[0..1].hex
      (2...code.size).step(2).map { |i| (code[i, 2].hex ^ key).chr }.join
    end

    # Extract the digital contact details (websites, Twitter, Facebook) for a
    # member.
    def digital_details
      nodeset = @document.xpath("//div[@id = 'web-social-media']")
      web(nodeset.xpath(".//*[@data-generic-id = 'website']/a"))
      twitter(nodeset.at_xpath(".//*[@data-generic-id = 'twitter']/a"))
      facebook(nodeset.at_xpath(".//*[@data-generic-id = 'facebook']/a"))
    end

    # Extract web address value(s) from a set of link nodes; nothing is stored
    # when there are none.
    def web(nodeset)
      return if nodeset.nil? || nodeset.empty?

      @member['web'] = nodeset.map { |node| node['href'] }
    end

    # Extract Twitter account values (profile link and link text) from a
    # document node.
    def twitter(node)
      return if node.nil?

      @member['twitter'] = {
        'profile' => node['href'],
        'username' => node.child.content
      }
    end

    # Extract Facebook link value from a document node.
    def facebook(node)
      return if node.nil?

      @member['facebook'] = node['href']
    end

    # Derive the parts of a commons member's name from the alphabetical name
    # already stored in @member.
    def commons_member_name
      section_id = 'name'
      @member[section_id] = {}
      title_list = %w(Mr Mrs Ms Dr Sir Dame Lady Lord)
      # String: "Abbott, Ms Diane"
      components = @member['alphabetical_name'].split(',')
      # Array: |Abbott| Ms Diane|
      surname = components.shift
      # Array: | Ms Diane|
      components = components.join.split(' ')
      # Array: |Ms|Diane|
      if title_list.include?(components[0])
        @member[section_id]['title'] = components.shift
      end
      # Array: |Diane|
      components << surname
      # Array: |Diane|Abbott|
      @member[section_id]['full_name'] = components.join(' ')
      @member[section_id]['given_name'] = components.shift
      @member[section_id]['surname'] = components.pop
      # Whatever remains between given name and surname is the middle names.
      unless components.empty?
        @member[section_id]['middle_names'] = components
      end
    end

    # Extract a lords member's full title and name parts from the document.
    #
    # A missing title or name div raises here rather than being skipped.
    def lords_member_name
      section_id = 'name'
      @member[section_id] = {}
      table = @document.xpath("//table[@class = 'personal-details-container']")
      # NOTE(review): the leading '//' makes both lookups below absolute XPath
      # expressions, so they are not limited to descendants of `table` --
      # confirm whether './/' was intended before narrowing them.
      full_title = table.at_xpath("//div[@id = 'lords-fulltitle']")
      @member[section_id]['full_title'] = full_title.content.strip
      name = table.at_xpath("//div[@id = 'lords-name']")
      components = name.content.strip.split(' ')
      @member[section_id]['full_name'] = components.join(' ')
      @member[section_id]['given_name'] = components.shift
      @member[section_id]['surname'] = components.pop
      # Whatever remains between given name and surname is the middle names.
      unless components.empty?
        @member[section_id]['middle_names'] = components
      end
    end
  end
end

data/lib/uk_parliament/queue_manager.rb ADDED Viewed

@@ -0,0 +1,104 @@
+require 'filequeue'
module UkParliament
  # Class to create/manage a queue for a set of items that will be scraped.
  #
  # Two FileQueue instances are kept per name: a main work queue and an error
  # queue. If the error queue holds items when the manager is created, the
  # manager treats the error queue as the active one and #enqueue refills the
  # main queue from it instead of from the supplied members.
  class QueueManager
    include UkParliament

    # Unique identifier for the main work queue.
    QUEUE_MAIN = 'main'.freeze
    # Unique identifier for the error queue.
    QUEUE_ERROR = 'error'.freeze

    # Instance data accessor(s).
    attr_reader :main_queue, :error_queue, :active_queue

    # Set up queue states.
    #
    # name - prefix of the queue file names ("<name>.queue" and
    #        "<name>.error.queue"), created under the configured
    #        :queue_file_path (default: 'commons').
    def initialize(name = 'commons')
      queue_dir = configuration[:queue_file_path]
      @main_queue = FileQueue.new(File.join(queue_dir, "#{name}.queue"))
      @error_queue = FileQueue.new(File.join(queue_dir, "#{name}.error.queue"))
      reset_main_queue
      set_active_queue
    end

    # Identify if there were errors from the last scrape, i.e. whether the
    # error queue is the active queue.
    def scrape_errors?
      @active_queue == QUEUE_ERROR
    end

    # Return the current size of the error queue (0 when its file is absent).
    #
    # This is a bit of a work around FileQueue.
    # https://github.com/pezra/filequeue/pull/4
    def error_queue_size
      # File.exist? is used because File.exists? no longer exists in Ruby 3.2+.
      File.exist?(@error_queue.file_name) ? @error_queue.length : 0
    end

    # Set up the queue, either with provided items, or from the error queue.
    #
    # members - collection of hashes; each one's 'id' value is queued. Ignored
    #           when the error queue is the active queue.
    def enqueue(members)
      if scrape_errors?
        populate_from_error_queue
      else
        populate(members, 'id')
      end
      log.info("Populated queue with #{@main_queue.length} items...")
    end

    private

    # Empty the main queue for a house, if its file is present.
    def reset_main_queue
      @main_queue.clear if File.exist?(@main_queue.file_name)
    end

    # Identify the currently active queue, main or error. The error queue is
    # active only when its file is present and it is not empty.
    def set_active_queue
      pending_errors = File.exist?(@error_queue.file_name) && !@error_queue.empty?
      @active_queue = pending_errors ? QUEUE_ERROR : QUEUE_MAIN
    end

    # Populate the main queue with the string form of each item's key value.
    def populate(items, key)
      items.each { |member| @main_queue.push(member[key].to_s) }
    end

    # Populate the main queue with items from the error queue, draining the
    # error queue in the process.
    def populate_from_error_queue
      log.info('Populating queue from error queue...')
      # Could prevent potentially a lot of disk IO by just overwriting the
      # file directly and clear() the error queue...
      @main_queue.push(@error_queue.pop) until @error_queue.empty?
    end
  end
end

data/lib/uk_parliament/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
module UkParliament
  # Version string of the uk_parliament gem (frozen so it cannot be mutated).
  VERSION = "0.1.0".freeze
end