uk_planning_scraper 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,140 @@
1
+ require 'csv'
2
+
3
module UKPlanningScraper
  # Represents a single local planning authority and is the public entry
  # point for scraping its planning applications.
  #
  # A class-level registry of every known authority is populated from
  # uk_planning_scraper/authorities.csv by Authority.load.
  class Authority
    attr_reader :name, :url

    # Registry of all known authorities. A class-level instance variable is
    # used rather than a class variable (@@) so the state is not shared
    # across any future subclasses.
    @authorities = []

    class << self
      # All known authorities, in the order they appear in the CSV.
      def all
        @authorities
      end

      # Every tag in use across all authorities, deduplicated and sorted.
      def tags
        @authorities.flat_map(&:tags).uniq.sort
      end

      # Find an authority by its exact name.
      # Raises AuthorityNotFound if no authority has that name.
      def named(name)
        authority = @authorities.find { |a| name == a.name }
        raise AuthorityNotFound if authority.nil?
        authority
      end

      # Authorities tagged with +tag+.
      def tagged(tag)
        @authorities.select { |a| a.tagged?(tag) }
      end

      # Authorities NOT tagged with +tag+.
      def not_tagged(tag)
        @authorities.reject { |a| a.tagged?(tag) }
      end

      # Authorities with no tags at all.
      def untagged
        @authorities.select { |a| a.tags.empty? }
      end

      # Populate the registry from uk_planning_scraper/authorities.csv.
      # Idempotent: does nothing if the registry is already populated.
      def load
        return unless @authorities.empty?
        csv_path = File.join(File.dirname(__dir__), 'uk_planning_scraper',
                             'authorities.csv')
        CSV.foreach(csv_path, :headers => true) do |line|
          auth = Authority.new(line['authority_name'], line['url'])
          auth.add_tags(line['tags'].split(/\s+/)) if line['tags']
          auth.add_tag(auth.system) # tag each authority with its back-end system
          @authorities << auth
        end
      end
    end

    def initialize(name, url)
      @name = name.strip
      @url = url.strip
      @tags = [] # Strings in arbitrary order
      @applications = [] # Application objects
      @scrape_params = {}
    end

    # Run a scrape using any search parameters previously set via the
    # chainable parameter methods (see authority_scrape_params.rb).
    #
    # options - Hash; :delay is the number of seconds to sleep between
    #           HTTP requests (default 10).
    #
    # Returns an Array of Hashes, one per valid application found.
    # Raises SystemNotSupported if the authority's back end is not supported.
    def scrape(options = {})
      default_options = {
        delay: 10,
      }
      # The user-supplied options override the defaults
      options = default_options.merge(options)

      # Select which scraper to use
      case system
      when 'idox'
        @applications = scrape_idox(@scrape_params, options)
      when 'northgate'
        @applications = scrape_northgate(@scrape_params, options)
      else
        raise SystemNotSupported.new(
          "Planning system not supported for #{@name} at URL: #{@url}")
      end

      # Post processing
      @applications.each { |app| app.authority_name = @name }

      # Output as an array of hashes
      # FIXME - silently ignores invalid apps. How should we handle them?
      output = @applications.select(&:valid?).map(&:to_hash)

      # Reset so that old params don't get used for new scrapes
      clear_scrape_params

      output # Single point of successful exit
    end

    # Tags sorted alphabetically.
    def tags
      @tags.sort
    end

    # Add multiple tags to existing tags
    def add_tags(tags)
      tags.each { |t| add_tag(t) }
    end

    # Add a single tag, normalised to lower case with spaces removed.
    def add_tag(tag)
      clean_tag = tag.strip.downcase.delete(' ')
      @tags << clean_tag unless tagged?(clean_tag) # prevent duplicates
    end

    def tagged?(tag)
      @tags.include?(tag)
    end

    # Detect the back-end planning system from the shape of the URL.
    def system
      if @url.match(/search\.do\?action=advanced/i)
        'idox'
      elsif @url.match(/generalsearch\.aspx/i)
        'northgate'
      elsif @url.match(/ocellaweb/i)
        'ocellaweb'
      elsif @url.match(/\/apas\//)
        'agileplanning'
      else
        'unknownsystem'
      end
    end
  end
end
139
+
140
+ UKPlanningScraper::Authority.load
@@ -0,0 +1,134 @@
1
+ require 'date'
2
+
3
module UKPlanningScraper
  class Authority
    # Parameter methods for Authority#scrape
    # Designed to be method chained, eg:
    #
    # applications = UKPlanningScraper::Authority.named("Barnet"). \
    #   development_type("Q22").keywords("illuminat"). \
    #   validated_days(30).scrape

    # Restrict the scrape to applications validated within the last n days
    # (inclusive of today).
    # Assumes that every scraper/system can do a date range search.
    # Returns self so calls can be chained.
    def validated_days(n)
      # NOTE: Integer, not Fixnum — Fixnum was removed in Ruby 3.2.
      check_class(n, Integer)

      unless n > 0
        raise ArgumentError.new("validated_days must be greater than 0")
      end

      validated_from(Date.today - (n - 1))
      validated_to(Date.today)
      self
    end

    # Restrict the scrape to applications received within the last n days
    # (inclusive of today).
    # Assumes that every scraper/system can do a date range search.
    # Returns self so calls can be chained.
    def received_days(n)
      check_class(n, Integer)

      unless n > 0
        raise ArgumentError.new("received_days must be greater than 0")
      end

      received_from(Date.today - (n - 1))
      received_to(Date.today)
      self
    end

    # Restrict the scrape to applications decided within the last n days
    # (inclusive of today).
    # Assumes that every scraper/system can do a date range search.
    # Returns self so calls can be chained.
    def decided_days(n)
      check_class(n, Integer)

      unless n > 0
        raise ArgumentError.new("decided_days must be greater than 0")
      end

      decided_from(Date.today - (n - 1))
      decided_to(Date.today)
      self
    end

    # Search by applicant name. Only implemented for Idox systems.
    # Returns self so calls can be chained.
    def applicant_name(s)
      unless system == 'idox'
        raise NoMethodError.new("applicant_name is only implemented for " \
          "Idox. This authority (#{@name}) is #{system.capitalize}.")
      end

      check_class(s, String)
      @scrape_params[:applicant_name] = s.strip
      self
    end

    # Search by application type. Only implemented for Idox systems.
    # Returns self so calls can be chained.
    def application_type(s)
      unless system == 'idox'
        raise NoMethodError.new("application_type is only implemented for " \
          "Idox. This authority (#{@name}) is #{system.capitalize}.")
      end

      check_class(s, String)
      @scrape_params[:application_type] = s.strip
      self
    end

    # Search by development type. Only implemented for Idox systems.
    # Returns self so calls can be chained.
    def development_type(s)
      unless system == 'idox'
        raise NoMethodError.new("development_type is only implemented for " \
          "Idox. This authority (#{@name}) is #{system.capitalize}.")
      end

      check_class(s, String)
      @scrape_params[:development_type] = s.strip
      self
    end

    private

    # The simple chainable params handled generically by method_missing,
    # mapped to the class each value must be.
    SIMPLE_PARAMS = {
      validated_from: Date,
      validated_to: Date,
      received_from: Date,
      received_to: Date,
      decided_from: Date,
      decided_to: Date,
      keywords: String
    }.freeze

    # Handle the simple params with this
    def method_missing(method_name, *args)
      expected_class = SIMPLE_PARAMS[method_name]
      raise NoMethodError.new(method_name.to_s) unless expected_class

      value = args[0]
      check_class(value, expected_class, method_name.to_s)
      # Strip without mutating the caller's string object.
      value = value.strip if value.is_a?(String)

      if value.is_a?(Date) && value > Date.today
        raise ArgumentError.new("#{method_name} can't be a date in the " +
          "future (#{value.to_s})")
      end

      @scrape_params[method_name] = value
      self
    end

    # Advertise the dynamically handled params to respond_to?.
    def respond_to_missing?(method_name, include_private = false)
      SIMPLE_PARAMS.key?(method_name) || super
    end

    # Forget all chained search params (called after each scrape).
    def clear_scrape_params
      @scrape_params = {}
    end

    # Raise TypeError unless param_value is an expected_class.
    # param_name defaults to the name of the calling method, see:
    # https://stackoverflow.com/questions/5100299/how-to-get-the-name-of-the-calling-method
    def check_class(
      param_value,
      expected_class,
      param_name = caller_locations(1, 1)[0].label) # name of calling method
      unless param_value.is_a?(expected_class)
        raise TypeError.new("#{param_name} must be a " \
          "#{expected_class} not a #{param_value.class.to_s}")
      end
    end
  end
end
@@ -0,0 +1,182 @@
1
+ require 'mechanize'
2
+ require 'pp'
3
+
4
module UKPlanningScraper
  class Authority
    private

    # Scrape planning applications from an Idox "Public Access" site.
    #
    # params  - Hash of search criteria assembled by the chainable param
    #           methods (:received_from/:received_to, :validated_from/
    #           :validated_to, :decided_from/:decided_to, :keywords,
    #           :applicant_name, :application_type, :development_type).
    # options - Hash; :delay is the number of seconds to sleep between
    #           HTTP requests.
    #
    # Returns an Array of Application objects (possibly empty).
    # Raises TooManySearchResults when the site refuses the search because
    # the result set is too large.
    def scrape_idox(params, options)
      puts "Using Idox scraper."
      # Scheme + host of the search URL, used to absolutise relative links.
      base_url = @url.match(/(https?:\/\/.+?)\//)[1]

      apps = []

      agent = Mechanize.new
      puts "Getting: #{@url}"
      page = agent.get(@url) # load the search form page

      # Check that the search form is actually present.
      # When Idox has an internal error it returns an error page with HTTP 200.
      unless form = page.form('searchCriteriaForm')
        puts "Error: Search form page failed to load due to Idox internal error."
        return []
      end
      # form.action = form.action + '&searchCriteria.resultsPerPage=100'

      # Fill out and submit search form

      # Add expected fields to form if they're not already present so that searches using these terms work
      %w{
        date(applicationReceivedStart)
        date(applicationReceivedEnd)
      }.each { |f| form.add_field!(f) unless form.has_field?(f) }

      date_format = "%d/%m/%Y"

      # The field names contain parentheses and dots, so they are set via
      # Mechanize's dynamic accessors through #send.
      form.send(:"date(applicationReceivedStart)", params[:received_from].strftime(date_format)) if params[:received_from]
      form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime(date_format)) if params[:received_to]

      form.send(:"date(applicationValidatedStart)", params[:validated_from].strftime(date_format)) if params[:validated_from]
      form.send(:"date(applicationValidatedEnd)", params[:validated_to].strftime(date_format)) if params[:validated_to]

      form.send(:"date(applicationDecisionStart)", params[:decided_from].strftime(date_format)) if params[:decided_from]
      form.send(:"date(applicationDecisionEnd)", params[:decided_to].strftime(date_format)) if params[:decided_to]

      form.send(:"searchCriteria\.description", params[:keywords])

      # Some councils don't have the applicant name on their form, eg Bexley
      form.send(:"searchCriteria\.applicantName", params[:applicant_name]) if form.has_field? 'searchCriteria.applicantName'

      form.send(:"searchCriteria\.caseType", params[:application_type]) if form.has_field? 'searchCriteria.caseType'

      # Only some Idox sites (eg Bolton) have a 'searchCriteria.developmentType' parameter
      form.send(:"searchCriteria\.developmentType", params[:development_type]) if form.has_field? 'searchCriteria.developmentType'

      page = form.submit

      if page.search('.errors').inner_text.match(/Too many results found/i)
        raise TooManySearchResults.new("Scrape in smaller chunks. Use shorter date ranges and/or more search parameters.")
      end

      # Walk every page of search results, collecting the basic data
      # shown on the results listing for each application.
      loop do
        # Parse search results
        items = page.search('li.searchresult')

        puts "Found #{items.size} apps on this page."

        items.each do |app|
          data = Application.new

          # Parse info line, eg
          # "Ref. No: X/123 | Received: 13 Mar 2018 | Validated: 13 Mar 2018 | Status: Pending"
          info_line = app.at("p.metaInfo").inner_text.strip
          bits = info_line.split('|').map { |e| e.strip.delete("\r\n") }

          bits.each do |bit|
            if matches = bit.match(/Ref\. No:\s+(.+)/)
              data.council_reference = matches[1]
            end

            # Dates look like "13 Mar 2018"; capture group 2 is the date
            # because group 1 is the (Received|Registered) label.
            if matches = bit.match(/(Received|Registered):\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/)
              data.date_received = Date.parse(matches[2])
            end

            if matches = bit.match(/Validated:\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/)
              data.date_validated = Date.parse(matches[1])
            end

            if matches = bit.match(/Status:\s+(.+)/)
              data.status = matches[1]
            end
          end

          data.scraped_at = Time.now
          data.info_url = base_url + app.at('a')['href']
          data.address = app.at('p.address').inner_text.strip
          data.description = app.at('a').inner_text.strip

          apps << data
        end

        # Get the Next button from the pager, if there is one
        if next_button = page.at('a.next')
          next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100'
          sleep options[:delay]
          puts "Getting: #{next_url}"
          page = agent.get(next_url)
        else
          break
        end
      end

      # Scrape the summary tab for each app to fill in the full details.
      apps.each_with_index do |app, i|
        sleep options[:delay]
        puts "#{i + 1} of #{apps.size}: #{app.info_url}"
        res = agent.get(app.info_url)

        if res.code == '200' # That's a String not an Integer, ffs
          # Parse the summary tab for this app

          app.scraped_at = Time.now

          # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
          # Bradford has #tab_documents but without the document count on it
          app.documents_count = 0

          if documents_link = res.at('.associateddocument a')
            if documents_link.inner_text.match(/\d+/)
              app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
              app.documents_url = base_url + documents_link[:href]
            end
          elsif documents_link = res.at('#tab_documents')
            if documents_link.inner_text.match(/\d+/)
              app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
              app.documents_url = base_url + documents_link[:href]
            end
          end

          # We need to find values in the table by using the th labels.
          # The row indexes/positions change from site to site (or even app to app) so we can't rely on that.

          res.search('#simpleDetailsTable tr').each do |row|
            key = row.at('th').inner_text.strip
            value = row.at('td').inner_text.strip

            case key
            when 'Reference'
              app.council_reference = value
            when 'Alternative Reference'
              app.alternative_reference = value unless value.empty?
            when 'Planning Portal Reference'
              app.alternative_reference = value unless value.empty?
            when 'Application Received'
              app.date_received = Date.parse(value) if value.match(/\d/)
            when 'Application Registered'
              app.date_received = Date.parse(value) if value.match(/\d/)
            when 'Application Validated'
              app.date_validated = Date.parse(value) if value.match(/\d/)
            when 'Address'
              app.address = value unless value.empty?
            when 'Proposal'
              app.description = value unless value.empty?
            when 'Status'
              app.status = value unless value.empty?
            when 'Decision'
              app.decision = value unless value.empty?
            when 'Decision Issued Date'
              app.date_decision = Date.parse(value) if value.match(/\d/)
            when 'Appeal Status'
              app.appeal_status = value unless value.empty?
            when 'Appeal Decision'
              app.appeal_decision = value unless value.empty?
            else
              puts "Error: key '#{key}' not found"
            end # case
          end # each row
        else
          puts "Error: HTTP #{res.code}"
        end # if
      end # scrape summary tab for apps
      apps
    end # scrape_idox
  end # class
end
@@ -0,0 +1,127 @@
1
require 'http'
require 'logger'
require 'nokogiri'
require 'uri'
4
+
5
module UKPlanningScraper
  class Authority
    private

    # Scrape planning applications from a Northgate Planning Explorer site.
    #
    # params  - Hash of search criteria assembled by the chainable param
    #           methods (:keywords plus the received/validated/decided date
    #           ranges). Northgate takes a single date filter per search,
    #           so when several ranges are present the later assignment
    #           wins (received, then validated, then decided).
    # options - Hash; accepted for interface parity with scrape_idox.
    #           No inter-request delay is used here because all results are
    #           fetched in a single request (page size set to 99999).
    #
    # Returns an Array of Application objects.
    # Raises RuntimeError when the search page can't be fetched or the
    # expected post-search redirect doesn't happen.
    def scrape_northgate(params, options)
      puts "Using Northgate scraper."
      # Scheme + host, used for the Origin header and to absolutise URLs.
      base_url = @url.match(/(https?:\/\/.+?)\//)[1]

      # Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive?
      generic_url = @url.match(/.+\//)[0] + 'Generic/'

      apps = []

      $stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
      logger = Logger.new($stdout)
      logger.level = Logger::DEBUG

      form_vars = {
        'csbtnSearch' => 'Search' # required
      }

      form_vars['txtProposal'] = params[:keywords]

      # Date received from and to
      if params[:received_from] || params[:received_to]
        form_vars['cboSelectDateValue'] = 'DATE_RECEIVED'
        form_vars['rbGroup'] = 'rbRange'
        form_vars['dateStart'] = params[:received_from].to_s if params[:received_from] # YYYY-MM-DD
        form_vars['dateEnd'] = params[:received_to].to_s if params[:received_to] # YYYY-MM-DD
      end

      # Date validated from and to
      if params[:validated_from] || params[:validated_to]
        form_vars['cboSelectDateValue'] = 'DATE_VALID'
        form_vars['rbGroup'] = 'rbRange'
        form_vars['dateStart'] = params[:validated_from].to_s if params[:validated_from] # YYYY-MM-DD
        form_vars['dateEnd'] = params[:validated_to].to_s if params[:validated_to] # YYYY-MM-DD
      end

      # Date decided from and to
      if params[:decided_from] || params[:decided_to]
        form_vars['cboSelectDateValue'] = 'DATE_DECISION'
        form_vars['rbGroup'] = 'rbRange'
        form_vars['dateStart'] = params[:decided_from].to_s if params[:decided_from] # YYYY-MM-DD
        form_vars['dateEnd'] = params[:decided_to].to_s if params[:decided_to] # YYYY-MM-DD
      end

      logger.info "Form variables: #{form_vars.to_s}"

      headers = {
        'Origin' => base_url,
        'Referer' => @url,
      }

      logger.debug "HTTP request headers:"
      logger.debug(headers.to_s)

      logger.debug "GET: " + @url
      response = HTTP.headers(headers).get(@url)
      logger.debug "Response code: HTTP " + response.code.to_s

      if response.code == 200
        doc = Nokogiri::HTML(response.to_s)
        # The ASP.NET state fields must be echoed back in the POST or the
        # server rejects the form submission.
        asp_vars = {
          '__VIEWSTATE' => doc.at('#__VIEWSTATE')['value'],
          '__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value']
        }
      else
        logger.fatal "Bad response from search page. Response code: #{response.code.to_s}."
        raise RuntimeError.new("Northgate: Bad response from search page. Response code: #{response.code.to_s}.")
      end

      cookies = {}
      response.cookies.each { |c| cookies[c.name] = c.value }

      form_vars.merge!(asp_vars)

      logger.debug "POST: " + @url
      response2 = HTTP.headers(headers).cookies(cookies).post(@url, :form => form_vars)
      logger.debug "Response code: HTTP " + response2.code.to_s

      if response2.code == 302
        # Follow the redirect manually
        # Set the page size (PS) to max so we don't have to page through search results
        logger.debug "Location: #{response2.headers['Location']}"
        # Non-destructive gsub: gsub! returns nil when 'PS=10' is absent,
        # which would crash the string concatenation.
        # URI::DEFAULT_PARSER.escape replaces URI.encode (removed in Ruby 3.0).
        results_url = URI::DEFAULT_PARSER.escape(
          base_url + response2.headers['Location'].gsub('PS=10', 'PS=99999'))
        logger.debug "GET: " + results_url
        response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
        logger.debug "Response code: HTTP " + response3.code.to_s
        doc = Nokogiri::HTML(response3.to_s)
      else
        logger.error "Didn't get redirected from search."
        raise RuntimeError.new("Northgate: didn't get redirected from search.")
      end

      rows = doc.search("table.display_table tr")
      logger.info "Found #{rows.size - 1} applications in search results." # The first row is the header row

      # Iterate over search results
      rows.each do |row|
        if row.at("td") # skip header row which only has th's
          cells = row.search("td")

          app = Application.new
          app.scraped_at = Time.now
          app.council_reference = cells[0].inner_text.strip
          app.info_url = URI::DEFAULT_PARSER.escape(generic_url + cells[0].at('a')[:href].strip)
          app.info_url.gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
          app.address = cells[1].inner_text.strip
          app.description = cells[2].inner_text.strip
          app.status = cells[3].inner_text.strip
          raw_date_received = cells[4].inner_text.strip
          app.date_received = Date.parse(raw_date_received) if raw_date_received != '--'
          app.decision = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney

          apps << app
        end
      end
      apps
    end
  end
end
@@ -0,0 +1,3 @@
1
module UKPlanningScraper
  # Gem version, following Semantic Versioning (https://semver.org).
  # Frozen so shared references can't mutate it.
  VERSION = "0.4.3".freeze
end
@@ -0,0 +1,13 @@
1
+ require "uk_planning_scraper/version"
2
+ require "uk_planning_scraper/authority"
3
+ require "uk_planning_scraper/authority_scrape_params"
4
+ require "uk_planning_scraper/application"
5
+ require 'uk_planning_scraper/idox'
6
+ require 'uk_planning_scraper/northgate'
7
+ require 'logger'
8
+
9
module UKPlanningScraper
  # Raised by Authority#scrape when the authority's URL doesn't match any
  # planning system we have a scraper for.
  class SystemNotSupported < StandardError
  end

  # Raised by Authority.named when no authority matches the given name.
  class AuthorityNotFound < StandardError
  end

  # Raised by the Idox scraper when the site reports the result set is too
  # large; scrape in smaller chunks instead.
  class TooManySearchResults < StandardError
  end
end
@@ -0,0 +1,33 @@
1
# coding: utf-8
# Gem specification for uk_planning_scraper.
# Put lib/ on the load path so the version constant can be required
# before the gem is installed.
lib = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require "uk_planning_scraper/version"

Gem::Specification.new do |spec|
  spec.name = "uk_planning_scraper"
  spec.version = UKPlanningScraper::VERSION
  spec.authors = ["Adrian Short"]
  spec.email = 'rubygems@adrianshort.org'
  spec.summary = %q{Scrape planning applications data from UK council websites.}
  # spec.description = %q{TODO: Write a longer description or delete this line.}
  spec.homepage = "https://github.com/adrianshort/uk_planning_scraper/"
  spec.licenses = ['LGPL-3.0']

  # Package every git-tracked file except tests/specs/features.
  spec.files = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(test|spec|features)/})
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Development-only dependencies.
  spec.add_development_dependency "bundler", "~> 2.0"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.8"
  spec.add_development_dependency "simplecov", "~> 0.16"
  spec.add_development_dependency "vcr", "~> 4.0"
  spec.add_development_dependency "webmock", "~> 3.5"
  spec.add_development_dependency "pry", "~> 0.11"

  # Runtime dependencies: mechanize drives the Idox scraper,
  # http drives the Northgate scraper.
  spec.add_runtime_dependency "mechanize", "~> 2.7"
  spec.add_runtime_dependency "http", "~> 3.3"
end
+ end