uk_planning_scraper 0.4.3

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
lib/uk_planning_scraper/authority.rb
@@ -0,0 +1,140 @@
+ require 'csv'
+
+ module UKPlanningScraper
+   class Authority
+     attr_reader :name, :url
+
+     @@authorities = []
+
+     def initialize(name, url)
+       @name = name.strip
+       @url = url.strip
+       @tags = [] # Strings in arbitrary order
+       @applications = [] # Application objects
+       @scrape_params = {}
+     end
+
+     def scrape(options = {})
+       default_options = {
+         delay: 10,
+       }
+       # The user-supplied options override the defaults
+       options = default_options.merge(options)
+
+       # Select which scraper to use
+       case system
+       when 'idox'
+         @applications = scrape_idox(@scrape_params, options)
+       when 'northgate'
+         @applications = scrape_northgate(@scrape_params, options)
+       else
+         raise SystemNotSupported.new("Planning system not supported for \
+           #{@name} at URL: #{@url}")
+       end
+
+       # Post processing
+       @applications.each do |app|
+         app.authority_name = @name
+       end
+
+       # Output as an array of hashes
+       output = []
+       # FIXME - silently ignores invalid apps. How should we handle them?
+       @applications.each { |app| output << app.to_hash if app.valid? }
+
+       # Reset so that old params don't get used for new scrapes
+       clear_scrape_params
+
+       output # Single point of successful exit
+     end
+
+     def tags
+       @tags.sort
+     end
+
+     # Add multiple tags to existing tags
+     def add_tags(tags)
+       tags.each { |t| add_tag(t) }
+     end
+
+     # Add a single tag to existing tags
+     def add_tag(tag)
+       clean_tag = tag.strip.downcase.gsub(' ', '')
+       @tags << clean_tag unless tagged?(clean_tag) # prevent duplicates
+     end
+
+     def tagged?(tag)
+       @tags.include?(tag)
+     end
+
+     def system
+       if @url.match(/search\.do\?action=advanced/i)
+         'idox'
+       elsif @url.match(/generalsearch\.aspx/i)
+         'northgate'
+       elsif @url.match(/ocellaweb/i)
+         'ocellaweb'
+       elsif @url.match(/\/apas\//)
+         'agileplanning'
+       else
+         'unknownsystem'
+       end
+     end
+
+     def self.all
+       @@authorities
+     end
+
+     # List all the tags in use
+     def self.tags
+       tags = []
+       @@authorities.each { |a| tags << a.tags }
+       tags.flatten.uniq.sort
+     end
+
+     def self.named(name)
+       authority = @@authorities.find { |a| name == a.name }
+       raise AuthorityNotFound if authority.nil?
+       authority
+     end
+
+     # Tagged x
+     def self.tagged(tag)
+       found = []
+       @@authorities.each { |a| found << a if a.tagged?(tag) }
+       found
+     end
+
+     # Not tagged x
+     def self.not_tagged(tag)
+       found = []
+       @@authorities.each { |a| found << a unless a.tagged?(tag) }
+       found
+     end
+
+     # Authorities with no tags
+     def self.untagged
+       found = []
+       @@authorities.each { |a| found << a if a.tags.empty? }
+       found
+     end
+
+     def self.load
+       # Don't run this method more than once
+       return unless @@authorities.empty?
+       CSV.foreach(File.join(File.dirname(__dir__), 'uk_planning_scraper', \
+         'authorities.csv'), :headers => true) do |line|
+         auth = Authority.new(line['authority_name'], line['url'])
+
+         if line['tags']
+           auth.add_tags(line['tags'].split(/\s+/))
+         end
+
+         auth.add_tag(auth.system)
+         @@authorities << auth
+       end
+     end
+   end
+ end
+
+ UKPlanningScraper::Authority.load
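The class methods above form a small registry API: `Authority.load` runs once at require time (the final line of this file) and reads the bundled authorities.csv. A minimal lookup sketch — "Westminster" is an illustrative name; any name from the CSV works:

    require 'uk_planning_scraper'

    # Look up one authority by its exact name (raises AuthorityNotFound otherwise)
    authority = UKPlanningScraper::Authority.named("Westminster")

    # Authority.load tags every authority with its backend system name,
    # so the system name doubles as a tag filter
    idox_authorities = UKPlanningScraper::Authority.tagged('idox')

    puts UKPlanningScraper::Authority.all.size  # how many authorities were loaded
    puts UKPlanningScraper::Authority.tags      # every tag in use, sorted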
lib/uk_planning_scraper/authority_scrape_params.rb
@@ -0,0 +1,134 @@
+ require 'date'
+
+ module UKPlanningScraper
+   class Authority
+     # Parameter methods for Authority#scrape
+     # Designed to be method chained, eg:
+     #
+     # applications = UKPlanningScraper::Authority.named("Barnet"). \
+     #   development_type("Q22").keywords("illuminat"). \
+     #   validated_days(30).scrape
+
+     def validated_days(n)
+       # Validated within the last n days
+       # Assumes that every scraper/system can do a date range search
+       check_class(n, Integer)
+
+       unless n > 0
+         raise ArgumentError.new("validated_days must be greater than 0")
+       end
+
+       validated_from(Date.today - (n - 1))
+       validated_to(Date.today)
+       self
+     end
+
+     def received_days(n)
+       # Received within the last n days
+       # Assumes that every scraper/system can do a date range search
+       check_class(n, Integer)
+
+       unless n > 0
+         raise ArgumentError.new("received_days must be greater than 0")
+       end
+
+       received_from(Date.today - (n - 1))
+       received_to(Date.today)
+       self
+     end
+
+     def decided_days(n)
+       # Decided within the last n days
+       # Assumes that every scraper/system can do a date range search
+       check_class(n, Integer)
+
+       unless n > 0
+         raise ArgumentError.new("decided_days must be greater than 0")
+       end
+
+       decided_from(Date.today - (n - 1))
+       decided_to(Date.today)
+       self
+     end
+
+     def applicant_name(s)
+       unless system == 'idox'
+         raise NoMethodError.new("applicant_name is only implemented for Idox. \
+           This authority (#{@name}) is #{system.capitalize}.")
+       end
+
+       check_class(s, String)
+       @scrape_params[:applicant_name] = s.strip
+       self
+     end
+
+     def application_type(s)
+       unless system == 'idox'
+         raise NoMethodError.new("application_type is only implemented for \
+           Idox. This authority (#{@name}) is #{system.capitalize}.")
+       end
+
+       check_class(s, String)
+       @scrape_params[:application_type] = s.strip
+       self
+     end
+
+     def development_type(s)
+       unless system == 'idox'
+         raise NoMethodError.new("development_type is only implemented for \
+           Idox. This authority (#{@name}) is #{system.capitalize}.")
+       end
+
+       check_class(s, String)
+       @scrape_params[:development_type] = s.strip
+       self
+     end
+
+     private
+
+     # Handle the simple params with this
+     def method_missing(method_name, *args)
+       sc_params = {
+         validated_from: Date,
+         validated_to: Date,
+         received_from: Date,
+         received_to: Date,
+         decided_from: Date,
+         decided_to: Date,
+         keywords: String
+       }
+
+       value = args[0]
+
+       if sc_params[method_name]
+         check_class(value, sc_params[method_name], method_name.to_s)
+         value.strip! if value.class == String
+
+         if value.class == Date && value > Date.today
+           raise ArgumentError.new("#{method_name} can't be a date in the " + \
+             "future (#{value.to_s})")
+         end
+
+         @scrape_params[method_name] = value
+         self
+       else
+         raise NoMethodError.new(method_name.to_s)
+       end
+     end
+
+     def clear_scrape_params
+       @scrape_params = {}
+     end
+
+     # https://stackoverflow.com/questions/5100299/how-to-get-the-name-of-the-calling-method
+     def check_class(
+       param_value,
+       expected_class,
+       param_name = caller_locations(1, 1)[0].label) # name of calling method
+       unless param_value.class == expected_class
+         raise TypeError.new("#{param_name} must be a " \
+           "#{expected_class} not a #{param_value.class.to_s}")
+       end
+     end
+   end
+ end
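The comment at the top of this file shows the intended chaining style. A short sketch along the same lines — the authority name and search terms are illustrative:

    require 'uk_planning_scraper'

    # Each parameter method returns self, so calls chain until #scrape runs the search
    apps = UKPlanningScraper::Authority.named("Barnet")
             .keywords("illuminat")
             .validated_days(30)  # must be a positive Integer
             .scrape

    # The simple params handled by method_missing take Date objects, not Strings
    apps = UKPlanningScraper::Authority.named("Barnet")
             .received_from(Date.today - 7)
             .received_to(Date.today)
             .scrape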
lib/uk_planning_scraper/idox.rb
@@ -0,0 +1,182 @@
+ require 'mechanize'
+ require 'pp'
+
+ module UKPlanningScraper
+   class Authority
+     private
+     def scrape_idox(params, options)
+       puts "Using Idox scraper."
+       base_url = @url.match(/(https?:\/\/.+?)\//)[1]
+
+       apps = []
+
+       agent = Mechanize.new
+       puts "Getting: #{@url}"
+       page = agent.get(@url) # load the search form page
+
+       # Check that the search form is actually present.
+       # When Idox has an internal error it returns an error page with HTTP 200.
+       unless form = page.form('searchCriteriaForm')
+         puts "Error: Search form page failed to load due to Idox internal error."
+         return []
+       end
+       # form.action = form.action + '&searchCriteria.resultsPerPage=100'
+
+       # Fill out and submit search form
+
+       # Add expected fields to form if they're not already present so that searches using these terms work
+       %w{
+         date(applicationReceivedStart)
+         date(applicationReceivedEnd)
+       }.each { |f| form.add_field!(f) unless form.has_field?(f) }
+
+       date_format = "%d/%m/%Y"
+
+       form.send(:"date(applicationReceivedStart)", params[:received_from].strftime(date_format)) if params[:received_from]
+       form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime(date_format)) if params[:received_to]
+
+       form.send(:"date(applicationValidatedStart)", params[:validated_from].strftime(date_format)) if params[:validated_from]
+       form.send(:"date(applicationValidatedEnd)", params[:validated_to].strftime(date_format)) if params[:validated_to]
+
+       form.send(:"date(applicationDecisionStart)", params[:decided_from].strftime(date_format)) if params[:decided_from]
+       form.send(:"date(applicationDecisionEnd)", params[:decided_to].strftime(date_format)) if params[:decided_to]
+
+       form.send(:"searchCriteria\.description", params[:keywords])
+
+       # Some councils don't have the applicant name on their form, eg Bexley
+       form.send(:"searchCriteria\.applicantName", params[:applicant_name]) if form.has_field? 'searchCriteria.applicantName'
+
+       form.send(:"searchCriteria\.caseType", params[:application_type]) if form.has_field? 'searchCriteria.caseType'
+
+       # Only some Idox sites (eg Bolton) have a 'searchCriteria.developmentType' parameter
+       form.send(:"searchCriteria\.developmentType", params[:development_type]) if form.has_field? 'searchCriteria.developmentType'
+
+       page = form.submit
+
+       if page.search('.errors').inner_text.match(/Too many results found/i)
+         raise TooManySearchResults.new("Scrape in smaller chunks. Use shorter date ranges and/or more search parameters.")
+       end
+
+       loop do
+         # Parse search results
+         items = page.search('li.searchresult')
+
+         puts "Found #{items.size} apps on this page."
+
+         items.each do |app|
+           data = Application.new
+
+           # Parse info line
+           info_line = app.at("p.metaInfo").inner_text.strip
+           bits = info_line.split('|').map { |e| e.strip.delete("\r\n") }
+
+           bits.each do |bit|
+             if matches = bit.match(/Ref\. No:\s+(.+)/)
+               data.council_reference = matches[1]
+             end
+
+             if matches = bit.match(/(Received|Registered):\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/)
+               data.date_received = Date.parse(matches[2])
+             end
+
+             if matches = bit.match(/Validated:\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/)
+               data.date_validated = Date.parse(matches[1])
+             end
+
+             if matches = bit.match(/Status:\s+(.+)/)
+               data.status = matches[1]
+             end
+           end
+
+           data.scraped_at = Time.now
+           data.info_url = base_url + app.at('a')['href']
+           data.address = app.at('p.address').inner_text.strip
+           data.description = app.at('a').inner_text.strip
+
+           apps << data
+         end
+
+         # Get the Next button from the pager, if there is one
+         if next_button = page.at('a.next')
+           next_url = base_url + next_button[:href] # + '&searchCriteria.resultsPerPage=100'
+           sleep options[:delay]
+           puts "Getting: #{next_url}"
+           page = agent.get(next_url)
+         else
+           break
+         end
+       end
+
+       # Scrape the summary tab for each app
+       apps.each_with_index do |app, i|
+         sleep options[:delay]
+         puts "#{i + 1} of #{apps.size}: #{app.info_url}"
+         res = agent.get(app.info_url)
+
+         if res.code == '200' # That's a String not an Integer, ffs
+           # Parse the summary tab for this app
+
+           app.scraped_at = Time.now
+
+           # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
+           # Bradford has #tab_documents but without the document count on it
+           app.documents_count = 0
+
+           if documents_link = res.at('.associateddocument a')
+             if documents_link.inner_text.match(/\d+/)
+               app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
+               app.documents_url = base_url + documents_link[:href]
+             end
+           elsif documents_link = res.at('#tab_documents')
+             if documents_link.inner_text.match(/\d+/)
+               app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
+               app.documents_url = base_url + documents_link[:href]
+             end
+           end
+
+           # We need to find values in the table by using the th labels.
+           # The row indexes/positions change from site to site (or even app to app) so we can't rely on that.
+
+           res.search('#simpleDetailsTable tr').each do |row|
+             key = row.at('th').inner_text.strip
+             value = row.at('td').inner_text.strip
+
+             case key
+             when 'Reference'
+               app.council_reference = value
+             when 'Alternative Reference'
+               app.alternative_reference = value unless value.empty?
+             when 'Planning Portal Reference'
+               app.alternative_reference = value unless value.empty?
+             when 'Application Received'
+               app.date_received = Date.parse(value) if value.match(/\d/)
+             when 'Application Registered'
+               app.date_received = Date.parse(value) if value.match(/\d/)
+             when 'Application Validated'
+               app.date_validated = Date.parse(value) if value.match(/\d/)
+             when 'Address'
+               app.address = value unless value.empty?
+             when 'Proposal'
+               app.description = value unless value.empty?
+             when 'Status'
+               app.status = value unless value.empty?
+             when 'Decision'
+               app.decision = value unless value.empty?
+             when 'Decision Issued Date'
+               app.date_decision = Date.parse(value) if value.match(/\d/)
+             when 'Appeal Status'
+               app.appeal_status = value unless value.empty?
+             when 'Appeal Decision'
+               app.appeal_decision = value unless value.empty?
+             else
+               puts "Error: unhandled key '#{key}'"
+             end # case
+           end # each row
+         else
+           puts "Error: HTTP #{res.code}"
+         end # if
+       end # scrape summary tab for apps
+       apps
+     end # scrape_idox
+   end # class
+ end
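Because the search raises TooManySearchResults rather than paging past the council's cap, callers may want to split long date ranges into chunks, as the exception message suggests. A hedged sketch — the authority name and chunk size are illustrative:

    require 'uk_planning_scraper'

    apps = []
    # Cover the last 90 days in 10-day chunks to stay under the results cap
    9.times do |i|
      to   = Date.today - (i * 10)
      from = to - 9
      begin
        apps += UKPlanningScraper::Authority.named("Barnet")
                  .validated_from(from)
                  .validated_to(to)
                  .scrape(delay: 5)
      rescue UKPlanningScraper::TooManySearchResults
        warn "Too many results for #{from}..#{to}; try a smaller chunk"
      end
    end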
lib/uk_planning_scraper/northgate.rb
@@ -0,0 +1,127 @@
+ require 'http'
+ require 'nokogiri'
+ require 'logger'
+
+ module UKPlanningScraper
+   class Authority
+     private
+     def scrape_northgate(params, options)
+       puts "Using Northgate scraper."
+       base_url = @url.match(/(https?:\/\/.+?)\//)[1]
+
+       # Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive?
+       generic_url = @url.match(/.+\//)[0] + 'Generic/'
+
+       apps = []
+
+       $stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
+       logger = Logger.new($stdout)
+       logger.level = Logger::DEBUG
+
+       date_regex = /\d{2}-\d{2}-\d{4}/
+
+       form_vars = {
+         'csbtnSearch' => 'Search' # required
+       }
+
+       form_vars['txtProposal'] = params[:keywords]
+
+       # Date received from and to
+       if params[:received_from] || params[:received_to]
+         form_vars['cboSelectDateValue'] = 'DATE_RECEIVED'
+         form_vars['rbGroup'] = 'rbRange'
+         form_vars['dateStart'] = params[:received_from].to_s if params[:received_from] # YYYY-MM-DD
+         form_vars['dateEnd'] = params[:received_to].to_s if params[:received_to] # YYYY-MM-DD
+       end
+
+       # Date validated from and to
+       if params[:validated_from] || params[:validated_to]
+         form_vars['cboSelectDateValue'] = 'DATE_VALID'
+         form_vars['rbGroup'] = 'rbRange'
+         form_vars['dateStart'] = params[:validated_from].to_s if params[:validated_from] # YYYY-MM-DD
+         form_vars['dateEnd'] = params[:validated_to].to_s if params[:validated_to] # YYYY-MM-DD
+       end
+
+       # Date decided from and to
+       if params[:decided_from] || params[:decided_to]
+         form_vars['cboSelectDateValue'] = 'DATE_DECISION'
+         form_vars['rbGroup'] = 'rbRange'
+         form_vars['dateStart'] = params[:decided_from].to_s if params[:decided_from] # YYYY-MM-DD
+         form_vars['dateEnd'] = params[:decided_to].to_s if params[:decided_to] # YYYY-MM-DD
+       end
+
+       logger.info "Form variables: #{form_vars.to_s}"
+
+       headers = {
+         'Origin' => base_url,
+         'Referer' => @url,
+       }
+
+       logger.debug "HTTP request headers:"
+       logger.debug(headers.to_s)
+
+       logger.debug "GET: " + @url
+       response = HTTP.headers(headers).get(@url)
+       logger.debug "Response code: HTTP " + response.code.to_s
+
+       if response.code == 200
+         doc = Nokogiri::HTML(response.to_s)
+         asp_vars = {
+           '__VIEWSTATE' => doc.at('#__VIEWSTATE')['value'],
+           '__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value']
+         }
+       else
+         logger.fatal "Bad response from search page. Response code: #{response.code.to_s}."
+         raise RuntimeError.new("Northgate: Bad response from search page. Response code: #{response.code.to_s}.")
+       end
+
+       cookies = {}
+       response.cookies.each { |c| cookies[c.name] = c.value }
+
+       form_vars.merge!(asp_vars)
+
+       logger.debug "POST: " + @url
+       response2 = HTTP.headers(headers).cookies(cookies).post(@url, :form => form_vars)
+       logger.debug "Response code: HTTP " + response2.code.to_s
+
+       if response2.code == 302
+         # Follow the redirect manually
+         # Set the page size (PS) to max so we don't have to page through search results
+         logger.debug "Location: #{response2.headers['Location']}"
+         results_url = URI::encode(base_url + response2.headers['Location'].gsub!('PS=10', 'PS=99999'))
+         logger.debug "GET: " + results_url
+         response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
+         logger.debug "Response code: HTTP " + response3.code.to_s
+         doc = Nokogiri::HTML(response3.to_s)
+       else
+         logger.error "Didn't get redirected from search."
+         raise RuntimeError.new("Northgate: didn't get redirected from search.")
+       end
+
+       rows = doc.search("table.display_table tr")
+       logger.info "Found #{rows.size - 1} applications in search results." # The first row is the header row
+
+       # Iterate over search results
+       rows.each do |row|
+         if row.at("td") # skip header row which only has th's
+           cells = row.search("td")
+
+           app = Application.new
+           app.scraped_at = Time.now
+           app.council_reference = cells[0].inner_text.strip
+           app.info_url = URI::encode(generic_url + cells[0].at('a')[:href].strip)
+           app.info_url.gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
+           app.address = cells[1].inner_text.strip
+           app.description = cells[2].inner_text.strip
+           app.status = cells[3].inner_text.strip
+           raw_date_received = cells[4].inner_text.strip
+           app.date_received = Date.parse(raw_date_received) if raw_date_received != '--'
+           app.decision = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney
+
+           apps << app
+         end
+       end
+       apps
+     end
+   end
+ end
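The Northgate scraper consumes the same @scrape_params hash as the Idox one, so the caller-facing API is identical; only the form variable names differ. A sketch — Hackney appears in the comment above as a Northgate example, but any authority whose URL matches generalsearch.aspx works:

    apps = UKPlanningScraper::Authority.named("Hackney")
             .decided_days(14)
             .scrape(delay: 2)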
lib/uk_planning_scraper/version.rb
@@ -0,0 +1,3 @@
+ module UKPlanningScraper
+   VERSION = "0.4.3"
+ end
lib/uk_planning_scraper.rb
@@ -0,0 +1,13 @@
+ require "uk_planning_scraper/version"
+ require "uk_planning_scraper/authority"
+ require "uk_planning_scraper/authority_scrape_params"
+ require "uk_planning_scraper/application"
+ require 'uk_planning_scraper/idox'
+ require 'uk_planning_scraper/northgate'
+ require 'logger'
+
+ module UKPlanningScraper
+   class SystemNotSupported < StandardError; end
+   class AuthorityNotFound < StandardError; end
+   class TooManySearchResults < StandardError; end
+ end
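These three exception classes are the library's public failure modes; a sketch of defensive calling code:

    begin
      apps = UKPlanningScraper::Authority.named("Barnet").received_days(7).scrape
    rescue UKPlanningScraper::AuthorityNotFound
      warn "No authority with that exact name; use Authority.all to list them"
    rescue UKPlanningScraper::SystemNotSupported => e
      warn e.message  # ocellaweb, agileplanning and unknownsystem can't be scraped yet
    rescue UKPlanningScraper::TooManySearchResults
      warn "Narrow the search: shorter date ranges and/or more parameters"
    end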
uk_planning_scraper.gemspec
@@ -0,0 +1,33 @@
+ # coding: utf-8
+ lib = File.expand_path("../lib", __FILE__)
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+ require "uk_planning_scraper/version"
+
+ Gem::Specification.new do |spec|
+   spec.name          = "uk_planning_scraper"
+   spec.version       = UKPlanningScraper::VERSION
+   spec.authors       = ["Adrian Short"]
+   spec.email         = 'rubygems@adrianshort.org'
+   spec.summary       = %q{Scrape planning applications data from UK council websites.}
+   # spec.description = %q{TODO: Write a longer description or delete this line.}
+   spec.homepage      = "https://github.com/adrianshort/uk_planning_scraper/"
+   spec.licenses      = ['LGPL-3.0']
+
+   spec.files = `git ls-files -z`.split("\x0").reject do |f|
+     f.match(%r{^(test|spec|features)/})
+   end
+   spec.bindir        = "exe"
+   spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+   spec.require_paths = ["lib"]
+
+   spec.add_development_dependency "bundler", "~> 2.0"
+   spec.add_development_dependency "rake", "~> 10.0"
+   spec.add_development_dependency "rspec", "~> 3.8"
+   spec.add_development_dependency "simplecov", "~> 0.16"
+   spec.add_development_dependency "vcr", "~> 4.0"
+   spec.add_development_dependency "webmock", "~> 3.5"
+   spec.add_development_dependency "pry", "~> 0.11"
+
+   spec.add_runtime_dependency "mechanize", "~> 2.7"
+   spec.add_runtime_dependency "http", "~> 3.3"
+ end
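Given this gemspec, pulling the library into a project is the usual Gemfile one-liner; the pessimistic version pin here is a suggestion, not a requirement:

    gem 'uk_planning_scraper', '~> 0.4'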