uk_planning_scraper 0.4.3

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,140 @@
1
+ require 'csv'
2
+
3
module UKPlanningScraper
  # Represents a single local planning authority and is the public entry
  # point for scraping its planning applications.
  #
  # A class-level registry of every known authority is populated from
  # uk_planning_scraper/authorities.csv by Authority.load.
  class Authority
    attr_reader :name, :url

    # Registry of all known authorities. A class-level instance variable is
    # used rather than a class variable (@@) so the state is not shared
    # across any future subclasses.
    @authorities = []

    class << self
      # All known authorities, in the order they appear in the CSV.
      def all
        @authorities
      end

      # Every tag in use across all authorities, deduplicated and sorted.
      def tags
        @authorities.flat_map(&:tags).uniq.sort
      end

      # Find an authority by its exact name.
      # Raises AuthorityNotFound if no authority has that name.
      def named(name)
        authority = @authorities.find { |a| name == a.name }
        raise AuthorityNotFound if authority.nil?
        authority
      end

      # Authorities tagged with +tag+.
      def tagged(tag)
        @authorities.select { |a| a.tagged?(tag) }
      end

      # Authorities NOT tagged with +tag+.
      def not_tagged(tag)
        @authorities.reject { |a| a.tagged?(tag) }
      end

      # Authorities with no tags at all.
      def untagged
        @authorities.select { |a| a.tags.empty? }
      end

      # Populate the registry from uk_planning_scraper/authorities.csv.
      # Idempotent: does nothing if the registry is already populated.
      def load
        return unless @authorities.empty?
        csv_path = File.join(File.dirname(__dir__), 'uk_planning_scraper',
                             'authorities.csv')
        CSV.foreach(csv_path, :headers => true) do |line|
          auth = Authority.new(line['authority_name'], line['url'])
          auth.add_tags(line['tags'].split(/\s+/)) if line['tags']
          auth.add_tag(auth.system) # tag each authority with its back-end system
          @authorities << auth
        end
      end
    end

    def initialize(name, url)
      @name = name.strip
      @url = url.strip
      @tags = [] # Strings in arbitrary order
      @applications = [] # Application objects
      @scrape_params = {}
    end

    # Run a scrape using any search parameters previously set via the
    # chainable parameter methods (see authority_scrape_params.rb).
    #
    # options - Hash; :delay is the number of seconds to sleep between
    #           HTTP requests (default 10).
    #
    # Returns an Array of Hashes, one per valid application found.
    # Raises SystemNotSupported if the authority's back end is not supported.
    def scrape(options = {})
      default_options = {
        delay: 10,
      }
      # The user-supplied options override the defaults
      options = default_options.merge(options)

      # Select which scraper to use
      case system
      when 'idox'
        @applications = scrape_idox(@scrape_params, options)
      when 'northgate'
        @applications = scrape_northgate(@scrape_params, options)
      else
        raise SystemNotSupported.new(
          "Planning system not supported for #{@name} at URL: #{@url}")
      end

      # Post processing
      @applications.each { |app| app.authority_name = @name }

      # Output as an array of hashes
      # FIXME - silently ignores invalid apps. How should we handle them?
      output = @applications.select(&:valid?).map(&:to_hash)

      # Reset so that old params don't get used for new scrapes
      clear_scrape_params

      output # Single point of successful exit
    end

    # Tags sorted alphabetically.
    def tags
      @tags.sort
    end

    # Add multiple tags to existing tags
    def add_tags(tags)
      tags.each { |t| add_tag(t) }
    end

    # Add a single tag, normalised to lower case with spaces removed.
    def add_tag(tag)
      clean_tag = tag.strip.downcase.delete(' ')
      @tags << clean_tag unless tagged?(clean_tag) # prevent duplicates
    end

    def tagged?(tag)
      @tags.include?(tag)
    end

    # Detect the back-end planning system from the shape of the URL.
    def system
      if @url.match(/search\.do\?action=advanced/i)
        'idox'
      elsif @url.match(/generalsearch\.aspx/i)
        'northgate'
      elsif @url.match(/ocellaweb/i)
        'ocellaweb'
      elsif @url.match(/\/apas\//)
        'agileplanning'
      else
        'unknownsystem'
      end
    end
  end
end
139
+
140
+ UKPlanningScraper::Authority.load
@@ -0,0 +1,134 @@
1
+ require 'date'
2
+
3
module UKPlanningScraper
  class Authority
    # Parameter methods for Authority#scrape
    # Designed to be method chained, eg:
    #
    # applications = UKPlanningScraper::Authority.named("Barnet"). \
    #   development_type("Q22").keywords("illuminat"). \
    #   validated_days(30).scrape

    # Restrict the scrape to applications validated within the last n days
    # (inclusive of today).
    # Assumes that every scraper/system can do a date range search.
    # Returns self so calls can be chained.
    def validated_days(n)
      # NOTE: Integer, not Fixnum — Fixnum was removed in Ruby 3.2.
      check_class(n, Integer)

      unless n > 0
        raise ArgumentError.new("validated_days must be greater than 0")
      end

      validated_from(Date.today - (n - 1))
      validated_to(Date.today)
      self
    end

    # Restrict the scrape to applications received within the last n days
    # (inclusive of today).
    # Assumes that every scraper/system can do a date range search.
    # Returns self so calls can be chained.
    def received_days(n)
      check_class(n, Integer)

      unless n > 0
        raise ArgumentError.new("received_days must be greater than 0")
      end

      received_from(Date.today - (n - 1))
      received_to(Date.today)
      self
    end

    # Restrict the scrape to applications decided within the last n days
    # (inclusive of today).
    # Assumes that every scraper/system can do a date range search.
    # Returns self so calls can be chained.
    def decided_days(n)
      check_class(n, Integer)

      unless n > 0
        raise ArgumentError.new("decided_days must be greater than 0")
      end

      decided_from(Date.today - (n - 1))
      decided_to(Date.today)
      self
    end

    # Search by applicant name. Only implemented for Idox systems.
    # Returns self so calls can be chained.
    def applicant_name(s)
      unless system == 'idox'
        raise NoMethodError.new("applicant_name is only implemented for " \
          "Idox. This authority (#{@name}) is #{system.capitalize}.")
      end

      check_class(s, String)
      @scrape_params[:applicant_name] = s.strip
      self
    end

    # Search by application type. Only implemented for Idox systems.
    # Returns self so calls can be chained.
    def application_type(s)
      unless system == 'idox'
        raise NoMethodError.new("application_type is only implemented for " \
          "Idox. This authority (#{@name}) is #{system.capitalize}.")
      end

      check_class(s, String)
      @scrape_params[:application_type] = s.strip
      self
    end

    # Search by development type. Only implemented for Idox systems.
    # Returns self so calls can be chained.
    def development_type(s)
      unless system == 'idox'
        raise NoMethodError.new("development_type is only implemented for " \
          "Idox. This authority (#{@name}) is #{system.capitalize}.")
      end

      check_class(s, String)
      @scrape_params[:development_type] = s.strip
      self
    end

    private

    # The simple chainable params handled generically by method_missing,
    # mapped to the class each value must be.
    SIMPLE_PARAMS = {
      validated_from: Date,
      validated_to: Date,
      received_from: Date,
      received_to: Date,
      decided_from: Date,
      decided_to: Date,
      keywords: String
    }.freeze

    # Handle the simple params with this
    def method_missing(method_name, *args)
      expected_class = SIMPLE_PARAMS[method_name]
      raise NoMethodError.new(method_name.to_s) unless expected_class

      value = args[0]
      check_class(value, expected_class, method_name.to_s)
      # Strip without mutating the caller's string object.
      value = value.strip if value.is_a?(String)

      if value.is_a?(Date) && value > Date.today
        raise ArgumentError.new("#{method_name} can't be a date in the " +
          "future (#{value.to_s})")
      end

      @scrape_params[method_name] = value
      self
    end

    # Advertise the dynamically handled params to respond_to?.
    def respond_to_missing?(method_name, include_private = false)
      SIMPLE_PARAMS.key?(method_name) || super
    end

    # Forget all chained search params (called after each scrape).
    def clear_scrape_params
      @scrape_params = {}
    end

    # Raise TypeError unless param_value is an expected_class.
    # param_name defaults to the name of the calling method, see:
    # https://stackoverflow.com/questions/5100299/how-to-get-the-name-of-the-calling-method
    def check_class(
      param_value,
      expected_class,
      param_name = caller_locations(1, 1)[0].label) # name of calling method
      unless param_value.is_a?(expected_class)
        raise TypeError.new("#{param_name} must be a " \
          "#{expected_class} not a #{param_value.class.to_s}")
      end
    end
  end
end
@@ -0,0 +1,182 @@
1
+ require 'mechanize'
2
+ require 'pp'
3
+
4
module UKPlanningScraper
  class Authority
    private

    # Scrape planning applications from an Idox "Public Access" site.
    #
    # params  - Hash of search criteria assembled by the chainable param
    #           methods (:received_from/:received_to, :validated_from/
    #           :validated_to, :decided_from/:decided_to, :keywords,
    #           :applicant_name, :application_type, :development_type).
    # options - Hash; :delay is the number of seconds to sleep between
    #           HTTP requests.
    #
    # Returns an Array of Application objects (possibly empty).
    # Raises TooManySearchResults when the site refuses the search because
    # the result set is too large.
    def scrape_idox(params, options)
      puts "Using Idox scraper."
      # Scheme + host of the search URL, used to absolutise relative links.
      base_url = @url.match(/(https?:\/\/.+?)\//)[1]

      apps = []

      agent = Mechanize.new
      puts "Getting: #{@url}"
      page = agent.get(@url) # load the search form page

      # Check that the search form is actually present.
      # When Idox has an internal error it returns an error page with HTTP 200.
      unless form = page.form('searchCriteriaForm')
        puts "Error: Search form page failed to load due to Idox internal error."
        return []
      end
      # form.action = form.action + '&searchCriteria.resultsPerPage=100'

      # Fill out and submit search form

      # Add expected fields to form if they're not already present so that searches using these terms work
      %w{
        date(applicationReceivedStart)
        date(applicationReceivedEnd)
      }.each { |f| form.add_field!(f) unless form.has_field?(f) }

      date_format = "%d/%m/%Y"

      # The field names contain parentheses and dots, so they are set via
      # Mechanize's dynamic accessors through #send.
      form.send(:"date(applicationReceivedStart)", params[:received_from].strftime(date_format)) if params[:received_from]
      form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime(date_format)) if params[:received_to]

      form.send(:"date(applicationValidatedStart)", params[:validated_from].strftime(date_format)) if params[:validated_from]
      form.send(:"date(applicationValidatedEnd)", params[:validated_to].strftime(date_format)) if params[:validated_to]

      form.send(:"date(applicationDecisionStart)", params[:decided_from].strftime(date_format)) if params[:decided_from]
      form.send(:"date(applicationDecisionEnd)", params[:decided_to].strftime(date_format)) if params[:decided_to]

      form.send(:"searchCriteria\.description", params[:keywords])

      # Some councils don't have the applicant name on their form, eg Bexley
      form.send(:"searchCriteria\.applicantName", params[:applicant_name]) if form.has_field? 'searchCriteria.applicantName'

      form.send(:"searchCriteria\.caseType", params[:application_type]) if form.has_field? 'searchCriteria.caseType'

      # Only some Idox sites (eg Bolton) have a 'searchCriteria.developmentType' parameter
      form.send(:"searchCriteria\.developmentType", params[:development_type]) if form.has_field? 'searchCriteria.developmentType'

      page = form.submit

      if page.search('.errors').inner_text.match(/Too many results found/i)
        raise TooManySearchResults.new("Scrape in smaller chunks. Use shorter date ranges and/or more search parameters.")
      end

      # Walk every page of search results, collecting the basic data
      # shown on the results listing for each application.
      loop do
        # Parse search results
        items = page.search('li.searchresult')

        puts "Found #{items.size} apps on this page."

        items.each do |app|
          data = Application.new

          # Parse info line, eg
          # "Ref. No: X/123 | Received: 13 Mar 2018 | Validated: 13 Mar 2018 | Status: Pending"
          info_line = app.at("p.metaInfo").inner_text.strip
          bits = info_line.split('|').map { |e| e.strip.delete("\r\n") }

          bits.each do |bit|
            if matches = bit.match(/Ref\. No:\s+(.+)/)
              data.council_reference = matches[1]
            end

            # Dates look like "13 Mar 2018"; capture group 2 is the date
            # because group 1 is the (Received|Registered) label.
            if matches = bit.match(/(Received|Registered):\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/)
              data.date_received = Date.parse(matches[2])
            end

            if matches = bit.match(/Validated:\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/)
              data.date_validated = Date.parse(matches[1])
            end

            if matches = bit.match(/Status:\s+(.+)/)
              data.status = matches[1]
            end
          end

          data.scraped_at = Time.now
          data.info_url = base_url + app.at('a')['href']
          data.address = app.at('p.address').inner_text.strip
          data.description = app.at('a').inner_text.strip

          apps << data
        end

        # Get the Next button from the pager, if there is one
        if next_button = page.at('a.next')
          next_url = base_url + next_button[:href]# + '&searchCriteria.resultsPerPage=100'
          sleep options[:delay]
          puts "Getting: #{next_url}"
          page = agent.get(next_url)
        else
          break
        end
      end

      # Scrape the summary tab for each app to fill in the full details.
      apps.each_with_index do |app, i|
        sleep options[:delay]
        puts "#{i + 1} of #{apps.size}: #{app.info_url}"
        res = agent.get(app.info_url)

        if res.code == '200' # That's a String not an Integer, ffs
          # Parse the summary tab for this app

          app.scraped_at = Time.now

          # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
          # Bradford has #tab_documents but without the document count on it
          app.documents_count = 0

          if documents_link = res.at('.associateddocument a')
            if documents_link.inner_text.match(/\d+/)
              app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
              app.documents_url = base_url + documents_link[:href]
            end
          elsif documents_link = res.at('#tab_documents')
            if documents_link.inner_text.match(/\d+/)
              app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
              app.documents_url = base_url + documents_link[:href]
            end
          end

          # We need to find values in the table by using the th labels.
          # The row indexes/positions change from site to site (or even app to app) so we can't rely on that.

          res.search('#simpleDetailsTable tr').each do |row|
            key = row.at('th').inner_text.strip
            value = row.at('td').inner_text.strip

            case key
            when 'Reference'
              app.council_reference = value
            when 'Alternative Reference'
              app.alternative_reference = value unless value.empty?
            when 'Planning Portal Reference'
              app.alternative_reference = value unless value.empty?
            when 'Application Received'
              app.date_received = Date.parse(value) if value.match(/\d/)
            when 'Application Registered'
              app.date_received = Date.parse(value) if value.match(/\d/)
            when 'Application Validated'
              app.date_validated = Date.parse(value) if value.match(/\d/)
            when 'Address'
              app.address = value unless value.empty?
            when 'Proposal'
              app.description = value unless value.empty?
            when 'Status'
              app.status = value unless value.empty?
            when 'Decision'
              app.decision = value unless value.empty?
            when 'Decision Issued Date'
              app.date_decision = Date.parse(value) if value.match(/\d/)
            when 'Appeal Status'
              app.appeal_status = value unless value.empty?
            when 'Appeal Decision'
              app.appeal_decision = value unless value.empty?
            else
              puts "Error: key '#{key}' not found"
            end # case
          end # each row
        else
          puts "Error: HTTP #{res.code}"
        end # if
      end # scrape summary tab for apps
      apps
    end # scrape_idox
  end # class
end
@@ -0,0 +1,127 @@
1
require 'http'
require 'logger'
require 'nokogiri'
require 'uri'
4
+
5
module UKPlanningScraper
  class Authority
    private

    # Scrape planning applications from a Northgate Planning Explorer site.
    #
    # params  - Hash of search criteria assembled by the chainable param
    #           methods (:keywords plus the received/validated/decided date
    #           ranges). Northgate takes a single date filter per search,
    #           so when several ranges are present the later assignment
    #           wins (received, then validated, then decided).
    # options - Hash; accepted for interface parity with scrape_idox.
    #           No inter-request delay is used here because all results are
    #           fetched in a single request (page size set to 99999).
    #
    # Returns an Array of Application objects.
    # Raises RuntimeError when the search page can't be fetched or the
    # expected post-search redirect doesn't happen.
    def scrape_northgate(params, options)
      puts "Using Northgate scraper."
      # Scheme + host, used for the Origin header and to absolutise URLs.
      base_url = @url.match(/(https?:\/\/.+?)\//)[1]

      # Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive?
      generic_url = @url.match(/.+\//)[0] + 'Generic/'

      apps = []

      $stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
      logger = Logger.new($stdout)
      logger.level = Logger::DEBUG

      form_vars = {
        'csbtnSearch' => 'Search' # required
      }

      form_vars['txtProposal'] = params[:keywords]

      # Date received from and to
      if params[:received_from] || params[:received_to]
        form_vars['cboSelectDateValue'] = 'DATE_RECEIVED'
        form_vars['rbGroup'] = 'rbRange'
        form_vars['dateStart'] = params[:received_from].to_s if params[:received_from] # YYYY-MM-DD
        form_vars['dateEnd'] = params[:received_to].to_s if params[:received_to] # YYYY-MM-DD
      end

      # Date validated from and to
      if params[:validated_from] || params[:validated_to]
        form_vars['cboSelectDateValue'] = 'DATE_VALID'
        form_vars['rbGroup'] = 'rbRange'
        form_vars['dateStart'] = params[:validated_from].to_s if params[:validated_from] # YYYY-MM-DD
        form_vars['dateEnd'] = params[:validated_to].to_s if params[:validated_to] # YYYY-MM-DD
      end

      # Date decided from and to
      if params[:decided_from] || params[:decided_to]
        form_vars['cboSelectDateValue'] = 'DATE_DECISION'
        form_vars['rbGroup'] = 'rbRange'
        form_vars['dateStart'] = params[:decided_from].to_s if params[:decided_from] # YYYY-MM-DD
        form_vars['dateEnd'] = params[:decided_to].to_s if params[:decided_to] # YYYY-MM-DD
      end

      logger.info "Form variables: #{form_vars.to_s}"

      headers = {
        'Origin' => base_url,
        'Referer' => @url,
      }

      logger.debug "HTTP request headers:"
      logger.debug(headers.to_s)

      logger.debug "GET: " + @url
      response = HTTP.headers(headers).get(@url)
      logger.debug "Response code: HTTP " + response.code.to_s

      if response.code == 200
        doc = Nokogiri::HTML(response.to_s)
        # The ASP.NET state fields must be echoed back in the POST or the
        # server rejects the form submission.
        asp_vars = {
          '__VIEWSTATE' => doc.at('#__VIEWSTATE')['value'],
          '__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value']
        }
      else
        logger.fatal "Bad response from search page. Response code: #{response.code.to_s}."
        raise RuntimeError.new("Northgate: Bad response from search page. Response code: #{response.code.to_s}.")
      end

      cookies = {}
      response.cookies.each { |c| cookies[c.name] = c.value }

      form_vars.merge!(asp_vars)

      logger.debug "POST: " + @url
      response2 = HTTP.headers(headers).cookies(cookies).post(@url, :form => form_vars)
      logger.debug "Response code: HTTP " + response2.code.to_s

      if response2.code == 302
        # Follow the redirect manually
        # Set the page size (PS) to max so we don't have to page through search results
        logger.debug "Location: #{response2.headers['Location']}"
        # Non-destructive gsub: gsub! returns nil when 'PS=10' is absent,
        # which would crash the string concatenation.
        # URI::DEFAULT_PARSER.escape replaces URI.encode (removed in Ruby 3.0).
        results_url = URI::DEFAULT_PARSER.escape(
          base_url + response2.headers['Location'].gsub('PS=10', 'PS=99999'))
        logger.debug "GET: " + results_url
        response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
        logger.debug "Response code: HTTP " + response3.code.to_s
        doc = Nokogiri::HTML(response3.to_s)
      else
        logger.error "Didn't get redirected from search."
        raise RuntimeError.new("Northgate: didn't get redirected from search.")
      end

      rows = doc.search("table.display_table tr")
      logger.info "Found #{rows.size - 1} applications in search results." # The first row is the header row

      # Iterate over search results
      rows.each do |row|
        if row.at("td") # skip header row which only has th's
          cells = row.search("td")

          app = Application.new
          app.scraped_at = Time.now
          app.council_reference = cells[0].inner_text.strip
          app.info_url = URI::DEFAULT_PARSER.escape(generic_url + cells[0].at('a')[:href].strip)
          app.info_url.gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
          app.address = cells[1].inner_text.strip
          app.description = cells[2].inner_text.strip
          app.status = cells[3].inner_text.strip
          raw_date_received = cells[4].inner_text.strip
          app.date_received = Date.parse(raw_date_received) if raw_date_received != '--'
          app.decision = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney

          apps << app
        end
      end
      apps
    end
  end
end
@@ -0,0 +1,3 @@
1
module UKPlanningScraper
  # Gem version, following Semantic Versioning (https://semver.org).
  # Frozen so shared references can't mutate it.
  VERSION = "0.4.3".freeze
end
@@ -0,0 +1,13 @@
1
+ require "uk_planning_scraper/version"
2
+ require "uk_planning_scraper/authority"
3
+ require "uk_planning_scraper/authority_scrape_params"
4
+ require "uk_planning_scraper/application"
5
+ require 'uk_planning_scraper/idox'
6
+ require 'uk_planning_scraper/northgate'
7
+ require 'logger'
8
+
9
module UKPlanningScraper
  # Raised by Authority#scrape when the authority's URL doesn't match any
  # planning system we have a scraper for.
  class SystemNotSupported < StandardError
  end

  # Raised by Authority.named when no authority matches the given name.
  class AuthorityNotFound < StandardError
  end

  # Raised by the Idox scraper when the site reports the result set is too
  # large; scrape in smaller chunks instead.
  class TooManySearchResults < StandardError
  end
end
@@ -0,0 +1,33 @@
1
# coding: utf-8
# Gem specification for uk_planning_scraper.
# Put lib/ on the load path so the version constant can be required
# before the gem is installed.
lib = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require "uk_planning_scraper/version"

Gem::Specification.new do |spec|
  spec.name = "uk_planning_scraper"
  spec.version = UKPlanningScraper::VERSION
  spec.authors = ["Adrian Short"]
  spec.email = 'rubygems@adrianshort.org'
  spec.summary = %q{Scrape planning applications data from UK council websites.}
  # spec.description = %q{TODO: Write a longer description or delete this line.}
  spec.homepage = "https://github.com/adrianshort/uk_planning_scraper/"
  spec.licenses = ['LGPL-3.0']

  # Package every git-tracked file except tests/specs/features.
  spec.files = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(test|spec|features)/})
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  # Development-only dependencies.
  spec.add_development_dependency "bundler", "~> 2.0"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.8"
  spec.add_development_dependency "simplecov", "~> 0.16"
  spec.add_development_dependency "vcr", "~> 4.0"
  spec.add_development_dependency "webmock", "~> 3.5"
  spec.add_development_dependency "pry", "~> 0.11"

  # Runtime dependencies: mechanize drives the Idox scraper,
  # http drives the Northgate scraper.
  spec.add_runtime_dependency "mechanize", "~> 2.7"
  spec.add_runtime_dependency "http", "~> 3.3"
end
+ end