uk_planning_scraper 0.4.3
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.rspec +2 -0
- data/Gemfile +4 -0
- data/LICENSE +165 -0
- data/README.md +250 -0
- data/Rakefile +2 -0
- data/bin/console +10 -0
- data/bin/setup +8 -0
- data/lib/uk_planning_scraper/application.rb +46 -0
- data/lib/uk_planning_scraper/authorities.csv +113 -0
- data/lib/uk_planning_scraper/authority.rb +140 -0
- data/lib/uk_planning_scraper/authority_scrape_params.rb +134 -0
- data/lib/uk_planning_scraper/idox.rb +182 -0
- data/lib/uk_planning_scraper/northgate.rb +127 -0
- data/lib/uk_planning_scraper/version.rb +3 -0
- data/lib/uk_planning_scraper.rb +13 -0
- data/uk_planning_scraper.gemspec +33 -0
- metadata +185 -0
data/lib/uk_planning_scraper/authority.rb
@@ -0,0 +1,140 @@
require 'csv'

module UKPlanningScraper
  class Authority
    attr_reader :name, :url

    @@authorities = []

    def initialize(name, url)
      @name = name.strip
      @url = url.strip
      @tags = [] # Strings in arbitrary order
      @applications = [] # Application objects
      @scrape_params = {}
    end

    def scrape(options = {})
      default_options = {
        delay: 10,
      }
      # The user-supplied options override the defaults
      options = default_options.merge(options)

      # Select which scraper to use
      case system
      when 'idox'
        @applications = scrape_idox(@scrape_params, options)
      when 'northgate'
        @applications = scrape_northgate(@scrape_params, options)
      else
        raise SystemNotSupported.new("Planning system not supported for \
          #{@name} at URL: #{@url}")
      end

      # Post processing
      @applications.each do |app|
        app.authority_name = @name
      end

      # Output as an array of hashes
      output = []
      # FIXME - silently ignores invalid apps. How should we handle them?
      @applications.each { |app| output << app.to_hash if app.valid? }

      # Reset so that old params don't get used for new scrapes
      clear_scrape_params

      output # Single point of successful exit
    end

    def tags
      @tags.sort
    end

    # Add multiple tags to existing tags
    def add_tags(tags)
      tags.each { |t| add_tag(t) }
    end

    # Add a single tag to existing tags
    def add_tag(tag)
      clean_tag = tag.strip.downcase.gsub(' ', '')
      @tags << clean_tag unless tagged?(clean_tag) # prevent duplicates
    end

    def tagged?(tag)
      @tags.include?(tag)
    end

    def system
      if @url.match(/search\.do\?action=advanced/i)
        'idox'
      elsif @url.match(/generalsearch\.aspx/i)
        'northgate'
      elsif @url.match(/ocellaweb/i)
        'ocellaweb'
      elsif @url.match(/\/apas\//)
        'agileplanning'
      else
        'unknownsystem'
      end
    end

    def self.all
      @@authorities
    end

    # List all the tags in use
    def self.tags
      tags = []
      @@authorities.each { |a| tags << a.tags }
      tags.flatten.uniq.sort
    end

    def self.named(name)
      authority = @@authorities.find { |a| name == a.name }
      raise AuthorityNotFound if authority.nil?
      authority
    end

    # Tagged x
    def self.tagged(tag)
      found = []
      @@authorities.each { |a| found << a if a.tagged?(tag) }
      found
    end

    # Not tagged x
    def self.not_tagged(tag)
      found = []
      @@authorities.each { |a| found << a unless a.tagged?(tag) }
      found
    end

    # Authorities with no tags
    def self.untagged
      found = []
      @@authorities.each { |a| found << a if a.tags.empty? }
      found
    end

    def self.load
      # Don't run this method more than once
      return unless @@authorities.empty?
      CSV.foreach(File.join(File.dirname(__dir__), 'uk_planning_scraper', \
        'authorities.csv'), :headers => true) do |line|
        auth = Authority.new(line['authority_name'], line['url'])

        if line['tags']
          auth.add_tags(line['tags'].split(/\s+/))
        end

        auth.add_tag(auth.system)
        @@authorities << auth
      end
    end
  end
end

UKPlanningScraper::Authority.load
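For orientation, here is a short usage sketch for the Authority registry above (an editorial illustration, not part of this release; "Barnet" is borrowed from the chaining example in authority_scrape_params.rb and must exist in authorities.csv):

require 'uk_planning_scraper' # Authority.load runs on require, reading authorities.csv

# Exact-name lookup; raises UKPlanningScraper::AuthorityNotFound if absent
authority = UKPlanningScraper::Authority.named("Barnet")
puts authority.url
puts authority.system # 'idox', 'northgate', 'ocellaweb', 'agileplanning' or 'unknownsystem'

# Every authority is auto-tagged with its system name by Authority.load
UKPlanningScraper::Authority.tagged('idox').each { |a| puts a.name }
puts UKPlanningScraper::Authority.tags.inspect # all tags in use, sorted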
data/lib/uk_planning_scraper/authority_scrape_params.rb
@@ -0,0 +1,134 @@
require 'date'

module UKPlanningScraper
  class Authority
    # Parameter methods for Authority#scrape
    # Designed to be method chained, eg:
    #
    # applications = UKPlanningScraper::Authority.named("Barnet"). \
    #   development_type("Q22").keywords("illuminat"). \
    #   validated_days(30).scrape

    def validated_days(n)
      # Validated within the last n days
      # Assumes that every scraper/system can do a date range search
      check_class(n, Integer)

      unless n > 0
        raise ArgumentError.new("validated_days must be greater than 0")
      end

      validated_from(Date.today - (n - 1))
      validated_to(Date.today)
      self
    end

    def received_days(n)
      # Received within the last n days
      # Assumes that every scraper/system can do a date range search
      check_class(n, Integer)

      unless n > 0
        raise ArgumentError.new("received_days must be greater than 0")
      end

      received_from(Date.today - (n - 1))
      received_to(Date.today)
      self
    end

    def decided_days(n)
      # Decided within the last n days
      # Assumes that every scraper/system can do a date range search
      check_class(n, Integer)

      unless n > 0
        raise ArgumentError.new("decided_days must be greater than 0")
      end

      decided_from(Date.today - (n - 1))
      decided_to(Date.today)
      self
    end

    def applicant_name(s)
      unless system == 'idox'
        raise NoMethodError.new("applicant_name is only implemented for Idox. \
          This authority (#{@name}) is #{system.capitalize}.")
      end

      check_class(s, String)
      @scrape_params[:applicant_name] = s.strip
      self
    end

    def application_type(s)
      unless system == 'idox'
        raise NoMethodError.new("application_type is only implemented for \
          Idox. This authority (#{@name}) is #{system.capitalize}.")
      end

      check_class(s, String)
      @scrape_params[:application_type] = s.strip
      self
    end

    def development_type(s)
      unless system == 'idox'
        raise NoMethodError.new("development_type is only implemented for \
          Idox. This authority (#{@name}) is #{system.capitalize}.")
      end

      check_class(s, String)
      @scrape_params[:development_type] = s.strip
      self
    end

    private

    # Handle the simple params with this
    def method_missing(method_name, *args)
      sc_params = {
        validated_from: Date,
        validated_to: Date,
        received_from: Date,
        received_to: Date,
        decided_from: Date,
        decided_to: Date,
        keywords: String
      }

      value = args[0]

      if sc_params[method_name]
        check_class(value, sc_params[method_name], method_name.to_s)
        value.strip! if value.class == String

        if value.class == Date && value > Date.today
          raise ArgumentError.new("#{method_name} can't be a date in the " + \
            "future (#{value.to_s})")
        end

        @scrape_params[method_name] = value
        self
      else
        raise NoMethodError.new(method_name.to_s)
      end
    end

    def clear_scrape_params
      @scrape_params = {}
    end

    # https://stackoverflow.com/questions/5100299/how-to-get-the-name-of-the-calling-method
    def check_class(
      param_value,
      expected_class,
      param_name = caller_locations(1, 1)[0].label) # name of calling method
      unless param_value.class == expected_class
        raise TypeError.new("#{param_name} must be a " \
          "#{expected_class} not a #{param_value.class.to_s}")
      end
    end
  end
end
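As a sketch of how the chainable setters above combine (editorial, not part of the diff): the simple parameters such as validated_from and keywords are dispatched through method_missing, while validated_days(n) expands to a validated_from/validated_to pair ending today. The search term here is hypothetical:

require 'date'
require 'uk_planning_scraper'

authority = UKPlanningScraper::Authority.named("Barnet")

# Explicit range via the method_missing-backed setters (Dates, not in the future)
authority.validated_from(Date.new(2019, 1, 1))
         .validated_to(Date.new(2019, 1, 31))
         .keywords("extension") # hypothetical search term

apps = authority.scrape(delay: 5) # params are cleared after a successful scrape

# Or a rolling 30-day window ending today:
# authority.validated_days(30).scrape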
data/lib/uk_planning_scraper/idox.rb
@@ -0,0 +1,182 @@
require 'mechanize'
require 'pp'

module UKPlanningScraper
  class Authority
    private
    def scrape_idox(params, options)
      puts "Using Idox scraper."
      base_url = @url.match(/(https?:\/\/.+?)\//)[1]

      apps = []

      agent = Mechanize.new
      puts "Getting: #{@url}"
      page = agent.get(@url) # load the search form page

      # Check that the search form is actually present.
      # When Idox has an internal error it returns an error page with HTTP 200.
      unless form = page.form('searchCriteriaForm')
        puts "Error: Search form page failed to load due to Idox internal error."
        return []
      end
      # form.action = form.action + '&searchCriteria.resultsPerPage=100'

      # Fill out and submit search form

      # Add expected fields to form if they're not already present so that searches using these terms work
      %w{
        date(applicationReceivedStart)
        date(applicationReceivedEnd)
      }.each { |f| form.add_field!(f) unless form.has_field?(f) }

      date_format = "%d/%m/%Y"

      form.send(:"date(applicationReceivedStart)", params[:received_from].strftime(date_format)) if params[:received_from]
      form.send(:"date(applicationReceivedEnd)", params[:received_to].strftime(date_format)) if params[:received_to]

      form.send(:"date(applicationValidatedStart)", params[:validated_from].strftime(date_format)) if params[:validated_from]
      form.send(:"date(applicationValidatedEnd)", params[:validated_to].strftime(date_format)) if params[:validated_to]

      form.send(:"date(applicationDecisionStart)", params[:decided_from].strftime(date_format)) if params[:decided_from]
      form.send(:"date(applicationDecisionEnd)", params[:decided_to].strftime(date_format)) if params[:decided_to]

      form.send(:"searchCriteria\.description", params[:keywords])

      # Some councils don't have the applicant name on their form, eg Bexley
      form.send(:"searchCriteria\.applicantName", params[:applicant_name]) if form.has_field? 'searchCriteria.applicantName'

      form.send(:"searchCriteria\.caseType", params[:application_type]) if form.has_field? 'searchCriteria.caseType'

      # Only some Idox sites (eg Bolton) have a 'searchCriteria.developmentType' parameter
      form.send(:"searchCriteria\.developmentType", params[:development_type]) if form.has_field? 'searchCriteria.developmentType'

      page = form.submit

      if page.search('.errors').inner_text.match(/Too many results found/i)
        raise TooManySearchResults.new("Scrape in smaller chunks. Use shorter date ranges and/or more search parameters.")
      end

      loop do
        # Parse search results
        items = page.search('li.searchresult')

        puts "Found #{items.size} apps on this page."

        items.each do |app|
          data = Application.new

          # Parse info line
          info_line = app.at("p.metaInfo").inner_text.strip
          bits = info_line.split('|').map { |e| e.strip.delete("\r\n") }

          bits.each do |bit|
            if matches = bit.match(/Ref\. No:\s+(.+)/)
              data.council_reference = matches[1]
            end

            if matches = bit.match(/(Received|Registered):\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/)
              data.date_received = Date.parse(matches[2])
            end

            if matches = bit.match(/Validated:\s+.*(\d{2}\s\w{3}\s\d{2}\d{2}?)/)
              data.date_validated = Date.parse(matches[1])
            end

            if matches = bit.match(/Status:\s+(.+)/)
              data.status = matches[1]
            end
          end

          data.scraped_at = Time.now
          data.info_url = base_url + app.at('a')['href']
          data.address = app.at('p.address').inner_text.strip
          data.description = app.at('a').inner_text.strip

          apps << data
        end

        # Get the Next button from the pager, if there is one
        if next_button = page.at('a.next')
          next_url = base_url + next_button[:href] # + '&searchCriteria.resultsPerPage=100'
          sleep options[:delay]
          puts "Getting: #{next_url}"
          page = agent.get(next_url)
        else
          break
        end
      end

      # Scrape the summary tab for each app
      apps.each_with_index do |app, i|
        sleep options[:delay]
        puts "#{i + 1} of #{apps.size}: #{app.info_url}"
        res = agent.get(app.info_url)

        if res.code == '200' # That's a String not an Integer, ffs
          # Parse the summary tab for this app

          app.scraped_at = Time.now

          # The Documents tab doesn't show if there are no documents (we get li.nodocuments instead)
          # Bradford has #tab_documents but without the document count on it
          app.documents_count = 0

          if documents_link = res.at('.associateddocument a')
            if documents_link.inner_text.match(/\d+/)
              app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
              app.documents_url = base_url + documents_link[:href]
            end
          elsif documents_link = res.at('#tab_documents')
            if documents_link.inner_text.match(/\d+/)
              app.documents_count = documents_link.inner_text.match(/\d+/)[0].to_i
              app.documents_url = base_url + documents_link[:href]
            end
          end

          # We need to find values in the table by using the th labels.
          # The row indexes/positions change from site to site (or even app to app) so we can't rely on that.

          res.search('#simpleDetailsTable tr').each do |row|
            key = row.at('th').inner_text.strip
            value = row.at('td').inner_text.strip

            case key
            when 'Reference'
              app.council_reference = value
            when 'Alternative Reference'
              app.alternative_reference = value unless value.empty?
            when 'Planning Portal Reference'
              app.alternative_reference = value unless value.empty?
            when 'Application Received'
              app.date_received = Date.parse(value) if value.match(/\d/)
            when 'Application Registered'
              app.date_received = Date.parse(value) if value.match(/\d/)
            when 'Application Validated'
              app.date_validated = Date.parse(value) if value.match(/\d/)
            when 'Address'
              app.address = value unless value.empty?
            when 'Proposal'
              app.description = value unless value.empty?
            when 'Status'
              app.status = value unless value.empty?
            when 'Decision'
              app.decision = value unless value.empty?
            when 'Decision Issued Date'
              app.date_decision = Date.parse(value) if value.match(/\d/)
            when 'Appeal Status'
              app.appeal_status = value unless value.empty?
            when 'Appeal Decision'
              app.appeal_decision = value unless value.empty?
            else
              puts "Error: key '#{key}' not found"
            end # case
          end # each row
        else
          puts "Error: HTTP #{res.code}"
        end # if
      end # scrape summary tab for apps
      apps
    end # scrape_idox
  end # class
end
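One consequence of the Idox flow above: TooManySearchResults aborts a scrape before clear_scrape_params runs, so callers usually narrow the search and retry. A hedged sketch (the retry policy and the "shopfront" term are illustrative only; the source's own examples treat Barnet as an Idox site):

require 'uk_planning_scraper'

authority = UKPlanningScraper::Authority.named("Barnet") # an Idox site

begin
  apps = authority.decided_days(30).scrape
rescue UKPlanningScraper::TooManySearchResults
  # Shorter date range plus an extra term, as the error message advises;
  # decided_days overwrites the leftover decided_from/decided_to params
  apps = authority.decided_days(7).keywords("shopfront").scrape
end

puts "#{apps.size} applications scraped"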
data/lib/uk_planning_scraper/northgate.rb
@@ -0,0 +1,127 @@
require 'http'
require 'nokogiri'
require 'logger'

module UKPlanningScraper
  class Authority
    private
    def scrape_northgate(params, options)
      puts "Using Northgate scraper."
      base_url = @url.match(/(https?:\/\/.+?)\//)[1]

      # Remove 'generalsearch.aspx' from the end and add '/Generic/' - case sensitive?
      generic_url = @url.match(/.+\//)[0] + 'Generic/'

      apps = []

      $stdout.sync = true # Flush output buffer after every write so log messages appear immediately.
      logger = Logger.new($stdout)
      logger.level = Logger::DEBUG

      date_regex = /\d{2}-\d{2}-\d{4}/

      form_vars = {
        'csbtnSearch' => 'Search' # required
      }

      form_vars['txtProposal'] = params[:keywords]

      # Date received from and to
      if params[:received_from] || params[:received_to]
        form_vars['cboSelectDateValue'] = 'DATE_RECEIVED'
        form_vars['rbGroup'] = 'rbRange'
        form_vars['dateStart'] = params[:received_from].to_s if params[:received_from] # YYYY-MM-DD
        form_vars['dateEnd'] = params[:received_to].to_s if params[:received_to] # YYYY-MM-DD
      end

      # Date validated from and to
      if params[:validated_from] || params[:validated_to]
        form_vars['cboSelectDateValue'] = 'DATE_VALID'
        form_vars['rbGroup'] = 'rbRange'
        form_vars['dateStart'] = params[:validated_from].to_s if params[:validated_from] # YYYY-MM-DD
        form_vars['dateEnd'] = params[:validated_to].to_s if params[:validated_to] # YYYY-MM-DD
      end

      # Date decided from and to
      if params[:decided_from] || params[:decided_to]
        form_vars['cboSelectDateValue'] = 'DATE_DECISION'
        form_vars['rbGroup'] = 'rbRange'
        form_vars['dateStart'] = params[:decided_from].to_s if params[:decided_from] # YYYY-MM-DD
        form_vars['dateEnd'] = params[:decided_to].to_s if params[:decided_to] # YYYY-MM-DD
      end

      logger.info "Form variables: #{form_vars.to_s}"

      headers = {
        'Origin' => base_url,
        'Referer' => @url,
      }

      logger.debug "HTTP request headers:"
      logger.debug(headers.to_s)

      logger.debug "GET: " + @url
      response = HTTP.headers(headers).get(@url)
      logger.debug "Response code: HTTP " + response.code.to_s

      if response.code == 200
        doc = Nokogiri::HTML(response.to_s)
        asp_vars = {
          '__VIEWSTATE' => doc.at('#__VIEWSTATE')['value'],
          '__EVENTVALIDATION' => doc.at('#__EVENTVALIDATION')['value']
        }
      else
        logger.fatal "Bad response from search page. Response code: #{response.code.to_s}."
        raise RuntimeError.new("Northgate: Bad response from search page. Response code: #{response.code.to_s}.")
      end

      cookies = {}
      response.cookies.each { |c| cookies[c.name] = c.value }

      form_vars.merge!(asp_vars)

      logger.debug "POST: " + @url
      response2 = HTTP.headers(headers).cookies(cookies).post(@url, :form => form_vars)
      logger.debug "Response code: HTTP " + response2.code.to_s

      if response2.code == 302
        # Follow the redirect manually
        # Set the page size (PS) to max so we don't have to page through search results
        logger.debug "Location: #{response2.headers['Location']}"
        results_url = URI::encode(base_url + response2.headers['Location'].gsub!('PS=10', 'PS=99999'))
        logger.debug "GET: " + results_url
        response3 = HTTP.headers(headers).cookies(cookies).get(results_url)
        logger.debug "Response code: HTTP " + response3.code.to_s
        doc = Nokogiri::HTML(response3.to_s)
      else
        logger.error "Didn't get redirected from search."
        raise RuntimeError.new("Northgate: didn't get redirected from search.")
      end

      rows = doc.search("table.display_table tr")
      logger.info "Found #{rows.size - 1} applications in search results." # The first row is the header row

      # Iterate over search results
      rows.each do |row|
        if row.at("td") # skip header row which only has th's
          cells = row.search("td")

          app = Application.new
          app.scraped_at = Time.now
          app.council_reference = cells[0].inner_text.strip
          app.info_url = URI::encode(generic_url + cells[0].at('a')[:href].strip)
          app.info_url.gsub!(/%0./, '') # FIXME. Strip junk chars from URL - how can we prevent this?
          app.address = cells[1].inner_text.strip
          app.description = cells[2].inner_text.strip
          app.status = cells[3].inner_text.strip
          raw_date_received = cells[4].inner_text.strip
          app.date_received = Date.parse(raw_date_received) if raw_date_received != '--'
          app.decision = cells[5].inner_text.strip if cells[5] # Some councils don't have this column, eg Hackney

          apps << app
        end
      end
      apps
    end
  end
end
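The Northgate scraper above posts dates as ISO YYYY-MM-DD strings (Date#to_s), so a received-date search is just a pair of Dates. A sketch, assuming Hackney (named in the cells[5] comment) resolves to a Northgate site, and that Application#to_hash, which is not shown in this diff, keys its fields by attribute name:

require 'date'
require 'uk_planning_scraper'

authority = UKPlanningScraper::Authority.named("Hackney")

apps = authority.received_from(Date.today - 14)
                .received_to(Date.today)
                .scrape

# Assumed to_hash keys, matching the attribute names set above
apps.each { |app| puts [app[:council_reference], app[:status]].join(' | ') }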
data/lib/uk_planning_scraper.rb
@@ -0,0 +1,13 @@
require "uk_planning_scraper/version"
require "uk_planning_scraper/authority"
require "uk_planning_scraper/authority_scrape_params"
require "uk_planning_scraper/application"
require 'uk_planning_scraper/idox'
require 'uk_planning_scraper/northgate'
require 'logger'

module UKPlanningScraper
  class SystemNotSupported < StandardError; end
  class AuthorityNotFound < StandardError; end
  class TooManySearchResults < StandardError; end
end
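Since these three StandardError subclasses are the gem's public failure modes, a caller can distinguish them like this (an editorial sketch, not part of the release):

require 'uk_planning_scraper'

def scrape_last_week(name)
  UKPlanningScraper::Authority.named(name).received_days(7).scrape
rescue UKPlanningScraper::AuthorityNotFound
  warn "No authority named #{name} in authorities.csv"
  []
rescue UKPlanningScraper::SystemNotSupported
  warn "#{name} runs a planning system this gem doesn't scrape"
  []
rescue UKPlanningScraper::TooManySearchResults
  warn "Search too broad for #{name}; use a shorter date range"
  []
end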
data/uk_planning_scraper.gemspec
@@ -0,0 +1,33 @@
# coding: utf-8
lib = File.expand_path("../lib", __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require "uk_planning_scraper/version"

Gem::Specification.new do |spec|
  spec.name = "uk_planning_scraper"
  spec.version = UKPlanningScraper::VERSION
  spec.authors = ["Adrian Short"]
  spec.email = 'rubygems@adrianshort.org'
  spec.summary = %q{Scrape planning applications data from UK council websites.}
  # spec.description = %q{TODO: Write a longer description or delete this line.}
  spec.homepage = "https://github.com/adrianshort/uk_planning_scraper/"
  spec.licenses = ['LGPL-3.0']

  spec.files = `git ls-files -z`.split("\x0").reject do |f|
    f.match(%r{^(test|spec|features)/})
  end
  spec.bindir = "exe"
  spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "bundler", "~> 2.0"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec", "~> 3.8"
  spec.add_development_dependency "simplecov", "~> 0.16"
  spec.add_development_dependency "vcr", "~> 4.0"
  spec.add_development_dependency "webmock", "~> 3.5"
  spec.add_development_dependency "pry", "~> 0.11"

  spec.add_runtime_dependency "mechanize", "~> 2.7"
  spec.add_runtime_dependency "http", "~> 3.3"
end
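To depend on this release with its mechanize and http runtime dependencies, the usual Bundler line suffices (editorial note, not part of the gem):

# Gemfile
gem 'uk_planning_scraper', '~> 0.4.3'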