bplgeo 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Rakefile +32 -0
- data/lib/bplgeo/constants.rb +478 -0
- data/lib/bplgeo/geonames.rb +146 -0
- data/lib/bplgeo/parser.rb +227 -0
- data/lib/bplgeo/standardizer.rb +213 -0
- data/lib/bplgeo/tgn.rb +314 -0
- data/lib/bplgeo/town_lookup.rb +19 -0
- data/lib/bplgeo/version.rb +3 -0
- data/lib/bplgeo.rb +35 -0
- data/lib/tasks/bplgeo_tasks.rake +4 -0
- data/test/bplgeo_test.rb +102 -0
- data/test/dummy/README.rdoc +28 -0
- data/test/dummy/Rakefile +6 -0
- data/test/dummy/app/assets/javascripts/application.js +13 -0
- data/test/dummy/app/assets/stylesheets/application.css +13 -0
- data/test/dummy/app/controllers/application_controller.rb +5 -0
- data/test/dummy/app/helpers/application_helper.rb +2 -0
- data/test/dummy/app/views/layouts/application.html.erb +14 -0
- data/test/dummy/bin/bundle +3 -0
- data/test/dummy/bin/rails +4 -0
- data/test/dummy/bin/rake +4 -0
- data/test/dummy/config/application.rb +23 -0
- data/test/dummy/config/boot.rb +5 -0
- data/test/dummy/config/bplgeo.yml +23 -0
- data/test/dummy/config/bplgeo.yml.sample +24 -0
- data/test/dummy/config/database.yml +25 -0
- data/test/dummy/config/environment.rb +5 -0
- data/test/dummy/config/environments/development.rb +29 -0
- data/test/dummy/config/environments/production.rb +80 -0
- data/test/dummy/config/environments/test.rb +36 -0
- data/test/dummy/config/initializers/backtrace_silencers.rb +7 -0
- data/test/dummy/config/initializers/filter_parameter_logging.rb +4 -0
- data/test/dummy/config/initializers/inflections.rb +16 -0
- data/test/dummy/config/initializers/mime_types.rb +5 -0
- data/test/dummy/config/initializers/secret_token.rb +12 -0
- data/test/dummy/config/initializers/session_store.rb +3 -0
- data/test/dummy/config/initializers/wrap_parameters.rb +14 -0
- data/test/dummy/config/locales/en.yml +23 -0
- data/test/dummy/config/routes.rb +56 -0
- data/test/dummy/config.ru +4 -0
- data/test/dummy/db/test.sqlite3 +0 -0
- data/test/dummy/log/development.log +35 -0
- data/test/dummy/public/404.html +58 -0
- data/test/dummy/public/422.html +58 -0
- data/test/dummy/public/500.html +57 -0
- data/test/dummy/public/favicon.ico +0 -0
- data/test/geonames_test.rb +24 -0
- data/test/parser_test.rb +33 -0
- data/test/test_helper.rb +15 -0
- data/test/tgn_test.rb +19 -0
- data/test/town_lookup_test.rb +11 -0
- metadata +236 -0
@@ -0,0 +1,146 @@
|
|
1
|
+
module Bplgeo
|
2
|
+
class Geonames
|
3
|
+
# Loads (once) and returns the bplgeo settings for the current Rails
# environment from <root>/config/bplgeo.yml. The file is rendered through
# ERB before being parsed as YAML.
#
# Returns the environment's settings with indifferent (string/symbol) access.
def self.bplgeo_config
  config_root = Rails.root || './test/dummy'
  environment = Rails.env || 'test'
  return @bplgeo_config if @bplgeo_config

  yaml_path = File.join(config_root, 'config', 'bplgeo.yml')
  rendered_yaml = ERB.new(IO.read(yaml_path)).result
  @bplgeo_config = YAML::load(rendered_yaml)[environment].with_indifferent_access
end
|
8
|
+
|
9
|
+
# Returns the configured GeoNames account name, or the placeholder
# '<username>' when the configuration does not provide one.
def self.geonames_username
  configured_name = bplgeo_config[:geonames_username]
  return configured_name if configured_name

  '<username>'
end
|
12
|
+
|
13
|
+
# Fetches the GeoNames hierarchy for one geoname id and flattens it.
#
# geoname_id - the GeoNames identifier; a String or anything with #to_s
#              (Integer ids are accepted).
#
# Makes up to three requests, sleeping 60 seconds between attempts, while the
# service answers HTTP 500.
#
# Returns {} if every attempt answered 500; otherwise a hash with
#   :coords   => { :latitude, :longitude, :combined, :box => { :west, :north, :east, :south } }
#   :hier_geo => { <downcased fcode as symbol> => toponymName } or nil when empty
def self.get_geonames_data(geoname_id)
  max_retry = 3
  sleep_time = 60 # In seconds

  hier_geo = {}
  coords = {}
  geonames_data = {}

  # to_s + escape so numeric ids work and nothing unexpected ends up in the query string.
  request_url = "http://api.geonames.org/hierarchy?username=#{self.geonames_username}&lang=en&style=FULL&geonameId=#{CGI.escape(geoname_id.to_s)}"

  geonames_response = nil
  max_retry.times do |attempt|
    sleep(sleep_time) if attempt > 0
    geonames_response = Typhoeus::Request.get(request_url)
    break unless geonames_response.code == 500
  end

  return geonames_data if geonames_response.code == 500

  parsed_xml = Nokogiri::Slop(geonames_response.body)

  # NOTE(review): only HTTP 500 is treated as a failure; any other response
  # body is assumed to contain <geonames><geoname> entries - confirm.
  geonames = parsed_xml.geonames.geoname
  geonames.each do |geoname|
    hier_geo[geoname.fcode.text.downcase.to_sym] = geoname.toponymName.text
  end

  #FIXME: Code4Lib lazy implementation... will get last result
  geoname = geonames.last
  coords[:latitude] = geoname.lat.text
  coords[:longitude] = geoname.lng.text
  coords[:combined] = coords[:latitude] + ',' + coords[:longitude]
  coords[:box] = {}
  coords[:box][:west] = geoname.bbox.west.text
  coords[:box][:north] = geoname.bbox.north.text
  coords[:box][:east] = geoname.bbox.east.text
  coords[:box][:south] = geoname.bbox.south.text

  geonames_data[:coords] = coords
  geonames_data[:hier_geo] = hier_geo.present? ? hier_geo : nil

  geonames_data
end
|
56
|
+
|
57
|
+
|
58
|
+
# Looks up a GeoNames id for a parsed place hash.
#
# geo_hash - hash that may carry :neighborhood_part, :city_part, :state_part
#            and :country_part (plus whatever
#            Bplgeo::Standardizer.parsed_and_original_check reads).
#            The caller's hash is not modified.
#
# The most specific part (neighborhood, else city) plus state and country are
# sent to the GeoNames search API. When the search has zero results the most
# specific part is dropped and the lookup is retried recursively.
#
# Returns { :id => String, :original_string_differs => true/false } for the
# first exact (else first prefix) toponym match, or nil when no GeoNames
# username is configured, nothing can be searched, or nothing matched.
# Raises RuntimeError if GeoNames answered HTTP 500 on all three attempts.
def self.geonames_id_from_geo_hash(geo_hash)
  return nil if Bplgeo::Geonames.geonames_username == '<username>'
  geo_hash = geo_hash.clone

  max_retry = 3
  sleep_time = 60 # In seconds

  geonames_search_array = []

  #Don't do both neighborhood and city!
  if geo_hash[:neighborhood_part].present?
    geonames_search_array << geo_hash[:neighborhood_part]
  elsif geo_hash[:city_part].present?
    geonames_search_array << geo_hash[:city_part]
  end

  geonames_search_array << geo_hash[:state_part] if geo_hash[:state_part].present?
  geonames_search_array << geo_hash[:country_part] if geo_hash[:country_part].present?

  # Nothing left to search for (also ends the recursion below).
  return nil if geonames_search_array.empty?

  geonames_search_string = geonames_search_array.join(', ')
  match_term = geonames_search_array.first.to_ascii.downcase.strip

  request_url = "http://api.geonames.org/search?username=#{self.geonames_username}&lang=en&style=FULL&q=" + CGI.escape(geonames_search_string)

  geonames_response = nil
  max_retry.times do |attempt|
    sleep(sleep_time) if attempt > 0
    geonames_response = Typhoeus::Request.get(request_url)
    break unless geonames_response.code == 500
  end

  if geonames_response.code == 500
    raise 'Geonames Server appears to not be responding for Geographic query: ' + geonames_search_string
  end

  parsed_xml = Nokogiri::Slop(geonames_response.body)

  #This is ugly and needs to be redone to achieve better recursive...
  if parsed_xml.geonames.totalResultsCount.text == '0'
    if geo_hash[:neighborhood_part].present?
      geo_hash[:neighborhood_part] = nil
      return geonames_id_from_geo_hash(geo_hash)
    elsif geo_hash[:city_part].present?
      geo_hash[:city_part] = nil
      return geonames_id_from_geo_hash(geo_hash)
    end

    return nil
  end

  # NOTE(review): in Slop mode a single <geoname> child may come back as one
  # element rather than a node set - confirm single-result responses iterate
  # as intended.
  candidates = parsed_xml.geonames.geoname

  #Exact Match, then Starts With
  matched = candidates.find { |geoname| geoname.toponymName.text.to_ascii.downcase.strip == match_term }
  matched ||= candidates.find { |geoname| geoname.toponymName.text.to_ascii.downcase.strip.start_with?(match_term) }

  return nil unless matched

  {
    :id => matched.geonameId.text,
    :original_string_differs => Bplgeo::Standardizer.parsed_and_original_check(geo_hash)
  }
end
|
145
|
+
end
|
146
|
+
end
|
@@ -0,0 +1,227 @@
|
|
1
|
+
module Bplgeo
|
2
|
+
class Parser
|
3
|
+
|
4
|
+
# Loads (once) and returns the bplgeo settings for the current Rails
# environment from <root>/config/bplgeo.yml. The file is rendered through
# ERB before being parsed as YAML.
#
# Returns the environment's settings with indifferent (string/symbol) access.
def self.bplgeo_config
  config_root = Rails.root || './test/dummy'
  environment = Rails.env || 'test'
  return @bplgeo_config if @bplgeo_config

  yaml_path = File.join(config_root, 'config', 'bplgeo.yml')
  rendered_yaml = ERB.new(IO.read(yaml_path)).result
  @bplgeo_config = YAML::load(rendered_yaml)[environment].with_indifferent_access
end
|
10
|
+
|
11
|
+
# Returns the configured MapQuest API key, or the placeholder
# '<mapquest_key>' when the configuration does not provide one.
def self.mapquest_key
  configured_key = bplgeo_config[:mapquest_key]
  return configured_key if configured_key

  '<mapquest_key>'
end
|
14
|
+
|
15
|
+
# Returns the configured Bing API key, or the placeholder '<bing_key>'
# when the configuration does not provide one.
def self.bing_key
  configured_key = bplgeo_config[:bing_key]
  return configured_key if configured_key

  '<bing_key>'
end
|
18
|
+
|
19
|
+
# Returns the :timeout entry of the bplgeo configuration (nil when unset).
# It is handed to Geocoder.configure by the parse_*_api methods.
def self.timeout
  settings = bplgeo_config
  settings[:timeout]
end
|
22
|
+
|
23
|
+
#Note: Limited to only looking at United States places...
|
24
|
+
# Geocodes a term through the Bing API (via the Geocoder gem).
# Only United States results are used.
#
# term            - the raw place string.
# parse_term_flag - when true the term is first run through
#                   Bplgeo::Standardizer.parse_for_geographic_term.
#
# Returns {} when no Bing key is configured, the term is blank or contains
# parentheses, the term looks like "place, city, state" without a street
# address, the lookup failed with SocketError three times, or the best
# result is not a non-neighborhood United States place. Otherwise returns a
# hash with :original_term, :standardized_term, :country_part, :state_part,
# :city_part and, when Bing returned an address line, :street_part, :coords
# and :term_differs_from_tgn.
def self.parse_bing_api(term, parse_term_flag=false)
  return_hash = {}
  retry_count = 3

  #Skip if no bing_key... possibly move this elsewhere?
  return return_hash if self.bing_key == '<bing_key>'

  return_hash[:original_term] = term

  term = Bplgeo::Standardizer.parse_for_geographic_term(term) if parse_term_flag
  term = Bplgeo::Standardizer.standardize_geographic_term(term)

  return {} if term.blank?

  return_hash[:standardized_term] = term

  #Bing API does badly with parentheses...
  return {} if term.match(/[\(\)]+/)

  #Sometimes with building, city, state, bing is dumb and will only return state. Example: Boston Harbor, Boston, Mass.
  #So if not a street address, pass to have google handle it for better results...
  #Example of another bad record: South Street bridge, West Bridgewater, Mass. would give a place in Holyoke
  looks_like_street = term =~ /\d/ || term.downcase =~ /(?:ave|avenue|street|st|road|rd)\.*,/
  return {} if term.split(',').length >= 3 && !looks_like_street

  Geocoder.configure(:lookup => :bing,:api_key => self.bing_key,:timeout => self.timeout, :always_raise => :all)

  # Retry only the network call. Retrying the whole method body would reset
  # retry_count on every pass and never give up.
  begin
    bing_api_result = Geocoder.search(term)
  rescue SocketError
    retry_count -= 1
    retry unless retry_count.zero?
    return {}
  end

  #Use only for United States results... international results are inaccurate.
  return {} unless bing_api_result.present?

  best_match = bing_api_result.first.data
  address = best_match["address"]
  return {} unless address["countryRegion"] == 'United States'

  #Doesn't return a city... Google handles this better.
  return {} if best_match["entityType"] == 'Neighborhood'

  if address["addressLine"].present?
    coordinates = best_match["geocodePoints"].first["coordinates"]
    latitude = coordinates.first.to_s
    longitude = coordinates.last.to_s

    return_hash[:term_differs_from_tgn] = true
    return_hash[:street_part] = address["addressLine"]
    return_hash[:coords] = {:latitude=>latitude,
                            :longitude=>longitude,
                            :combined=>latitude + ',' + longitude}
  end

  return_hash[:country_part] = address["countryRegion"]
  return_hash[:state_part] = Bplgeo::Constants::STATE_ABBR[address["adminDistrict"]]
  return_hash[:city_part] = address["locality"]

  return_hash
end
|
92
|
+
|
93
|
+
#Mapquest allows unlimited requests - start here?
|
94
|
+
# Geocodes a term through the MapQuest API (via the Geocoder gem).
# Only United States results are returned.
#
# term            - the raw place string.
# parse_term_flag - when true the term is first run through
#                   Bplgeo::Standardizer.parse_for_geographic_term.
#
# Returns {} when no MapQuest key is configured, the term is blank, the term
# is on the known-bad list, the term looks like "place, city, state" without
# a street address, the lookup failed with SocketError three times, the best
# result is not in the United States, or a multi-part term came back without
# a city. Otherwise returns a hash with :original_term, :standardized_term,
# :country_part, :state_part, :city_part and, when MapQuest returned a
# street, :street_part and :coords.
def self.parse_mapquest_api(term, parse_term_flag=false)
  return_hash = {}
  retry_count = 3

  #Skip if no mapquest_key... possibly move this elsewhere?
  return return_hash if self.mapquest_key == '<mapquest_key>'

  return_hash[:original_term] = term

  term = Bplgeo::Standardizer.parse_for_geographic_term(term) if parse_term_flag
  term = Bplgeo::Standardizer.standardize_geographic_term(term)

  return {} if term.blank?

  return_hash[:standardized_term] = term

  #Mapquest returns bad data for: Manchester, Mass.
  return {} if term.include?('Manchester') || term.include?('Atlanta, MI')

  #Messed up with just neighborhoods. Example: Hyde Park (Boston, Mass.) or Hyde Park (Boston, Mass.)
  #So if not a street address, pass to have google handle it for better results...
  looks_like_street = term =~ /\d/ || term.downcase =~ /(?:ave|avenue|street|st|road|rd)\.*,/
  return {} if term.split(',').length >= 3 && !looks_like_street

  Geocoder.configure(:lookup => :mapquest,:api_key => self.mapquest_key,:timeout => self.timeout, :always_raise => :all)

  # Retry only the network call. Retrying the whole method body would reset
  # retry_count on every pass and never give up.
  begin
    mapquest_api_result = Geocoder.search(term)
  rescue SocketError
    retry_count -= 1
    retry unless retry_count.zero?
    return {}
  end

  #If this call returned a result...
  return {} unless mapquest_api_result.present?

  best_match = mapquest_api_result.first.data

  if best_match["street"].present?
    latitude = best_match['latLng']['lat'].to_s
    longitude = best_match['latLng']['lng'].to_s

    #return_hash[:term_differs_from_tgn] = true
    return_hash[:street_part] = best_match["street"]
    return_hash[:coords] = {:latitude=>latitude,
                            :longitude=>longitude,
                            :combined=>latitude + ',' + longitude}
  end

  # NOTE(review): presumably adminArea1 is a country code understood by the
  # countries gem - confirm. A failed lookup is treated as "not United States".
  country = Country.new(best_match["adminArea1"])
  return_hash[:country_part] = country ? country.name : nil

  #Only return if USA for now. Google is better with stuff like: 'Long Binh, Vietnam'
  return {} unless return_hash[:country_part] == 'United States'

  return_hash[:state_part] = Bplgeo::Constants::STATE_ABBR[best_match["adminArea3"]] || best_match["adminArea4"]

  #Return "New York City" as "New York"...
  city = best_match["adminArea5"]
  return_hash[:city_part] = city ? city.gsub(' City', '') : city

  #Also only return if there is a city if there were more than two terms passed in. Fixes: Roxbury, MA
  return {} if term.split(',').length >= 2 && return_hash[:city_part].blank?

  return_hash
end
|
162
|
+
|
163
|
+
#Final fallback is google API. The best but we are limited to 2500 requests per day unless we pay the $10k a year premium account...
|
164
|
+
#Note: If google cannot find street, it will return just city/state, like for "Salem Street and Paradise Road, Swampscott, MA, 01907"
|
165
|
+
#Seems like it sets a partial_match=>true in the data section...
|
166
|
+
# Geocodes a term through the Google API (via the Geocoder gem).
#
# term            - the raw place string.
# parse_term_flag - when true the term is first run through
#                   Bplgeo::Standardizer.parse_for_geographic_term.
#
# When Google reports only a partial match for a multi-part term that does
# not look like a street address, the first comma-separated part is dropped
# and the search is repeated.
#
# Returns {} when the term is blank or 'Soviet Union', or when a lookup
# failed with SocketError three times. Otherwise returns a hash with
# :original_term and :standardized_term plus whichever of :country_part,
# :state_part, :city_part, :neighborhood_part, :street_part, :coords and
# :term_differs_from_tgn the result's address components provide.
def self.parse_google_api(term, parse_term_flag=false)
  return_hash = {}

  return_hash[:original_term] = term

  term = Bplgeo::Standardizer.parse_for_geographic_term(term) if parse_term_flag
  term = Bplgeo::Standardizer.standardize_geographic_term(term)

  #Soviet Union returns back a place in Kazakhstan
  return {} if term.blank? || term == 'Soviet Union'

  return_hash[:standardized_term] = term

  ::Geocoder.configure(:lookup => :google,:api_key => nil,:timeout => self.timeout, :always_raise => :all)

  # Runs one search, retrying on SocketError up to three attempts in total.
  # Returns nil when every attempt failed. Only the network call is retried,
  # so the attempt counter is never reset.
  search_with_retry = lambda do |query|
    attempts_left = 3
    begin
      ::Geocoder.search(query)
    rescue SocketError
      attempts_left -= 1
      retry unless attempts_left.zero?
      nil
    end
  end

  google_api_result = search_with_retry.call(term)
  return {} if google_api_result.nil?

  #Check if only a partial match. To avoid errors, strip out the first part and try again...
  #Need better way to check for street endings. See: http://pe.usps.gov/text/pub28/28apc_002.htm
  if google_api_result.present?
    lowered_term = term.downcase
    street_markers = ['street', 'st.', 'avenue', 'ave.', 'court', 'dr.']
    has_street_marker = street_markers.any? { |marker| lowered_term.include?(marker) }

    if google_api_result.first.data['partial_match'] && term.split(',').length > 1 && !has_street_marker
      term = term.split(',').drop(1).join(',').strip
      google_api_result = search_with_retry.call(term)
      return {} if google_api_result.nil?
    end
  end

  if google_api_result.present?
    best_match = google_api_result.first.data

    #Types: street_number, route, neighborhood, establishment, transit_station, bus_station
    # ('street_number' is Google's spelling; 'street number' never matched.)
    street_level_types = ['street_number', 'route', 'neighborhood', 'establishment', 'transit_station', 'bus_station']

    best_match["address_components"].each do |component|
      types = component['types']

      if (types & street_level_types).present?
        latitude = best_match['geometry']['location']['lat'].to_s
        longitude = best_match['geometry']['location']['lng'].to_s

        #return_hash[:term_differs_from_tgn] = true
        #TODO: Not implemented for Google results right now.
        return_hash[:street_part] = 'TODO: Not Implemented for Google Results'
        return_hash[:coords] = {:latitude=>latitude,
                                :longitude=>longitude,
                                :combined=>latitude + ',' + longitude}
      elsif (types & ['country']).present?
        return_hash[:country_part] = component['long_name']
      elsif (types & ['administrative_area_level_1']).present?
        return_hash[:state_part] = component['long_name'].to_ascii
      elsif (types & ['locality']).present?
        return_hash[:city_part] = component['long_name']
      elsif (types & ['sublocality', 'political']).length == 2
        return_hash[:neighborhood_part] = component['long_name']
      end
    end

    return_hash[:term_differs_from_tgn] ||= best_match['partial_match'] unless best_match['partial_match'].blank?
  end

  return_hash
end
|
226
|
+
end
|
227
|
+
end
|
@@ -0,0 +1,213 @@
|
|
1
|
+
module Bplgeo
|
2
|
+
class Standardizer
|
3
|
+
|
4
|
+
#Take a subject string and look for potential geographic terms.
|
5
|
+
# Takes a subject string and looks for a potential geographic term in it,
# using US state names and abbreviations (from the countries gem) as the
# signal that a segment is geographic.
#
# term - subject string, e.g. "Massachusetts--Boston--History",
#        "Stores (retail trade) - Palmer, Mass" or "Boston, MA".
#
# Returns the extracted geographic string, or '' when term is nil, longer
# than 125 characters, or mentions no US state.
def self.parse_for_geographic_term(term)
  #Likely too long to be an address... some fields have junk with an address string...
  return '' if term.nil? || term.length > 125

  state_abbr_list = ['Mass']
  state_name_list = []

  #Countries gem of https://github.com/hexorx/countries
  Country.new('US').states.each do |state_abbr, state_names|
    # Leading space so an abbreviation only matches at the start of a word.
    state_abbr_list << ' ' + state_abbr
    state_name_list << state_names["name"]
  end

  names_state = lambda { |text| state_name_list.any? { |state| text.include?(state) } }
  abbreviates_state = lambda { |text| state_abbr_list.any? { |abbr| text.include?(abbr) } }

  geo_term = ''

  if term.include?('--')
    #Parsing a subject geographic term.
    segments = term.split('--')
    segments.each_with_index do |segment, index|
      if names_state.call(segment)
        # Everything from the state segment onward, most specific part first.
        geo_term = segments[index..-1].reverse.join(',')
      elsif abbreviates_state.call(segment)
        geo_term = segment
      end
    end
  elsif term.include?(' - ')
    #Experimental... example: Palmer (Mass) - history or Stores (retail trade) - Palmer, Mass
    term.split(' - ').each do |segment|
      geo_term = segment if names_state.call(segment) || abbreviates_state.call(segment)
    end
  elsif names_state.call(term) || abbreviates_state.call(term)
    geo_term = term
  end

  geo_term
end
|
48
|
+
|
49
|
+
#Make a string in a standard format.
|
50
|
+
# Makes a geographic string into a standard format: junk terms removed,
# leading periods/commas stripped, semicolons turned into commas and a
# trailing " (qualifier)" turned into ", qualifier".
#
# geo_term - the string to clean; it is never modified (frozen strings are fine).
#
# Returns the cleaned string, or nil when geo_term is nil or contains
# parentheses in a form other than " (...)".
def self.standardize_geographic_term(geo_term)
  return nil if geo_term.nil?

  #Remove common junk terms (non-destructively, so the original is untouched)
  cleaned = Bplgeo::Constants::JUNK_TERMS.inject(geo_term) { |text, junk| text.gsub(junk, '') }

  #Strip any leading periods or commas from junk terms
  cleaned = cleaned.gsub(/^[\.,]+/, '').strip

  #Replace any semicolons with commas... possible strip them?
  cleaned = cleaned.gsub(';', ',')

  #Terms in parentheses will cause some geographic parsers to freak out. Switch to commas instead.
  if cleaned =~ /[\(\)]+/
    #Skip anything that is not like "word (word)": data returned likely will be unreliable for now... FIXME when use case occurs.
    return nil unless cleaned =~ / \(+.*\)+/

    #Make this replacement better?
    cleaned = cleaned.gsub(/ *\((?=[\S ]+\))/, ', ').gsub(')', '')
  end

  cleaned
end
|
79
|
+
|
80
|
+
#Attempt to dedup a list of geographic areas.
|
81
|
+
#FIXME: Horrendous first pass.
|
82
|
+
#Aggresive flag removes less specific matches. IE. ['Hanoi, Vietnam' and 'Vietnam'] would return just ['Hanoi, Vietnam']
|
83
|
+
# Attempts to dedup a list of geographic strings.
#
# Each entry is reduced to bare words (punctuation and junk terms removed).
# Every entry is then compared with the entries after it. When all of an
# entry's words occur in a later entry of equal word count (or, with
# aggresive, of smaller word count), one of the pair is dropped: the later
# entry wins if it has more comma-separated parts, or is more than one
# character longer and has no '('; otherwise the later entry is dropped.
#
# geo_list  - array of place strings; the caller's array is not modified.
# aggresive - also allow the smaller-word-count comparison described above.
#
# Returns a new array with the dropped entries removed.
def self.dedup_geo(geo_list, aggresive=false)
  geo_list = geo_list.clone

  # Word lists of the punctuation-free, junk-free form of every entry.
  word_lists = geo_list.map do |entry|
    bare = entry.delete('().,;')
    #Remove common junk terms
    Bplgeo::Constants::JUNK_TERMS.each { |junk| bare.gsub!(junk, '') }
    bare.squish.split(' ')
  end

  doomed_indexes = []

  word_lists.each_with_index do |words, index|
    shared_word_counts = []
    best_term = geo_list[index]
    best_index = index

    # For each later entry, count how many of this entry's words it contains.
    words.each do |word|
      ((index + 1)...word_lists.size).each do |later_index|
        next unless word_lists[later_index].include?(word)
        shared_word_counts[later_index] = (shared_word_counts[later_index] || 0) + 1
      end
    end

    shared_word_counts.each_with_index do |shared_count, later_index|
      next unless shared_count == words.size

      later_size = word_lists[later_index].size
      # NOTE(review): a later entry can only contain all of these words yet
      # have fewer words when this entry repeats a word - confirm the
      # intended direction of the aggresive comparison.
      next unless (later_size < words.size && aggresive) || later_size == words.size

      later_term = geo_list[later_index]
      later_is_better = best_term.split(',').size < later_term.split(',').size ||
                        (best_term.size + 1 < later_term.size && !later_term.include?('('))

      if later_is_better
        best_term = later_term
        doomed_indexes << best_index
        best_index = later_index
      else
        doomed_indexes << later_index
      end
    end
  end

  doomed_indexes.each { |doomed| geo_list[doomed] = nil }

  geo_list.compact
end
|
135
|
+
|
136
|
+
# Decides whether the original string carries information that the parsed
# parts do not.
#
# geo_hash - parsed place hash; reads :standardized_term, :street_part,
#            :coords, :neighborhood_part, :city_part, :state_part and
#            :country_part. A missing :standardized_term is treated as ''.
#
# Returns true when a street or coordinates were parsed, when the term has
# more comma-separated parts than the parsed parts account for, when it
# contains a digit, or when a non-US result's city/state does not occur in
# the term; false otherwise.
def self.parsed_and_original_check(geo_hash)
  term = geo_hash[:standardized_term].to_s

  return true if geo_hash[:street_part].present? || geo_hash[:coords].present?

  #Keep original string if three parts at least or if there is a number in the term.
  #TODO: Make this better!
  part_count = term.split(',').length
  return true if part_count >= 4 || term =~ /\d/
  return true if part_count >= 3 && geo_hash[:neighborhood_part].blank?
  return true if part_count >= 2 && geo_hash[:city_part].blank?

  return false if geo_hash[:country_part] == 'United States'

  city = geo_hash[:city_part]
  state = geo_hash[:state_part]

  #Currently do nothing when neither a city nor a state was parsed
  return false if city.blank? && state.blank?

  ascii_term = term.to_ascii.downcase
  city_in_term = city.present? && ascii_term.include?(city.to_ascii.downcase)
  state_in_term = state.present? && ascii_term.include?(state.to_ascii.downcase)

  !(city_in_term || state_in_term)
end
|
160
|
+
|
161
|
+
|
162
|
+
|
163
|
+
#Take LCSH subjects and make them standard.
|
164
|
+
# Takes an LCSH subject string and makes it standard: drops a trailing
# period (unless it follows a capital letter, i.e. an initial, or ends
# "etc."), normalises every dash variant to '--' with no surrounding
# whitespace, capitalises the first character and finally passes the result
# through strip_value.
#
# value - the subject string; may be nil, empty or very short.
#
# Returns the result of strip_value for the normalised string, or nil when
# value is nil.
def self.LCSHize(value)
  return nil if value.nil?

  #Remove ending periods ... except when an initial or etc.
  # (length guard: a lone "." has no preceding character to inspect)
  if value.length > 1 && value.end_with?('.') && value[-2] !~ /[A-Z]/ && !value.end_with?('etc.')
    value = value[0..-2]
  end

  #Fix when '- -' occurs
  value = value.gsub(/-\s-/,'--')

  #Fix for em dashes
  value = value.gsub('—','--')

  #Fix for en dashes
  value = value.gsub('–','--')

  #Fix for ' - ' combinations
  value = value.gsub(' - ','--')

  #Remove white space after and before '--'
  value = value.gsub(/\s+--/,'--')
  value = value.gsub(/--\s+/,'--')

  #Ensure first word is capitalized (nothing to capitalize in an empty string)
  value = value[0].capitalize[0] + value[1..-1] unless value.empty?

  #Strip any white space
  strip_value(value)
end
|
194
|
+
|
195
|
+
# Normalises a raw value to a clean string.
#
# value - a String, a number, or nil.
#
# Returns nil when value is blank. Numbers are truncated to an integer and
# rendered as a string; the result is then passed through utf8Encode.
def self.strip_value(value)
  return nil if value.blank?

  # Integer covers the legacy Fixnum/Bignum classes, which no longer exist
  # as constants on current Rubies.
  value = value.to_i.to_s if value.is_a?(Float) || value.is_a?(Integer)

  # Make sure it is all UTF-8 and not character encodings or HTML tags and remove any carriage returns
  utf8Encode(value)
end
|
207
|
+
|
208
|
+
#TODO: Better name for this. Should be part of an overall helped gem.
|
209
|
+
#TODO: Better name for this. Should be part of an overall helper gem.
# Converts value to a string, replaces tabs (with any preceding line break),
# line breaks and <br/> tags with single spaces, runs the text through the
# ActionView full sanitizer, decodes HTML entities and strips the ends.
def self.utf8Encode(value)
  flattened = value.to_s.gsub(/\r?\n?\t/, ' ')
  flattened = flattened.gsub(/\r?\n/, ' ')
  flattened = flattened.gsub(/<br[\s]*\/>/,' ')

  sanitized = ActionView::Base.full_sanitizer.sanitize(flattened)
  HTMLEntities.new.decode(sanitized).strip
end
|
212
|
+
end
|
213
|
+
end
|