geomash 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Rakefile +32 -0
- data/config/geomash.yml.sample +24 -0
- data/lib/geomash.rb +86 -0
- data/lib/geomash/constants.rb +478 -0
- data/lib/geomash/geonames.rb +145 -0
- data/lib/geomash/parser.rb +220 -0
- data/lib/geomash/standardizer.rb +250 -0
- data/lib/geomash/tgn.rb +673 -0
- data/lib/geomash/town_lookup.rb +19 -0
- data/lib/geomash/version.rb +3 -0
- data/test/geomash_test.rb +146 -0
- data/test/geonames_test.rb +24 -0
- data/test/parser_test.rb +33 -0
- data/test/standardizer_test.rb +26 -0
- data/test/test_helper.rb +16 -0
- data/test/tgn_test.rb +19 -0
- data/test/town_lookup_test.rb +11 -0
- metadata +210 -0
@@ -0,0 +1,145 @@
|
|
1
|
+
module Geomash
|
2
|
+
class Geonames
|
3
|
+
|
4
|
+
# Returns the GeoNames account name read from the Geomash configuration, or
# the '<username>' placeholder when that setting is nil or false.
def self.geonames_username
  configured_name = Geomash.config[:geonames_username]
  return '<username>' unless configured_name

  configured_name
end
|
7
|
+
|
8
|
+
# Looks up the GeoNames hierarchy for a place and summarises it.
#
# geoname_id - the GeoNames identifier. A String or an Integer is accepted;
#              it is interpolated into the request URL via to_s.
#
# Returns a Hash. When the service answers with anything other than HTTP 500:
#   :coords   - :latitude, :longitude and :combined ("lat,lng") strings taken
#               from the last <geoname> entry, plus :box (west/north/east/south
#               strings, or {} when the bounding box cannot be read).
#   :hier_geo - Hash of downcased feature-code symbol => toponym name for every
#               <geoname> entry, or nil when that hash is empty.
# Returns {} when the service still answers HTTP 500 after three attempts
# (with a 60 second pause before each repeat attempt).
def self.get_geonames_data(geoname_id)
  max_retry = 3
  sleep_time = 60 # In seconds
  retry_count = 0
  geonames_response = nil

  hier_geo = {}
  coords = {}
  geonames_data = {}

  # Interpolation instead of String#+ so an Integer id does not raise TypeError.
  request_url = "http://api.geonames.org/hierarchy?username=#{self.geonames_username}&lang=en&style=FULL&geonameId=#{geoname_id}"

  loop do
    sleep(sleep_time) if retry_count > 0
    retry_count += 1

    geonames_response = Typhoeus::Request.get(request_url)
    break if geonames_response.code != 500 || retry_count == max_retry
  end

  return geonames_data if geonames_response.code == 500

  parsed_xml = Nokogiri::Slop(geonames_response.body)
  hierarchy = parsed_xml.geonames.geoname

  hierarchy.each do |entry|
    hier_geo[entry.fcode.text.downcase.to_sym] = entry.toponymName.text
  end

  #FIXME: Code4Lib lazy implementation... will get last result
  geoname = hierarchy.last
  coords[:latitude] = geoname.lat.text
  coords[:longitude] = geoname.lng.text
  coords[:combined] = coords[:latitude] + ',' + coords[:longitude]

  # Best effort: any failure while reading the bounding box leaves :box empty.
  begin
    coords[:box] = {}
    coords[:box][:west] = geoname.bbox.west.text
    coords[:box][:north] = geoname.bbox.north.text
    coords[:box][:east] = geoname.bbox.east.text
    coords[:box][:south] = geoname.bbox.south.text
  rescue
    coords[:box] = {}
  end

  geonames_data[:coords] = coords
  geonames_data[:hier_geo] = hier_geo.present? ? hier_geo : nil

  geonames_data
end
|
56
|
+
|
57
|
+
|
58
|
+
# Searches GeoNames for the place described by a parsed geographic hash.
#
# geo_hash - Hash with optional :neighborhood_part, :city_part, :state_part
#            and :country_part entries. It is cloned, never modified.
#
# The most specific part (neighborhood, else city) is combined with state and
# country into the query; the first search element is also sent as the exact
# name (name_equals). When the search reports zero results, the neighborhood
# (or, failing that, the city) is dropped and the search is repeated.
#
# Returns a Hash with :id and :rdf (when a match element was found) and
# :original_string_differs, or nil when no GeoNames username is configured,
# no place part is present, or nothing was found.
# Raises RuntimeError when GeoNames keeps answering HTTP 500 after three tries.
def self.geonames_id_from_geo_hash(geo_hash)
  username = Geomash::Geonames.geonames_username
  return nil if username == '<username>'
  geo_hash = geo_hash.clone

  max_retry = 3
  sleep_time = 60 # In seconds
  retry_count = 0
  geonames_response = nil

  return_hash = {}

  #Don't do both neighborhood and city!
  geonames_search_array = []
  if geo_hash[:neighborhood_part].present?
    geonames_search_array << geo_hash[:neighborhood_part]
  elsif geo_hash[:city_part].present?
    geonames_search_array << geo_hash[:city_part]
  end
  geonames_search_array << geo_hash[:state_part] if geo_hash[:state_part].present?
  geonames_search_array << geo_hash[:country_part] if geo_hash[:country_part].present?

  # Without any place part there is nothing to search for.
  return nil if geonames_search_array.empty?

  geonames_search_string = geonames_search_array.join(', ')
  exact_name_term = geonames_search_array.first.strip

  request_url = "http://api.geonames.org/search?username=#{username}&lang=en&style=FULL&q=#{CGI.escape(geonames_search_string)}&name_equals=#{CGI.escape(exact_name_term)}"

  # Only restrict by country when the name resolves to a known country;
  # otherwise the country filter is left off instead of crashing on nil.
  country = geo_hash[:country_part].present? ? Country.find_country_by_name(geo_hash[:country_part]) : nil
  request_url += "&country=#{country.alpha2}" if country

  loop do
    sleep(sleep_time) if retry_count > 0
    retry_count += 1

    geonames_response = Typhoeus::Request.get(request_url)
    break if geonames_response.code != 500 || retry_count == max_retry
  end

  if geonames_response.code == 500
    raise 'Geonames Server appears to not be responding for Geographic query: ' + geonames_search_string
  end

  parsed_xml = Nokogiri::Slop(geonames_response.body)

  # NOTE: the rescue below swallows the raise as well, so this check currently
  # has no observable effect. Kept as-is pending the Slop removal.
  begin
    raise "geonames status error message of: #{parsed_xml.to_s}" if parsed_xml.geonames.status
  rescue
    #Do nothing but FIXME to not use slop
  end

  #This is ugly and needs to be redone to achieve better recursive...
  if parsed_xml.geonames.totalResultsCount.text == '0'
    if geo_hash[:neighborhood_part].present?
      geo_hash_temp = geo_hash.clone
      geo_hash_temp[:neighborhood_part] = nil
      return_hash = geonames_id_from_geo_hash(geo_hash_temp)
      return return_hash if return_hash.present?
    elsif geo_hash[:city_part].present?
      geo_hash_temp = geo_hash.clone
      geo_hash_temp[:city_part] = nil
      return_hash = geonames_id_from_geo_hash(geo_hash_temp)
      return return_hash if return_hash.present?
    end

    return nil
  end

  #Exact Match ... FIXME to not use Slop
  # Slop yields a single Element for one match and a NodeSet for several.
  matches = parsed_xml.geonames.geoname
  if matches.class == Nokogiri::XML::Element
    return_hash[:id] = matches.geonameId.text
  elsif matches.class == Nokogiri::XML::NodeSet
    return_hash[:id] = matches.first.geonameId.text
  end
  return_hash[:rdf] = "http://sws.geonames.org/#{return_hash[:id]}/about.rdf" if return_hash.key?(:id)

  return_hash[:original_string_differs] = Geomash::Standardizer.parsed_and_original_check(geo_hash)

  return_hash
end
|
144
|
+
end
|
145
|
+
end
|
@@ -0,0 +1,220 @@
|
|
1
|
+
module Geomash
|
2
|
+
class Parser
|
3
|
+
|
4
|
+
# Returns the MapQuest API key from the Geomash configuration, or the
# '<mapquest_key>' placeholder when that setting is nil or false.
def self.mapquest_key
  configured_key = Geomash.config[:mapquest_key]
  return '<mapquest_key>' unless configured_key

  configured_key
end
|
7
|
+
|
8
|
+
# Returns the Bing API key from the Geomash configuration, or the
# '<bing_key>' placeholder when that setting is nil or false.
def self.bing_key
  configured_key = Geomash.config[:bing_key]
  return '<bing_key>' unless configured_key

  configured_key
end
|
11
|
+
|
12
|
+
# Returns the :timeout entry of the Geomash configuration (nil when unset).
def self.timeout
  settings = Geomash.config
  settings[:timeout]
end
|
15
|
+
|
16
|
+
#Note: Limited to only looking at United States places...
|
17
|
+
# Geocodes a term through the Bing lookup of the Geocoder gem.
#
# term            - the geographic string to resolve.
# parse_term_flag - when true, the term is first passed through
#                   Standardizer.parse_for_geographic_term.
#
# Returns a Hash with :original_term, :standardized_term, :country_part,
# :state_part and :city_part, plus :term_differs_from_tgn, :street_part and
# :coords when Bing reports an address line. Returns {} when: no Bing key is
# configured, the standardized term is blank or contains parentheses, the term
# has three or more comma parts and does not look like a street address, the
# lookup raises SocketError three times, there is no result, the first result
# is outside the United States, or it is a 'Neighborhood' entity.
# Other lookup errors propagate (Geocoder is configured with always_raise).
def self.parse_bing_api(term, parse_term_flag=false)
  return_hash = {}

  #Skip if no bing_key... possibly move this elsewhere?
  return return_hash if self.bing_key == '<bing_key>'

  return_hash[:original_term] = term

  term = Geomash::Standardizer.parse_for_geographic_term(term) if parse_term_flag
  term = Geomash::Standardizer.standardize_geographic_term(term)

  return {} if term.blank?

  return_hash[:standardized_term] = term

  #Bing API does badly with parentheses...
  return {} if term.match(/[\(\)]+/)

  #Sometimes with building, city, state, bing is dumb and will only return state. Example: Boston Harbor, Boston, Mass.
  #So if not a street address, pass to have google handle it for better results...
  #Example of another bad record: South Street bridge, West Bridgewater, Mass. would give a place in Holyoke
  looks_like_street = term =~ /\d/ || term.downcase =~ /(?:ave|avenue|street|st|road|rd)\.*,/
  return {} if term.split(',').length >= 3 && !looks_like_street

  Geocoder.configure(:lookup => :bing,:api_key => self.bing_key,:timeout => self.timeout, :always_raise => :all)

  # Only the network call is retried. A method-level rescue/retry would re-run
  # the whole body, resetting the counter on every pass and never giving up.
  attempts_left = 3
  begin
    bing_api_result = Geocoder.search(term)
  rescue SocketError
    attempts_left -= 1
    retry if attempts_left > 0
    return {}
  end

  return {} unless bing_api_result.present?

  data = bing_api_result.first.data
  address = data["address"]

  #Use only for United States results... international results are inaccurate.
  return {} unless address["countryRegion"] == 'United States'

  #Doesn't return a city... Google handles this better.
  return {} if data["entityType"] == 'Neighborhood'

  if address["addressLine"].present?
    coordinates = data["geocodePoints"].first["coordinates"]
    latitude = coordinates.first.to_s
    longitude = coordinates.last.to_s

    return_hash[:term_differs_from_tgn] = true
    return_hash[:street_part] = address["addressLine"]
    return_hash[:coords] = {:latitude=>latitude,
                            :longitude=>longitude,
                            :combined=>latitude + ',' + longitude}
  end

  return_hash[:country_part] = address["countryRegion"]
  return_hash[:state_part] = Geomash::Constants::STATE_ABBR[address["adminDistrict"]]
  return_hash[:city_part] = address["locality"]

  return_hash
end
|
85
|
+
|
86
|
+
#Mapquest allows unlimited requests - start here?
|
87
|
+
# Geocodes a term through the MapQuest lookup of the Geocoder gem.
#
# term            - the geographic string to resolve.
# parse_term_flag - when true, the term is first passed through
#                   Standardizer.parse_for_geographic_term.
#
# Returns a Hash with :original_term, :standardized_term, :country_part,
# :state_part and :city_part, plus :street_part and :coords when MapQuest
# reports a street. Returns {} when: no MapQuest key is configured, the
# standardized term is blank, the term mentions 'Manchester' or 'Atlanta, MI',
# the term has three or more comma parts and does not look like a street
# address, the lookup raises SocketError three times, there is no result, the
# result is not in the United States, or a multi-part term yields no city.
# Other lookup errors propagate (Geocoder is configured with always_raise).
def self.parse_mapquest_api(term, parse_term_flag=false)
  return_hash = {}

  #Skip if no mapquest_key... possibly move this elsewhere?
  return return_hash if self.mapquest_key == '<mapquest_key>'

  return_hash[:original_term] = term

  term = Geomash::Standardizer.parse_for_geographic_term(term) if parse_term_flag
  term = Geomash::Standardizer.standardize_geographic_term(term)

  return {} if term.blank?

  return_hash[:standardized_term] = term

  #Mapquest returns bad data for: Manchester, Mass.
  return {} if term.include?('Manchester') || term.include?('Atlanta, MI')

  #Messed up with just neighborhoods. Example: Hyde Park (Boston, Mass.) or Hyde Park (Boston, Mass.)
  #So if not a street address, pass to have google handle it for better results...
  looks_like_street = term =~ /\d/ || term.downcase =~ /(?:ave|avenue|street|st|road|rd)\.*,/
  return {} if term.split(',').length >= 3 && !looks_like_street

  Geocoder.configure(:lookup => :mapquest,:api_key => self.mapquest_key,:timeout => self.timeout, :always_raise => :all)

  # Only the network call is retried. A method-level rescue/retry would re-run
  # the whole body, resetting the counter on every pass and never giving up.
  attempts_left = 3
  begin
    mapquest_api_result = Geocoder.search(term)
  rescue SocketError
    attempts_left -= 1
    retry if attempts_left > 0
    return {}
  end

  #If this call returned a result...
  return {} unless mapquest_api_result.present?

  data = mapquest_api_result.first.data

  if data["street"].present?
    latitude = data['latLng']['lat'].to_s
    longitude = data['latLng']['lng'].to_s

    #return_hash[:term_differs_from_tgn] = true
    return_hash[:street_part] = data["street"]
    return_hash[:coords] = {:latitude=>latitude,
                            :longitude=>longitude,
                            :combined=>latitude + ',' + longitude}
  end

  #Only return if USA for now. Google is better with stuff like: 'Long Binh, Vietnam'
  # Checked before touching state/city so missing admin areas on a foreign (or
  # unrecognised) result cannot raise on nil.
  country = Country.new(data["adminArea1"])
  return {} unless country && country.name == 'United States'
  return_hash[:country_part] = country.name

  return_hash[:state_part] = Geomash::Constants::STATE_ABBR[data["adminArea3"]] || data["adminArea4"]

  #Return New York as New York City...
  city = data["adminArea5"]
  return_hash[:city_part] = city && city.gsub(' City', '')

  #Also only return if there is a city if there were more than two terms passed in. Fixes: Roxbury, MA
  return {} if term.split(',').length >= 2 && return_hash[:city_part].blank?

  return_hash
end
|
155
|
+
|
156
|
+
#Final fallback is google API. The best but we are limited to 2500 requests per day unless we pay the $10k a year premium account...
|
157
|
+
#Note: If google cannot find street, it will return just city/state, like for "Salem Street and Paradise Road, Swampscott, MA, 01907"
|
158
|
+
#Seems like it sets a partial_match=>true in the data section...
|
159
|
+
# Geocodes a term through the Google lookup of the Geocoder gem.
#
# term            - the geographic string to resolve.
# parse_term_flag - when true, the term is first passed through
#                   Standardizer.parse_for_geographic_term.
#
# When the first result is flagged partial_match, the term has more than one
# comma part and it contains none of the street markers checked below, the
# leading comma part is dropped and the search is repeated once.
#
# Returns a Hash with :original_term and :standardized_term plus whichever of
# :coords, :country_part, :state_part, :city_part, :neighborhood_part and
# :term_differs_from_tgn the first result provides. Returns {} when the
# standardized term is blank or 'Soviet Union', or when the initial lookup
# raises SocketError three times. Other lookup errors propagate.
def self.parse_google_api(term, parse_term_flag=false)
  return_hash = {}

  return_hash[:original_term] = term

  term = Geomash::Standardizer.parse_for_geographic_term(term) if parse_term_flag
  term = Geomash::Standardizer.standardize_geographic_term(term)

  #Soviet Union returns back a place in Kazakhstan
  return {} if term.blank? || term == 'Soviet Union'

  return_hash[:standardized_term] = term

  ::Geocoder.configure(:lookup => :google,:api_key => nil,:timeout => self.timeout, :always_raise => :all)

  # Only the network call is retried. A method-level rescue/retry would re-run
  # the whole body, resetting the counter on every pass and never giving up.
  attempts_left = 3
  begin
    google_api_result = ::Geocoder.search(term)
  rescue SocketError
    attempts_left -= 1
    retry if attempts_left > 0
    return {}
  end

  #Check if only a partial match. To avoid errors, strip out the first part and try again...
  #Need better way to check for street endings. See: http://pe.usps.gov/text/pub28/28apc_002.htm
  if google_api_result.present?
    lowered = term.downcase
    street_like = ['street', 'st.', 'avenue', 'ave.', 'court', 'dr.'].any? { |marker| lowered.include?(marker) }

    if google_api_result.first.data['partial_match'] && term.split(',').length > 1 && !street_like
      term = term.split(',').drop(1).join(',').strip
      google_api_result = ::Geocoder.search(term)
    end
  end

  if google_api_result.present?
    top_result = google_api_result.first.data

    #Types: street_number, route, neighborhood, establishment, transit_station, bus_station
    top_result["address_components"].each do |result|
      types = result['types']

      # Google spells the house-number type 'street_number' (with an underscore).
      if (types & ['street_number', 'route', 'establishment', 'transit_station', 'bus_station']).present? || (types.include?('neighborhood') && !types.include?('political'))
        #return_hash[:term_differs_from_tgn] = true
        #TODO: Not implemented for Google results right now.
        #return_hash[:street_part] = 'TODO: Not Implemented for Google Results'
        location = top_result['geometry']['location']
        latitude = location['lat'].to_s
        longitude = location['lng'].to_s
        return_hash[:coords] = {:latitude=>latitude,
                                :longitude=>longitude,
                                :combined=>latitude + ',' + longitude}
      elsif types.include?('country')
        return_hash[:country_part] = result['long_name']
      elsif types.include?('administrative_area_level_1')
        return_hash[:state_part] = result['long_name'].to_ascii
      elsif types.include?('locality')
        return_hash[:city_part] = result['long_name']
      elsif (types & ['sublocality', 'political']).length == 2 || types.include?('neighborhood')
        return_hash[:neighborhood_part] = result['long_name']
      end
    end

    partial_match = top_result['partial_match']
    return_hash[:term_differs_from_tgn] ||= partial_match unless partial_match.blank?
  end

  return_hash
end
|
219
|
+
end
|
220
|
+
end
|
@@ -0,0 +1,250 @@
|
|
1
|
+
# -*- coding: utf-8 -*-
|
2
|
+
module Geomash
|
3
|
+
class Standardizer
|
4
|
+
|
5
|
+
#Take a subject string and look for potential geographic terms.
|
6
|
+
# Picks the portion of a subject string that looks geographic.
#
# term - the subject string to inspect.
#
# A piece counts as geographic when it contains a US state name, a country
# name (both from the Countries gem) or a state abbreviation ('Mass', or a
# space followed by a US state code). Terms longer than 125 characters are
# rejected outright.
#
# Returns the geographic part as a String, or '' when nothing matched.
def self.parse_for_geographic_term(term)
  #Likely too long to be an address... some fields have junk with an address string...
  return '' if term.length > 125

  #Countries gem of https://github.com/hexorx/countries
  us_states = Country.new('US').states
  state_abbr_list = ['Mass'] + us_states.map { |abbr, _names| ' ' + abbr }
  state_name_list = us_states.map { |_abbr, names| names["name"] }
  country_name_list = Country.all.map { |name_abbr_pair| name_abbr_pair.first }

  mentions = lambda { |names, text| names.any? { |name| text.include?(name) } }

  geo_term = ''

  if term.include?('--')
    #Parsing a subject geographic term. Later matching pieces overwrite earlier ones.
    pieces = term.split('--')
    pieces.each_with_index do |piece, position|
      if mentions.call(state_name_list, piece) || mentions.call(country_name_list, piece)
        geo_term = pieces[position..-1].reverse.join(',')
      elsif mentions.call(state_abbr_list, piece)
        geo_term = piece
      end
    end
  elsif term.include?(' - ')
    #Experimental... example: Palmer (Mass) - history or Stores (retail trade) - Palmer, Mass
    term.split(' - ').each do |piece|
      if [state_name_list, state_abbr_list, country_name_list].any? { |names| mentions.call(names, piece) }
        geo_term = piece
      end
    end
  elsif [state_name_list, state_abbr_list, country_name_list].any? { |names| mentions.call(names, term) }
    geo_term = term
  end

  geo_term
end
|
54
|
+
|
55
|
+
#Make a string in a standard format.
|
56
|
+
# Puts a geographic string into a standard format.
#
# geo_term - the String to clean up; it is never modified, and a frozen
#            string is accepted. nil is returned unchanged.
#
# Steps: remove every Geomash::Constants::JUNK_TERMS entry, strip leading
# periods/commas and surrounding whitespace, collapse '----' to '--', turn
# semicolons into commas, and rewrite " (word)" groups as ", word".
#
# Returns the cleaned String, or nil when the term contains parentheses that
# do not fit the " (word)" shape.
def self.standardize_geographic_term(geo_term)
  return nil if geo_term.nil?

  #Remove common junk terms
  # Non-destructive gsub: a clone keeps the frozen flag, so gsub! would raise
  # on frozen input.
  geo_term = Geomash::Constants::JUNK_TERMS.inject(geo_term) { |cleaned, junk| cleaned.gsub(junk, '') }

  #Strip any leading periods or commas from junk terms
  geo_term = geo_term.gsub(/^[\.,]+/, '').strip

  #Replace any four TGN dashes from removing a junk term
  geo_term = geo_term.gsub('----', '--')

  #Replace any semicolons with commas... possible strip them?
  geo_term = geo_term.gsub(';', ',')

  #Terms in parentheses will cause some geographic parsers to freak out. Switch to commas instead.
  if geo_term.match(/[\(\)]+/)
    #Skip anything that is not like " (word)", as data returned likely will be unreliable for now... FIXME when use case occurs.
    return nil unless geo_term.match(/ \(+.*\)+/)

    #Make this replacement better?
    geo_term = geo_term.gsub(/ *\((?=[\S ]+\))/,', ')
    geo_term = geo_term.gsub(')', '')
  end

  geo_term
end
|
88
|
+
|
89
|
+
#Attempt to dedup a list of geographic areas.
|
90
|
+
#FIXME: Horrendous first pass.
|
91
|
+
#Aggressive flag removes less specific matches, i.e. ['Hanoi, Vietnam' and 'Vietnam'] would return just ['Hanoi, Vietnam']
|
92
|
+
# Removes entries of a list of geographic strings that repeat another entry.
#
# geo_list   - Array of Strings; a shallow copy is worked on.
# aggressive - when truthy, a later entry whose words are all contained in an
#              earlier, longer entry is dropped as well.
#
# Entries are compared word by word after removing ( ) . , ; characters and
# the Geomash::Constants::JUNK_TERMS. Of two entries with the same words, the
# one with more comma parts (or that is clearly longer and has no '(') is kept.
#
# Returns a new Array with the surviving original strings, in order.
def self.dedup_geo(geo_list, aggressive=false)
  originals = geo_list.clone

  # Bare words of every entry: punctuation and junk removed, whitespace squished.
  word_lists = originals.map do |entry|
    bare = entry.delete('().,;')
    Geomash::Constants::JUNK_TERMS.each { |junk| bare.gsub!(junk, '') }
    bare.squish.split(' ')
  end

  last_position = word_lists.size - 1
  discard = []

  word_lists.each_with_index do |outer_words, outer|
    hits = []
    best_term = originals[outer]
    best_position = outer

    # For each later entry, count how many words of this entry it contains.
    outer_words.each do |word|
      (outer + 1).upto(last_position) do |inner|
        next unless word_lists[inner].include?(word)

        hits[inner] = (hits[inner] || 0) + 1
      end
    end

    hits.each_with_index do |hit_count, candidate|
      candidate_size = word_lists[candidate].size
      outer_size = outer_words.size

      next unless (hit_count || 0) == candidate_size
      next unless candidate_size == outer_size || (candidate_size < outer_size && aggressive)

      rival = originals[candidate]
      if best_term.split(',').size < rival.split(',').size || (best_term.size + 1 < rival.size && !rival.include?('('))
        discard << best_position
        best_term = rival
        best_position = candidate
      else
        discard << candidate
      end
    end
  end

  discard.each { |position| originals[position] = nil }

  originals.compact
end
|
146
|
+
|
147
|
+
# Decides whether the originally entered string carries more information than
# the parsed parts.
#
# geo_hash - Hash with :standardized_term (String) and optional :street_part,
#            :coords, :neighborhood_part, :city_part, :state_part and
#            :country_part.
#
# Returns true or false.
def self.parsed_and_original_check(geo_hash)
  term = geo_hash[:standardized_term]

  return true if geo_hash[:street_part].present? || geo_hash[:coords].present?

  #Keep original string if three parts at least or if there is a number in the term.
  #TODO: Make this better!
  part_count = term.split(',').length
  return true if part_count >= 3 && geo_hash[:neighborhood_part].blank?
  return true if part_count >= 2 && geo_hash[:city_part].blank?
  return true if part_count >= 4
  return true if term.match(/\d/).present?

  return false if geo_hash[:country_part] == 'United States'

  #Currently nothing further is checked when neither city nor state is known.
  return false if geo_hash[:city_part].blank? && geo_hash[:state_part].blank?

  # Outside the US: the term differs unless it mentions the parsed city or state.
  searchable = term.to_ascii.downcase
  mentioned = [:city_part, :state_part].any? do |part|
    geo_hash[part].present? && searchable.include?(geo_hash[part].to_ascii.downcase)
  end

  !mentioned
end
|
171
|
+
|
172
|
+
|
173
|
+
|
174
|
+
#Take LCSH subjects and make them standard.
|
175
|
+
# Normalises an LCSH subject string.
#
# value - the subject String. nil and '' are handed straight to strip_value.
#
# Steps: drop a trailing period (unless it follows an uppercase letter, as in
# an initial, or ends 'etc.'), turn '- -', em/en dashes and ' - ' into '--',
# remove whitespace around '--', capitalise the first character, and finally
# pass the result through strip_value.
#
# Returns whatever strip_value returns for the normalised string.
def self.LCSHize(value)
  return strip_value(value) if value.nil? || value.empty?

  #Remove ending periods ... except when an initial or etc.
  # The length guard and end_with? keep short values such as "ab." from
  # raising on a nil slice.
  if value.length >= 2 && value.end_with?('.') && value[-2].match(/[^A-Z]/) && !value.end_with?('etc.')
    value = value.slice(0..-2)
  end

  #Fix when '- -' occurs
  value = value.gsub(/-\s-/,'--')

  #Fix for "em" dashes (U+2014)
  value = value.gsub("\u2014",'--')

  #Fix for "en" dashes (U+2013)
  value = value.gsub("\u2013",'--')

  #Fix for ' - ' combinations
  value = value.gsub(' - ','--')

  #Remove white space after and before '--'
  value = value.gsub(/\s+--/,'--')
  value = value.gsub(/--\s+/,'--')

  #Ensure first word is capitalized
  value = value[0].capitalize[0] + value[1..-1] unless value.empty?

  #Strip any white space
  strip_value(value)
end
|
205
|
+
|
206
|
+
# Cleans a single value for storage.
#
# value - any object. Floats and Integers are truncated to an integer and
#         turned into a String first.
#
# Returns nil for a blank value, otherwise the result of utf8Encode.
def self.strip_value(value)
  return nil if value.blank?

  # Integer instead of Fixnum: the Fixnum constant no longer exists on
  # Ruby 3.2+, where referencing it raises NameError.
  value = value.to_i.to_s if value.is_a?(Float) || value.is_a?(Integer)

  # Make sure it is all UTF-8 and not character encodings or HTML tags and remove any carriage returns
  utf8Encode(value)
end
|
218
|
+
|
219
|
+
#TODO: Better name for this. Should be part of an overall helper gem.
|
220
|
+
# Turns a value into plain text: line breaks, tabs and self-closing <br/> tags
# become spaces, the ActionView full sanitizer is applied, HTML entities are
# decoded and surrounding whitespace is stripped.
#
# value - any object; it is converted with to_s first.
#
# Returns the cleaned String.
def self.utf8Encode(value)
  flattened = value.to_s
  flattened = flattened.gsub(/\r?\n?\t/, ' ')
  flattened = flattened.gsub(/\r?\n/, ' ')
  flattened = flattened.gsub(/<br[\s]*\/>/,' ')

  without_markup = ActionView::Base.full_sanitizer.sanitize(flattened)
  decoded = HTMLEntities.new.decode(without_markup)

  decoded.strip
end
|
223
|
+
|
224
|
+
|
225
|
+
# Swaps the most specific parsed place name for the spelling that was
# actually entered.
#
# geo_hash - Hash with :standardized_term and optional :neighborhood_part,
#            :city_part and :state_part. It is cloned, never modified.
#
# Only the first present part (neighborhood, then city, then state) is
# considered. Its value is replaced by the first single word of the
# standardized term that equals it ignoring case and after to_ascii.
#
# Returns the adjusted copy of the hash, or nil when none of the three parts
# is present.
def self.try_with_entered_names(geo_hash)
  geo_hash_local = geo_hash.clone

  target_part = [:neighborhood_part, :city_part, :state_part].find { |part| geo_hash_local[part].present? }
  return nil if target_part.nil?

  entered_words = geo_hash_local[:standardized_term].gsub(',', ' ').squish.split(' ')
  same_name = entered_words.select do |word|
    word.downcase.to_ascii == geo_hash_local[target_part].downcase.to_ascii
  end

  geo_hash_local[target_part] = same_name.first.strip if same_name.present?

  geo_hash_local
end
|
248
|
+
|
249
|
+
end
|
250
|
+
end
|