geomash 0.2.0
This diff shows the content of publicly released package versions as they appear in their public registries, and is provided for informational purposes only.
- checksums.yaml +7 -0
- data/Rakefile +32 -0
- data/config/geomash.yml.sample +24 -0
- data/lib/geomash.rb +86 -0
- data/lib/geomash/constants.rb +478 -0
- data/lib/geomash/geonames.rb +145 -0
- data/lib/geomash/parser.rb +220 -0
- data/lib/geomash/standardizer.rb +250 -0
- data/lib/geomash/tgn.rb +673 -0
- data/lib/geomash/town_lookup.rb +19 -0
- data/lib/geomash/version.rb +3 -0
- data/test/geomash_test.rb +146 -0
- data/test/geonames_test.rb +24 -0
- data/test/parser_test.rb +33 -0
- data/test/standardizer_test.rb +26 -0
- data/test/test_helper.rb +16 -0
- data/test/tgn_test.rb +19 -0
- data/test/town_lookup_test.rb +11 -0
- metadata +210 -0
data/lib/geomash/geonames.rb
@@ -0,0 +1,145 @@
+module Geomash
+  class Geonames
+
+    def self.geonames_username
+      Geomash.config[:geonames_username] || '<username>'
+    end
+
+    def self.get_geonames_data(geoname_id)
+      max_retry = 3
+      sleep_time = 60 # In seconds
+      retry_count = 0
+
+      hier_geo = {}
+      coords = {}
+      geonames_data = {}
+
+      begin
+        if retry_count > 0
+          sleep(sleep_time)
+        end
+        retry_count = retry_count + 1
+
+        geonames_response = Typhoeus::Request.get("http://api.geonames.org/hierarchy?username=#{self.geonames_username}&lang=en&style=FULL&geonameId=" + geoname_id)
+
+      end until (geonames_response.code != 500 || retry_count == max_retry)
+
+      unless geonames_response.code == 500
+        parsed_xml = Nokogiri::Slop(geonames_response.body)
+
+        parsed_xml.geonames.geoname.each do |geoname|
+          hier_geo[geoname.fcode.text.downcase.to_sym] = geoname.toponymName.text
+        end
+
+        #FIXME: Code4Lib lazy implementation... will get last result
+        geoname = parsed_xml.geonames.geoname.last
+        coords[:latitude] = geoname.lat.text
+        coords[:longitude] = geoname.lng.text
+        coords[:combined] = coords[:latitude] + ',' + coords[:longitude]
+        #FIXME: Will be corrected as part of Geomash rename later this week.
+        begin
+          coords[:box] = {}
+          coords[:box][:west] = geoname.bbox.west.text
+          coords[:box][:north] = geoname.bbox.north.text
+          coords[:box][:east] = geoname.bbox.east.text
+          coords[:box][:south] = geoname.bbox.south.text
+        rescue
+          coords[:box] = {}
+        end
+
+        geonames_data[:coords] = coords
+        geonames_data[:hier_geo] = hier_geo.present? ? hier_geo : nil
+      end
+
+      return geonames_data
+    end
+
+
+    def self.geonames_id_from_geo_hash(geo_hash)
+      return nil if Geomash::Geonames.geonames_username == '<username>'
+      geo_hash = geo_hash.clone
+
+      max_retry = 3
+      sleep_time = 60 # In seconds
+      retry_count = 0
+
+      geonames_search_array = []
+      return_hash = {}
+
+      #Don't do both neighborhood and city!
+      if geo_hash[:neighborhood_part].present?
+        geonames_search_array << geo_hash[:neighborhood_part]
+        exact_name_term = geo_hash[:neighborhood_part]
+      elsif geo_hash[:city_part].present?
+        geonames_search_array << geo_hash[:city_part]
+        exact_name_term = geo_hash[:neighborhood_part]
+      end
+
+      geonames_search_array << geo_hash[:state_part] if geo_hash[:state_part].present?
+      exact_name_term ||= geo_hash[:neighborhood_part]
+      geonames_search_array << geo_hash[:country_part] if geo_hash[:country_part].present?
+      exact_name_term ||= geo_hash[:country_part]
+      geonames_search_string = geonames_search_array.join(', ')
+
+      exact_name_term = geonames_search_array.first.strip
+
+      begin
+        if retry_count > 0
+          sleep(sleep_time)
+        end
+        retry_count = retry_count + 1
+
+        geonames_response = Typhoeus::Request.get("http://api.geonames.org/search?username=#{self.geonames_username}&lang=en&style=FULL&q=#{CGI.escape(geonames_search_string)}&name_equals=#{CGI.escape(exact_name_term)}&country=#{Country.find_country_by_name(geo_hash[:country_part]).alpha2}")
+
+      end until (geonames_response.code != 500 || retry_count == max_retry)
+
+      unless geonames_response.code == 500
+
+        parsed_xml = Nokogiri::Slop(geonames_response.body)
+
+        begin
+          raise "geonames status error message of: #{parsed_xml.to_s}" if parsed_xml.geonames.status
+        rescue
+          #Do nothing but FIXME to not use slop
+        end
+
+        #This is ugly and needs to be redone to achieve better recursive...
+        if parsed_xml.geonames.totalResultsCount.text == '0'
+          if geo_hash[:neighborhood_part].present?
+            geo_hash_temp = geo_hash.clone
+            geo_hash_temp[:neighborhood_part] = nil
+            return_hash = geonames_id_from_geo_hash(geo_hash_temp)
+            return return_hash if return_hash.present?
+          elsif geo_hash[:city_part].present?
+            geo_hash_temp = geo_hash.clone
+            geo_hash_temp[:city_part] = nil
+            return_hash = geonames_id_from_geo_hash(geo_hash_temp)
+            return return_hash if return_hash.present?
+          end
+
+          return nil
+        end
+
+        #Exact Match ... FIXME to not use Slop
+        if parsed_xml.geonames.geoname.class == Nokogiri::XML::Element
+          return_hash[:id] = parsed_xml.geonames.geoname.geonameId.text
+          return_hash[:rdf] = "http://sws.geonames.org/#{return_hash[:id]}/about.rdf"
+        elsif parsed_xml.geonames.geoname.class == Nokogiri::XML::NodeSet
+          return_hash[:id] = parsed_xml.geonames.geoname.first.geonameId.text
+          return_hash[:rdf] = "http://sws.geonames.org/#{return_hash[:id]}/about.rdf"
+        end
+        return_hash[:original_string_differs] = Geomash::Standardizer.parsed_and_original_check(geo_hash)
+
+      end
+
+      if geonames_response.code == 500
+        raise 'Geonames Server appears to not be responding for Geographic query: ' + term
+      end
+
+      return return_hash if return_hash.present?
+
+      return nil
+
+    end
+  end
+end
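For reference, a minimal usage sketch of the Geonames class added above (not part of the released diff). It assumes the gem and its dependencies (typhoeus, nokogiri, countries, activesupport) are loaded and that a real GeoNames username has been configured in Geomash.config; the hash keys mirror the ones the code itself reads.

    require 'geomash'

    # Hypothetical lookup hash; :country_part must be a name the countries gem recognizes.
    geo_hash = {
      :city_part    => 'Boston',
      :state_part   => 'Massachusetts',
      :country_part => 'United States'
    }

    match = Geomash::Geonames.geonames_id_from_geo_hash(geo_hash)
    if match.present?
      puts match[:id]   # GeoNames ID of the best match
      puts match[:rdf]  # "http://sws.geonames.org/<id>/about.rdf"

      # Fetch the place hierarchy and coordinates for that ID.
      details = Geomash::Geonames.get_geonames_data(match[:id])
      puts details[:coords][:combined] if details[:coords].present?
    end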
data/lib/geomash/parser.rb
@@ -0,0 +1,220 @@
+module Geomash
+  class Parser
+
+    def self.mapquest_key
+      Geomash.config[:mapquest_key] || '<mapquest_key>'
+    end
+
+    def self.bing_key
+      Geomash.config[:bing_key] || '<bing_key>'
+    end
+
+    def self.timeout
+      Geomash.config[:timeout]
+    end
+
+    #Note: Limited to only looking at United States places...
+    def self.parse_bing_api(term, parse_term_flag=false)
+      return_hash = {}
+      retry_count = 3
+
+      #Skip if no bing_key... possibly move this elsewhere?
+      return return_hash if self.bing_key == '<bing_key>'
+
+      return_hash[:original_term] = term
+
+      term = Geomash::Standardizer.parse_for_geographic_term(term) if parse_term_flag
+      term = Geomash::Standardizer.standardize_geographic_term(term)
+
+      if term.blank?
+        return {}
+      end
+
+      return_hash[:standardized_term] = term
+
+      #Bing API does badly with parentheses...
+      if term.match(/[\(\)]+/)
+        return {}
+      end
+
+      #Sometimes with building, city, state, bing is dumb and will only return state. Example: Boston Harbor, Boston, Mass.
+      #So if not a street address, pass to have google handle it for better results...
+      #Example of another bad record: South Street bridge, West Bridgewater, Mass. would give a place in Holyoke
+      if term.split(',').length >= 3 && term.match(/\d/).blank? && term.downcase.match(/ave\.*,/).blank? && term.downcase.match(/avenue\.*,/).blank? && term.downcase.match(/street\.*,/).blank? && term.downcase.match(/st\.*,/).blank? && term.downcase.match(/road\.*,/).blank? && term.downcase.match(/rd\.*,/).blank?
+        return {}
+      end
+
+      Geocoder.configure(:lookup => :bing,:api_key => self.bing_key,:timeout => self.timeout, :always_raise => :all)
+      bing_api_result = Geocoder.search(term)
+
+    rescue SocketError => e
+      retry unless (retry_count -= 1).zero?
+    else
+
+      #Use only for United States results... international results are inaccurate.
+      if bing_api_result.present? && bing_api_result.first.data["address"]["countryRegion"] == 'United States'
+
+        if bing_api_result.first.data["entityType"] == 'Neighborhood'
+          return {} #Doesn't return a city... Google handles this better.
+        end
+
+        if bing_api_result.first.data["address"]["addressLine"].present?
+          return_hash[:term_differs_from_tgn] = true
+          return_hash[:street_part] = bing_api_result.first.data["address"]["addressLine"]
+          return_hash[:coords] = {:latitude=>bing_api_result.first.data["geocodePoints"].first["coordinates"].first.to_s,
+                                  :longitude=>bing_api_result.first.data["geocodePoints"].first["coordinates"].last.to_s,
+                                  :combined=>bing_api_result.first.data["geocodePoints"].first["coordinates"].first.to_s + ',' + bing_api_result.first.data["geocodePoints"].first["coordinates"].last.to_s}
+        end
+
+        return_hash[:country_part] = bing_api_result.first.data["address"]["countryRegion"]
+
+        if return_hash[:country_part] == 'United States'
+          return_hash[:state_part] = Geomash::Constants::STATE_ABBR[bing_api_result.first.data["address"]["adminDistrict"]]
+        else
+          return_hash[:state_part] = bing_api_result.first.data["address"]["adminDistrict"]
+        end
+
+        return_hash[:city_part] = bing_api_result.first.data["address"]["locality"]
+      else
+        return {}
+      end
+
+      #Only return if USA for now. International results often awful.
+      return return_hash[:country_part] == 'United States' ? return_hash : {}
+    end
+
+    #Mapquest allows unlimited requests - start here?
+    def self.parse_mapquest_api(term, parse_term_flag=false)
+      return_hash = {}
+      retry_count = 3
+
+      #Skip if no bing_key... possibly move this elsewhere?
+      return return_hash if self.mapquest_key == '<mapquest_key>'
+
+      return_hash[:original_term] = term
+
+      term = Geomash::Standardizer.parse_for_geographic_term(term) if parse_term_flag
+      term = Geomash::Standardizer.standardize_geographic_term(term)
+
+      if term.blank?
+        return {}
+      end
+
+      return_hash[:standardized_term] = term
+
+      #Mapquest returns bad data for: Manchester, Mass.
+      if term.include?('Manchester') || term.include?('Atlanta, MI')
+        return {}
+      end
+
+      #Messed up with just neighborhoods. Example: Hyde Park (Boston, Mass.) or Hyde Park (Boston, Mass.)
+      #So if not a street address, pass to have google handle it for better results...
+      if term.split(',').length >= 3 && term.match(/\d/).blank? && term.downcase.match(/ave\.*,/).blank? && term.downcase.match(/avenue\.*,/).blank? && term.downcase.match(/street\.*,/).blank? && term.downcase.match(/st\.*,/).blank? && term.downcase.match(/road\.*,/).blank? && term.downcase.match(/rd\.*,/).blank?
+        return {}
+      end
+
+      Geocoder.configure(:lookup => :mapquest,:api_key => self.mapquest_key,:timeout => self.timeout, :always_raise => :all)
+
+      mapquest_api_result = Geocoder.search(term)
+    rescue SocketError => e
+      retry unless (retry_count -= 1).zero?
+    else
+
+
+      #If this call returned a result...
+      if mapquest_api_result.present?
+
+        if mapquest_api_result.first.data["street"].present?
+          #return_hash[:term_differs_from_tgn] = true
+          return_hash[:street_part] = mapquest_api_result.first.data["street"]
+          return_hash[:coords] = {:latitude=>mapquest_api_result.first.data['latLng']['lat'].to_s,
+                                  :longitude=>mapquest_api_result.first.data['latLng']['lng'].to_s,
+                                  :combined=>mapquest_api_result.first.data['latLng']['lat'].to_s + ',' + mapquest_api_result.first.data['latLng']['lng'].to_s}
+        end
+
+        return_hash[:country_part] = Country.new(mapquest_api_result.first.data["adminArea1"]).name
+
+        if return_hash[:country_part] == 'United States'
+          return_hash[:state_part] = Geomash::Constants::STATE_ABBR[mapquest_api_result.first.data["adminArea3"]] || mapquest_api_result.first.data["adminArea4"]
+        else
+          return_hash[:state_part] = mapquest_api_result.first.data["adminArea3"].gsub(' province', '')
+        end
+
+        return_hash[:city_part] = mapquest_api_result.first.data["adminArea5"]
+
+        return_hash[:city_part] = return_hash[:city_part].gsub(' City', '') #Return New York as New York City...
+      end
+
+      #Only return if USA for now. Google is better with stuff like: 'Long Binh, Vietnam'
+      #Also only return if there is a city if there were more than two terms passed in. Fixes: Roxbury, MA
+      return {} unless return_hash[:country_part] == 'United States'
+      return {} if term.split(',').length >= 2 && return_hash[:city_part].blank?
+
+      return return_hash
+    end
+
+    #Final fallback is google API. The best but we are limited to 2500 requests per day unless we pay the $10k a year premium account...
+    #Note: If google cannot find street, it will return just city/state, like for "Salem Street and Paradise Road, Swampscott, MA, 01907"
+    #Seems like it sets a partial_match=>true in the data section...
+    def self.parse_google_api(term, parse_term_flag=false)
+      return_hash = {}
+      retry_count = 3
+
+      return_hash[:original_term] = term
+
+      term = Geomash::Standardizer.parse_for_geographic_term(term) if parse_term_flag
+      term = Geomash::Standardizer.standardize_geographic_term(term)
+
+      #Soviet Union returns back a place in Kazakhstan
+      if term.blank? || term == 'Soviet Union'
+        return {}
+      end
+
+      return_hash[:standardized_term] = term
+
+      ::Geocoder.configure(:lookup => :google,:api_key => nil,:timeout => self.timeout, :always_raise => :all)
+
+      google_api_result = ::Geocoder.search(term)
+    rescue SocketError => e
+      retry unless (retry_count -= 1).zero?
+    else
+
+
+      #Check if only a partial match. To avoid errors, strip out the first part and try again...
+      #Need better way to check for street endings. See: http://pe.usps.gov/text/pub28/28apc_002.htm
+      if google_api_result.present?
+        if google_api_result.first.data['partial_match'] && term.split(',').length > 1 && !term.downcase.include?('street') && !term.downcase.include?('st.') && !term.downcase.include?('avenue') && !term.downcase.include?('ave.') && !term.downcase.include?('court') && !term.downcase.include?('dr.')
+          term = term.split(',')[1..term.split(',').length-1].join(',').strip
+          google_api_result = Geocoder.search(term)
+        end
+      end
+
+      if google_api_result.present?
+        #Types: street number, route, neighborhood, establishment, transit_station, bus_station
+        google_api_result.first.data["address_components"].each do |result|
+          if (result['types'] & ['street number', 'route', 'establishment', 'transit_station', 'bus_station']).present? || (result['types'].include?('neighborhood') && !result['types'].include?('political'))
+            #return_hash[:term_differs_from_tgn] = true
+            #TODO: Not implemented for Google results right now.
+            #return_hash[:street_part] = 'TODO: Not Implemented for Google Results'
+            return_hash[:coords] = {:latitude=>google_api_result.first.data['geometry']['location']['lat'].to_s,
+                                    :longitude=>google_api_result.first.data['geometry']['location']['lng'].to_s,
+                                    :combined=>google_api_result.first.data['geometry']['location']['lat'].to_s + ',' + google_api_result.first.data['geometry']['location']['lng'].to_s}
+          elsif (result['types'] & ['country']).present?
+            return_hash[:country_part] = result['long_name']
+          elsif (result['types'] & ['administrative_area_level_1']).present?
+            return_hash[:state_part] = result['long_name'].to_ascii
+          elsif (result['types'] & ['locality']).present?
+            return_hash[:city_part] = result['long_name']
+          elsif (result['types'] & ['sublocality', 'political']).length == 2 || result['types'].include?('neighborhood')
+            return_hash[:neighborhood_part] = result['long_name']
+          end
+        end
+
+        return_hash[:term_differs_from_tgn] ||= google_api_result.first.data['partial_match'] unless google_api_result.first.data['partial_match'].blank?
+      end
+
+
+      return return_hash
+    end
+  end
+end
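A hedged sketch of how the three parser backends above might be chained (not part of the released diff), assuming Geomash.config supplies :mapquest_key, :bing_key and :timeout. Per the comments in the code, MapQuest is the cheap first pass, Bing is kept to United States results, and Google is the rate-limited final fallback; each method returns a hash with keys such as :city_part, :state_part, :country_part and :coords, or {} when the backend is skipped or finds nothing.

    require 'geomash'

    term = 'Boston, Mass.'

    # Try the backends in order of cost, falling through on an empty result.
    result = Geomash::Parser.parse_mapquest_api(term)
    result = Geomash::Parser.parse_bing_api(term)   if result.blank?
    result = Geomash::Parser.parse_google_api(term) if result.blank?

    if result.present?
      puts result[:city_part]            # parsed city name, when one was found
      puts result[:state_part]           # full state name via STATE_ABBR for US results
      puts result[:coords].inspect if result[:coords].present?
    end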
data/lib/geomash/standardizer.rb
@@ -0,0 +1,250 @@
+# -*- coding: utf-8 -*-
+module Geomash
+  class Standardizer
+
+    #Take a subject string and look for potential geographic terms.
+    def self.parse_for_geographic_term(term)
+      geo_term = ''
+
+      #Likely too long to be an address... some fields have junk with an address string...
+      if term.length > 125
+        return ''
+      end
+
+      state_abbr_list = ['Mass']
+      state_name_list = []
+      country_name_list = []
+
+      #Countries gem of https://github.com/hexorx/countries
+      Country.new('US').states.each do |state_abbr, state_names|
+        state_abbr_list << ' ' + state_abbr
+        state_name_list << state_names["name"]
+      end
+
+      Country.all.each do |country_name_abbr_pair|
+        country_name_list << country_name_abbr_pair.first
+      end
+
+      #Parsing a subject geographic term.
+      if term.include?('--')
+        term.split('--').each_with_index do |split_term, index|
+          if state_name_list.any? { |state| split_term.include? state } || country_name_list.any? { |country| split_term.include? country }
+            geo_term = term.split('--')[index..term.split('--').length-1].reverse!.join(',')
+          elsif state_abbr_list.any? { |abbr| split_term.include? abbr }
+            geo_term = split_term
+          end
+        end
+      #Other than a '--' field
+      #Experimental... example: Palmer (Mass) - history or Stores (retail trade) - Palmer, Mass
+      elsif term.include?(' - ')
+        term.split(' - ').each do |split_term|
+          if state_name_list.any? { |state| split_term.include? state } || state_abbr_list.any? { |abbr| split_term.include? abbr } || country_name_list.any? { |country| split_term.include? country }
+            geo_term = split_term
+          end
+
+        end
+      else
+        if state_name_list.any? { |state| term.include? state } || state_abbr_list.any? { |abbr| term.include? abbr } || country_name_list.any? { |country| term.include? country }
+          geo_term = term
+        end
+      end
+
+      return geo_term
+    end
+
+    #Make a string in a standard format.
+    def self.standardize_geographic_term(geo_term)
+
+      geo_term = geo_term.clone #Don't change original
+
+      #Remove common junk terms
+      Geomash::Constants::JUNK_TERMS.each { |term| geo_term.gsub!(term, '') }
+
+      #Strip any leading periods or commas from junk terms
+      geo_term = geo_term.gsub(/^[\.,]+/, '').strip
+
+      #Replace any four TGN dashes from removing a junk term
+      geo_term = geo_term.gsub('----', '--')
+
+      #Replace any semicolons with commas... possible strip them?
+      geo_term = geo_term.gsub(';', ',')
+
+      #Terms in paranthesis will cause some geographic parsers to freak out. Switch to commas instead.
+      if geo_term.match(/[\(\)]+/)
+        #Attempt to fix address if something like (word)
+        if geo_term.match(/ \(+.*\)+/)
+          #Make this replacement better?
+          geo_term = geo_term.gsub(/ *\((?=[\S ]+\))/,', ')
+          geo_term = geo_term.gsub(')', '')
+
+        #Else skip this as data returned likely will be unreliable for now... FIXME when use case occurs.
+        else
+          return nil
+        end
+      end
+
+      return geo_term
+    end
+
+    #Attempt to dedup a list of geographic areas.
+    #FIXME: Horrendous first pass.
+    #Aggresive flag removes less specific matches. IE. ['Hanoi, Vietnam' and 'Vietnam'] would return just ['Hanoi, Vietnam']
+    def self.dedup_geo(geo_list, aggressive=false)
+      geo_list = geo_list.clone
+
+      base_word_geo_list = []
+      geo_list.each do |geo_term|
+        geo_term = geo_term.gsub('(','').gsub(')','').gsub('.','').gsub(',','').gsub(';','')
+        #Remove common junk terms
+        Geomash::Constants::JUNK_TERMS.each { |term| geo_term.gsub!(term, '') }
+
+        geo_term = geo_term.squish
+
+        base_word_geo_list << geo_term
+      end
+
+      indexes_to_remove = []
+
+      0.upto base_word_geo_list.size-1 do |index|
+        matched_words_count = []
+        current_best_term = geo_list[index]
+        current_best_term_index = index
+
+        base_word_geo_list[index].split(' ').each { |word|
+
+          (index+1).upto base_word_geo_list.size-1 do |inner_index|
+            if base_word_geo_list[inner_index].split(' ').any? { |single_word| single_word == word }
+              matched_words_count[inner_index] ||= 0
+              matched_words_count[inner_index] = matched_words_count[inner_index] + 1
+
+            end
+          end
+        }
+
+        matched_words_count.each_with_index do |matched_count, matched_index|
+          matched_count ||= 0
+
+          if (matched_count == base_word_geo_list[matched_index].split(' ').size) && ((base_word_geo_list[matched_index].split(' ').size < base_word_geo_list[index].split(' ').size && aggressive) || (base_word_geo_list[matched_index].split(' ').size == base_word_geo_list[index].split(' ').size))
+            if current_best_term.split(',').size < geo_list[matched_index].split(',').size || (current_best_term.size+1 < geo_list[matched_index].size && !geo_list[matched_index].include?('('))
+              current_best_term = geo_list[matched_index]
+              indexes_to_remove << current_best_term_index
+              current_best_term_index = matched_index
+            else
+              indexes_to_remove << matched_index
+            end
+          end
+
+        end
+      end
+
+      indexes_to_remove.each do |removal_index|
+        geo_list[removal_index] = nil
+      end
+
+      return geo_list.compact
+    end
+
+    def self.parsed_and_original_check(geo_hash)
+      term = geo_hash[:standardized_term]
+
+      if geo_hash[:street_part].present? || geo_hash[:coords].present?
+        return true
+      end
+
+      #Keep original string if three parts at least or if there is a number in the term.
+      #TODO: Make this better!
+      if (term.split(',').length >= 3 && geo_hash[:neighborhood_part].blank?) || (term.split(',').length >= 2 && geo_hash[:city_part].blank?) || term.split(',').length >= 4 || term.match(/\d/).present?
+        return true
+      end
+
+      if geo_hash[:country_part] != 'United States'
+        if geo_hash[:city_part].blank? && geo_hash[:state_part].blank?
+          #Currently do noting
+        elsif !((geo_hash[:city_part].present? && term.to_ascii.downcase.include?(geo_hash[:city_part].to_ascii.downcase)) || (geo_hash[:state_part].present? && term.to_ascii.downcase.include?(geo_hash[:state_part].to_ascii.downcase)))
+          return true
+        end
+      end
+
+
+      return false
+    end
+
+
+
+    #Take LCSH subjects and make them standard.
+    def self.LCSHize(value)
+      #Remove ending periods ... except when an initial or etc.
+      if value.last == '.' && value[-2].match(/[^A-Z]/) && !value[-4..-1].match('etc.')
+        value = value.slice(0..-2)
+      end
+
+      #Fix when '- -' occurs
+      value = value.gsub(/-\s-/,'--')
+
+      #Fix for "em" dashes - two types?
+      value = value.gsub('—','--')
+
+      #Fix for "em" dashes - two types?
+      value = value.gsub('–','--')
+
+      #Fix for ' - ' combinations
+      value = value.gsub(' - ','--')
+
+      #Remove white space after and before '--'
+      value = value.gsub(/\s+--/,'--')
+      value = value.gsub(/--\s+/,'--')
+
+      #Ensure first work is capitalized
+      value[0] = value.first.capitalize[0]
+
+      #Strip any white space
+      value = strip_value(value)
+
+      return value
+    end
+
+    def self.strip_value(value)
+      if(value.blank?)
+        return nil
+      else
+        if value.class == Float || value.class == Fixnum
+          value = value.to_i.to_s
+        end
+
+        # Make sure it is all UTF-8 and not character encodings or HTML tags and remove any cariage returns
+        return utf8Encode(value)
+      end
+    end
+
+    #TODO: Better name for this. Should be part of an overall helped gem.
+    def self.utf8Encode(value)
+      return HTMLEntities.new.decode(ActionView::Base.full_sanitizer.sanitize(value.to_s.gsub(/\r?\n?\t/, ' ').gsub(/\r?\n/, ' ').gsub(/<br[\s]*\/>/,' '))).strip
+    end
+
+
+    def self.try_with_entered_names(geo_hash)
+      geo_hash_local = geo_hash.clone
+      if geo_hash_local[:neighborhood_part].present?
+        orig_string_check = geo_hash_local[:standardized_term].gsub(',', ' ').squish.split(' ').select { |value| value.downcase.to_ascii == geo_hash_local[:neighborhood_part].downcase.to_ascii}
+        geo_hash_local[:neighborhood_part] = orig_string_check.first.strip if orig_string_check.present? && orig_string_check != geo_hash_local[:neighborhood_part]
+        return geo_hash_local
+      end
+
+      if geo_hash_local[:city_part].present?
+        orig_string_check = geo_hash_local[:standardized_term].gsub(',', ' ').squish.split(' ').select { |value| value.downcase.to_ascii == geo_hash_local[:city_part].downcase.to_ascii}
+        geo_hash_local[:city_part] = orig_string_check.first.strip if orig_string_check.present?
+        return geo_hash_local
+      end
+
+
+      if geo_hash_local[:state_part].present?
+        orig_string_check = geo_hash_local[:standardized_term].gsub(',', ' ').squish.split(' ').select { |value| value.downcase.to_ascii == geo_hash_local[:state_part].downcase.to_ascii}
+        geo_hash_local[:state_part] = orig_string_check.first.strip if orig_string_check.present?
+        return geo_hash_local
+      end
+
+      return nil
+    end
+
+  end
+end
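An illustrative sketch of the Standardizer helpers above (not part of the released diff). The inputs are invented and the commented outputs are inferred from reading the code, not from running it; the helpers also assume ActiveSupport, the countries gem, and HTMLEntities are loaded via the gem.

    require 'geomash'

    # Pull a geographic phrase out of an LCSH-style subject string.
    subject = 'Stores (retail trade) - Palmer, Mass'
    Geomash::Standardizer.parse_for_geographic_term(subject)
    # expected: the segment containing the place, e.g. "Palmer, Mass"

    # Normalize the extracted phrase (junk terms, parentheses, semicolons).
    Geomash::Standardizer.standardize_geographic_term('Palmer (Mass.)')
    # expected: roughly "Palmer, Mass."

    # Collapse overlapping terms; aggressive=true drops the less specific entry.
    Geomash::Standardizer.dedup_geo(['Hanoi, Vietnam', 'Vietnam'], true)
    # per the comment in the code: ["Hanoi, Vietnam"]

    # Normalize an LCSH heading's dashes, spacing, and trailing period.
    Geomash::Standardizer.LCSHize('Boston (Mass.) — History.')
    # expected: "Boston (Mass.)--History"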