bplgeo 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/Rakefile +32 -0
  3. data/lib/bplgeo/constants.rb +478 -0
  4. data/lib/bplgeo/geonames.rb +146 -0
  5. data/lib/bplgeo/parser.rb +227 -0
  6. data/lib/bplgeo/standardizer.rb +213 -0
  7. data/lib/bplgeo/tgn.rb +314 -0
  8. data/lib/bplgeo/town_lookup.rb +19 -0
  9. data/lib/bplgeo/version.rb +3 -0
  10. data/lib/bplgeo.rb +35 -0
  11. data/lib/tasks/bplgeo_tasks.rake +4 -0
  12. data/test/bplgeo_test.rb +102 -0
  13. data/test/dummy/README.rdoc +28 -0
  14. data/test/dummy/Rakefile +6 -0
  15. data/test/dummy/app/assets/javascripts/application.js +13 -0
  16. data/test/dummy/app/assets/stylesheets/application.css +13 -0
  17. data/test/dummy/app/controllers/application_controller.rb +5 -0
  18. data/test/dummy/app/helpers/application_helper.rb +2 -0
  19. data/test/dummy/app/views/layouts/application.html.erb +14 -0
  20. data/test/dummy/bin/bundle +3 -0
  21. data/test/dummy/bin/rails +4 -0
  22. data/test/dummy/bin/rake +4 -0
  23. data/test/dummy/config/application.rb +23 -0
  24. data/test/dummy/config/boot.rb +5 -0
  25. data/test/dummy/config/bplgeo.yml +23 -0
  26. data/test/dummy/config/bplgeo.yml.sample +24 -0
  27. data/test/dummy/config/database.yml +25 -0
  28. data/test/dummy/config/environment.rb +5 -0
  29. data/test/dummy/config/environments/development.rb +29 -0
  30. data/test/dummy/config/environments/production.rb +80 -0
  31. data/test/dummy/config/environments/test.rb +36 -0
  32. data/test/dummy/config/initializers/backtrace_silencers.rb +7 -0
  33. data/test/dummy/config/initializers/filter_parameter_logging.rb +4 -0
  34. data/test/dummy/config/initializers/inflections.rb +16 -0
  35. data/test/dummy/config/initializers/mime_types.rb +5 -0
  36. data/test/dummy/config/initializers/secret_token.rb +12 -0
  37. data/test/dummy/config/initializers/session_store.rb +3 -0
  38. data/test/dummy/config/initializers/wrap_parameters.rb +14 -0
  39. data/test/dummy/config/locales/en.yml +23 -0
  40. data/test/dummy/config/routes.rb +56 -0
  41. data/test/dummy/config.ru +4 -0
  42. data/test/dummy/db/test.sqlite3 +0 -0
  43. data/test/dummy/log/development.log +35 -0
  44. data/test/dummy/public/404.html +58 -0
  45. data/test/dummy/public/422.html +58 -0
  46. data/test/dummy/public/500.html +57 -0
  47. data/test/dummy/public/favicon.ico +0 -0
  48. data/test/geonames_test.rb +24 -0
  49. data/test/parser_test.rb +33 -0
  50. data/test/test_helper.rb +15 -0
  51. data/test/tgn_test.rb +19 -0
  52. data/test/town_lookup_test.rb +11 -0
  53. metadata +236 -0
@@ -0,0 +1,146 @@
1
+ module Bplgeo
2
+ class Geonames
3
# Loads, once, the environment-specific section of config/bplgeo.yml.
#
# The file is rendered through ERB and then parsed as YAML; the section keyed
# by the current Rails environment is returned with indifferent access and
# memoized in @bplgeo_config. When Rails.root or Rails.env is nil, the dummy
# test application ('./test/dummy') and the 'test' environment are used.
def self.bplgeo_config
  @bplgeo_config ||= begin
    app_root = Rails.root || './test/dummy'
    environment = Rails.env || 'test'
    config_path = File.join(app_root, 'config', 'bplgeo.yml')
    rendered_yaml = ERB.new(IO.read(config_path)).result
    YAML::load(rendered_yaml)[environment].with_indifferent_access
  end
end
8
+
9
# Geonames account name read from the configuration. The literal placeholder
# '<username>' is returned when no name is configured.
def self.geonames_username
  configured_name = bplgeo_config[:geonames_username]
  return '<username>' unless configured_name

  configured_name
end
12
+
13
# Fetches hierarchy and coordinate data for a Geonames record.
#
# Calls the Geonames "hierarchy" web service for +geoname_id+ (String or
# Integer). A response with HTTP status 500 is retried, for at most three
# attempts in total, sleeping 60 seconds between attempts.
#
# Returns a Hash with:
#   :hier_geo - feature code (downcased Symbol) => toponym name for every
#               <geoname> element in the response
#   :coords   - :latitude, :longitude, :combined ("lat,lng") and :box
#               (:west, :north, :east, :south) of the LAST <geoname> element
# An empty Hash is returned when every attempt ended with status 500 or when
# the response contains no <geoname> element.
def self.get_geonames_data(geoname_id)
  max_retry = 3
  sleep_time = 60 # In seconds
  retry_count = 0

  hier_geo = {}
  coords = {}
  geonames_data = {}

  # Interpolate (and escape) the id: the previous String concatenation raised
  # TypeError for anything that was not already a String.
  request_url = "http://api.geonames.org/hierarchy?username=#{self.geonames_username}&lang=en&style=FULL&geonameId=#{CGI.escape(geoname_id.to_s)}"

  geonames_response = nil
  loop do
    sleep(sleep_time) if retry_count > 0
    retry_count += 1

    geonames_response = Typhoeus::Request.get(request_url)
    break if geonames_response.code != 500 || retry_count == max_retry
  end

  return geonames_data if geonames_response.code == 500

  parsed_xml = Nokogiri::Slop(geonames_response.body)

  # An explicit XPath always yields a NodeSet (possibly empty). The Slop
  # shortcut `geonames.geoname` raises when the element is missing and hands
  # back a single Node instead of a list when there is exactly one.
  geoname_nodes = parsed_xml.xpath('//geoname')
  return geonames_data if geoname_nodes.empty?

  geoname_nodes.each do |geoname|
    hier_geo[geoname.fcode.text.downcase.to_sym] = geoname.toponymName.text
  end

  #FIXME: Code4Lib lazy implementation... will get last result
  geoname = geoname_nodes.last
  coords[:latitude] = geoname.lat.text
  coords[:longitude] = geoname.lng.text
  coords[:combined] = coords[:latitude] + ',' + coords[:longitude]
  coords[:box] = {}
  coords[:box][:west] = geoname.bbox.west.text
  coords[:box][:north] = geoname.bbox.north.text
  coords[:box][:east] = geoname.bbox.east.text
  coords[:box][:south] = geoname.bbox.south.text

  geonames_data[:coords] = coords
  geonames_data[:hier_geo] = hier_geo.present? ? hier_geo : nil

  geonames_data
end
56
+
57
+
58
# Looks up the Geonames id that best matches a parsed place hash.
#
# +geo_hash+ may carry :neighborhood_part, :city_part, :state_part and
# :country_part. The search string is "<neighborhood or city>, <state>,
# <country>" (neighborhood wins over city; blank parts are skipped) and is
# sent to the Geonames "search" web service. A response with HTTP status 500
# is retried, for at most three attempts, sleeping 60 seconds in between.
#
# Matching is done on the first search part (ASCII-folded, downcased,
# stripped) against each result's toponymName: an exact match is preferred,
# otherwise the first result that starts with the term is used. When the
# service reports zero results the most specific part (neighborhood, then
# city) is dropped and the lookup is repeated.
#
# Returns {:id => String, :original_string_differs => true/false}, or nil
# when no username is configured, nothing searchable is present, or nothing
# matched. The caller's hash is not modified (a shallow copy is used).
#
# Raises RuntimeError when the service still answers 500 after all attempts.
def self.geonames_id_from_geo_hash(geo_hash)
  return nil if geonames_username == '<username>'
  geo_hash = geo_hash.clone

  max_retry = 3
  sleep_time = 60 # In seconds
  retry_count = 0

  geonames_search_array = []

  #Don't do both neighborhood and city!
  if geo_hash[:neighborhood_part].present?
    geonames_search_array << geo_hash[:neighborhood_part]
  elsif geo_hash[:city_part].present?
    geonames_search_array << geo_hash[:city_part]
  end

  geonames_search_array << geo_hash[:state_part] if geo_hash[:state_part].present?
  geonames_search_array << geo_hash[:country_part] if geo_hash[:country_part].present?

  # Nothing to search for; previously this fell through to nil.to_ascii.
  return nil if geonames_search_array.empty?

  geonames_search_string = geonames_search_array.join(', ')
  match_term = geonames_search_array.first.to_ascii.downcase.strip

  geonames_response = nil
  loop do
    sleep(sleep_time) if retry_count > 0
    retry_count += 1

    geonames_response = Typhoeus::Request.get("http://api.geonames.org/search?username=#{geonames_username}&lang=en&style=FULL&q=" + CGI.escape(geonames_search_string))
    break if geonames_response.code != 500 || retry_count == max_retry
  end

  if geonames_response.code == 500
    # The message used to reference an undefined local (`term`).
    raise 'Geonames Server appears to not be responding for Geographic query: ' + geonames_search_string
  end

  parsed_xml = Nokogiri::Slop(geonames_response.body)

  # No hits: drop the most specific part and try again with what is left.
  # (The original referenced undefined locals here and could return the input
  # hash itself instead of a lookup result.)
  if parsed_xml.xpath('//totalResultsCount').text == '0'
    if geo_hash[:neighborhood_part].present?
      geo_hash[:neighborhood_part] = nil
      return geonames_id_from_geo_hash(geo_hash)
    elsif geo_hash[:city_part].present?
      geo_hash[:city_part] = nil
      return geonames_id_from_geo_hash(geo_hash)
    end

    return nil
  end

  # Explicit XPath so that a single <geoname> still comes back as a list.
  candidates = parsed_xml.xpath('//geoname')

  #Exact Match
  matched = candidates.find do |geoname|
    geoname.toponymName.text.to_ascii.downcase.strip == match_term
  end

  #Starts With
  matched ||= candidates.find do |geoname|
    geoname.toponymName.text.to_ascii.downcase.strip.start_with?(match_term)
  end

  return nil if matched.nil?

  {
    :id => matched.geonameId.text,
    :original_string_differs => Bplgeo::Standardizer.parsed_and_original_check(geo_hash)
  }
end
145
+ end
146
+ end
@@ -0,0 +1,227 @@
1
+ module Bplgeo
2
+ class Parser
3
+
4
# Returns the settings for the current environment from config/bplgeo.yml.
#
# The YAML file is first evaluated as an ERB template. The result is cached
# in @bplgeo_config after the first successful load. A nil Rails.root falls
# back to './test/dummy' and a nil Rails.env falls back to 'test'.
def self.bplgeo_config
  base_dir = Rails.root || './test/dummy'
  env_name = Rails.env || 'test'
  return @bplgeo_config if @bplgeo_config

  yaml_source = ERB.new(IO.read(File.join(base_dir, 'config', 'bplgeo.yml'))).result
  all_environments = YAML::load(yaml_source)
  @bplgeo_config = all_environments[env_name].with_indifferent_access
end
10
+
11
# MapQuest API key from the configuration; the placeholder '<mapquest_key>'
# is returned when no key is configured.
def self.mapquest_key
  configured_key = bplgeo_config[:mapquest_key]
  configured_key ? configured_key : '<mapquest_key>'
end
14
+
15
# Bing Maps API key from the configuration; the placeholder '<bing_key>' is
# returned when no key is configured.
def self.bing_key
  configured_key = bplgeo_config[:bing_key]
  return configured_key if configured_key

  '<bing_key>'
end
18
+
19
# The :timeout entry of the configuration (nil when absent). The parse_*_api
# methods pass it to Geocoder.configure as :timeout.
def self.timeout
  settings = bplgeo_config
  settings[:timeout]
end
22
+
23
# Geocodes +term+ with the Bing API (through the Geocoder gem).
#
# Note: limited to United States places; any other result is discarded.
#
# term            - the place string to look up
# parse_term_flag - when true, +term+ is first run through
#                   Standardizer.parse_for_geographic_term
#
# Returns a Hash with :original_term, :standardized_term, :country_part,
# :state_part and :city_part, plus :street_part, :coords and
# :term_differs_from_tgn when Bing returned an address line.
# Returns {} when no Bing key is configured, the standardized term is blank
# or contains parentheses, the term looks like "place, city, state" without
# a street address, Bing has no United States match, the match is a
# neighborhood, or the lookup kept failing with SocketError.
#
# Errors other than SocketError propagate (Geocoder is configured with
# :always_raise => :all).
def self.parse_bing_api(term, parse_term_flag=false)
  return_hash = {}

  #Skip if no bing_key... possibly move this elsewhere?
  return return_hash if self.bing_key == '<bing_key>'

  return_hash[:original_term] = term

  term = Bplgeo::Standardizer.parse_for_geographic_term(term) if parse_term_flag
  term = Bplgeo::Standardizer.standardize_geographic_term(term)

  return {} if term.blank?

  return_hash[:standardized_term] = term

  #Bing API does badly with parentheses...
  return {} if term =~ /[\(\)]+/

  #Sometimes with building, city, state, bing is dumb and will only return state. Example: Boston Harbor, Boston, Mass.
  #So if not a street address, pass to have google handle it for better results...
  #Example of another bad record: South Street bridge, West Bridgewater, Mass. would give a place in Holyoke
  looks_like_street_address = term =~ /\d/ || term.downcase =~ /(?:ave|avenue|street|st|road|rd)\.*,/
  return {} if term.split(',').length >= 3 && !looks_like_street_address

  Geocoder.configure(:lookup => :bing,:api_key => self.bing_key,:timeout => self.timeout, :always_raise => :all)

  # Only the network call is retried. The previous method-level `retry`
  # restarted the whole method, which reset the counter (so a persistent
  # SocketError looped forever) and re-standardized the rewritten term.
  attempts_left = 3
  begin
    bing_api_result = Geocoder.search(term)
  rescue SocketError
    attempts_left -= 1
    retry if attempts_left > 0
    return {}
  end

  return {} if bing_api_result.blank?

  data = bing_api_result.first.data
  address = data["address"] || {}

  #Use only for United States results... international results are inaccurate.
  return {} unless address["countryRegion"] == 'United States'

  #Doesn't return a city... Google handles this better.
  return {} if data["entityType"] == 'Neighborhood'

  if address["addressLine"].present?
    coordinates = data["geocodePoints"].first["coordinates"]
    latitude = coordinates.first.to_s
    longitude = coordinates.last.to_s

    return_hash[:term_differs_from_tgn] = true
    return_hash[:street_part] = address["addressLine"]
    return_hash[:coords] = {:latitude => latitude,
                            :longitude => longitude,
                            :combined => latitude + ',' + longitude}
  end

  return_hash[:country_part] = address["countryRegion"]
  # Only United States results reach this point, so the state is always
  # expanded through the abbreviation table.
  return_hash[:state_part] = Bplgeo::Constants::STATE_ABBR[address["adminDistrict"]]
  return_hash[:city_part] = address["locality"]

  return_hash
end
92
+
93
# Geocodes +term+ with the MapQuest API (through the Geocoder gem).
#
#Mapquest allows unlimited requests - start here?
#
# term            - the place string to look up
# parse_term_flag - when true, +term+ is first run through
#                   Standardizer.parse_for_geographic_term
#
# Returns a Hash with :original_term, :standardized_term, :country_part,
# :state_part and :city_part, plus :street_part and :coords when MapQuest
# returned a street.
# Returns {} when no MapQuest key is configured, the standardized term is
# blank or on the known-bad list, the term looks like "place, city, state"
# without a street address, the match is not in the United States, a
# multi-part term produced no city, or the lookup kept failing with
# SocketError.
#
# Errors other than SocketError propagate (Geocoder is configured with
# :always_raise => :all).
def self.parse_mapquest_api(term, parse_term_flag=false)
  return_hash = {}

  #Skip if no mapquest_key... possibly move this elsewhere?
  # (This used to compare bing_key with the MapQuest placeholder, so the
  # guard never fired.)
  return return_hash if self.mapquest_key == '<mapquest_key>'

  return_hash[:original_term] = term

  term = Bplgeo::Standardizer.parse_for_geographic_term(term) if parse_term_flag
  term = Bplgeo::Standardizer.standardize_geographic_term(term)

  return {} if term.blank?

  return_hash[:standardized_term] = term

  #Mapquest returns bad data for: Manchester, Mass.
  return {} if term.include?('Manchester') || term.include?('Atlanta, MI')

  #Messed up with just neighborhoods. Example: Hyde Park (Boston, Mass.)
  #So if not a street address, pass to have google handle it for better results...
  looks_like_street_address = term =~ /\d/ || term.downcase =~ /(?:ave|avenue|street|st|road|rd)\.*,/
  return {} if term.split(',').length >= 3 && !looks_like_street_address

  Geocoder.configure(:lookup => :mapquest,:api_key => self.mapquest_key,:timeout => self.timeout, :always_raise => :all)

  # Only the network call is retried. The previous method-level `retry`
  # restarted the whole method and reset the counter, so a persistent
  # SocketError looped forever.
  attempts_left = 3
  begin
    mapquest_api_result = Geocoder.search(term)
  rescue SocketError
    attempts_left -= 1
    retry if attempts_left > 0
    return {}
  end

  return {} if mapquest_api_result.blank?

  data = mapquest_api_result.first.data

  #Only return if USA for now. Google is better with stuff like: 'Long Binh, Vietnam'
  # The nil check covers a lookup that yields no country object.
  country = Country.new(data["adminArea1"])
  return {} unless country && country.name == 'United States'

  if data["street"].present?
    latitude = data['latLng']['lat'].to_s
    longitude = data['latLng']['lng'].to_s

    #return_hash[:term_differs_from_tgn] = true
    return_hash[:street_part] = data["street"]
    return_hash[:coords] = {:latitude => latitude,
                            :longitude => longitude,
                            :combined => latitude + ',' + longitude}
  end

  return_hash[:country_part] = country.name
  return_hash[:state_part] = Bplgeo::Constants::STATE_ABBR[data["adminArea3"]] || data["adminArea4"]

  #Strip a ' City' suffix, e.g. "New York City" becomes "New York".
  city = data["adminArea5"]
  return_hash[:city_part] = city.nil? ? nil : city.gsub(' City', '')

  #Also only return if there is a city if there were more than two terms passed in. Fixes: Roxbury, MA
  return {} if term.split(',').length >= 2 && return_hash[:city_part].blank?

  return_hash
end
162
+
163
# Geocodes +term+ with the Google API (through the Geocoder gem).
#
#Final fallback is google API. The best but we are limited to 2500 requests per day unless we pay the $10k a year premium account...
#Note: If google cannot find street, it will return just city/state, like for "Salem Street and Paradise Road, Swampscott, MA, 01907"
#Seems like it sets a partial_match=>true in the data section...
#
# term            - the place string to look up
# parse_term_flag - when true, +term+ is first run through
#                   Standardizer.parse_for_geographic_term
#
# Returns a Hash with :original_term and :standardized_term plus whichever of
# :country_part, :state_part, :city_part, :neighborhood_part, :street_part,
# :coords and :term_differs_from_tgn could be derived from the first result.
# Returns {} when the standardized term is blank or 'Soviet Union', or when a
# lookup kept failing with SocketError.
#
# Errors other than SocketError propagate (Geocoder is configured with
# :always_raise => :all).
def self.parse_google_api(term, parse_term_flag=false)
  return_hash = {}

  return_hash[:original_term] = term

  term = Bplgeo::Standardizer.parse_for_geographic_term(term) if parse_term_flag
  term = Bplgeo::Standardizer.standardize_geographic_term(term)

  #Soviet Union returns back a place in Kazakhstan
  return {} if term.blank? || term == 'Soviet Union'

  return_hash[:standardized_term] = term

  ::Geocoder.configure(:lookup => :google,:api_key => nil,:timeout => self.timeout, :always_raise => :all)

  # Runs one search with up to three attempts; nil means every attempt raised
  # SocketError. The previous method-level `retry` restarted the whole method
  # and reset its counter, so a persistent SocketError looped forever, and the
  # follow-up search below was not protected at all.
  search_with_retry = lambda do |query|
    attempts_left = 3
    begin
      ::Geocoder.search(query)
    rescue SocketError
      attempts_left -= 1
      retry if attempts_left > 0
      nil
    end
  end

  google_api_result = search_with_retry.call(term)
  return {} if google_api_result.nil?

  #Check if only a partial match. To avoid errors, strip out the first part and try again...
  #Need better way to check for street endings. See: http://pe.usps.gov/text/pub28/28apc_002.htm
  if google_api_result.present?
    lowered_term = term.downcase
    mentions_street = ['street', 'st.', 'avenue', 'ave.', 'court', 'dr.'].any? { |marker| lowered_term.include?(marker) }

    if google_api_result.first.data['partial_match'] && term.split(',').length > 1 && !mentions_street
      term = term.split(',').drop(1).join(',').strip
      google_api_result = search_with_retry.call(term)
      return {} if google_api_result.nil?
    end
  end

  if google_api_result.present?
    first_result = google_api_result.first.data
    location = first_result['geometry']['location']

    #Types: street_number, route, neighborhood, establishment, transit_station, bus_station
    # ('street number' with a space never matched Google's 'street_number'.)
    street_level_types = ['street_number', 'route', 'neighborhood', 'establishment', 'transit_station', 'bus_station']

    first_result["address_components"].each do |result|
      if (result['types'] & street_level_types).present?
        #return_hash[:term_differs_from_tgn] = true
        #TODO: Not implemented for Google results right now.
        return_hash[:street_part] = 'TODO: Not Implemented for Google Results'
        return_hash[:coords] = {:latitude => location['lat'].to_s,
                                :longitude => location['lng'].to_s,
                                :combined => location['lat'].to_s + ',' + location['lng'].to_s}
      elsif (result['types'] & ['country']).present?
        return_hash[:country_part] = result['long_name']
      elsif (result['types'] & ['administrative_area_level_1']).present?
        return_hash[:state_part] = result['long_name'].to_ascii
      elsif (result['types'] & ['locality']).present?
        return_hash[:city_part] = result['long_name']
      elsif (result['types'] & ['sublocality', 'political']).length == 2
        return_hash[:neighborhood_part] = result['long_name']
      end
    end

    partial_match = first_result['partial_match']
    return_hash[:term_differs_from_tgn] ||= partial_match unless partial_match.blank?
  end

  return_hash
end
226
+ end
227
+ end
@@ -0,0 +1,213 @@
1
+ module Bplgeo
2
+ class Standardizer
3
+
4
#Take a subject string and look for potential geographic terms.
#
# A piece of text counts as geographic when it contains a US state name, or
# 'Mass', or a state abbreviation preceded by a space (the state list comes
# from the countries gem: https://github.com/hexorx/countries).
#
# * "a--b--c" subjects: every segment is checked in order and the last hit
#   wins. A segment containing a state name yields that segment and all later
#   ones, reversed and comma-joined; a segment containing an abbreviation
#   yields just that segment.
# * "a - b" subjects: the last matching segment is returned.
# * anything else: the whole term is returned when it matches.
#
# Returns '' when nothing matches, when +term+ is nil, or when +term+ is
# longer than 125 characters (likely junk rather than a place).
def self.parse_for_geographic_term(term)
  #Likely too long to be an address... some fields have junk with an address string...
  return '' if term.nil? || term.length > 125

  state_abbr_list = ['Mass']
  state_name_list = []

  #Countries gem of https://github.com/hexorx/countries
  Country.new('US').states.each do |state_abbr, state_names|
    state_abbr_list << ' ' + state_abbr
    state_name_list << state_names["name"]
  end

  names_a_state = lambda { |text| state_name_list.any? { |state| text.include?(state) } }
  abbreviates_a_state = lambda { |text| state_abbr_list.any? { |abbr| text.include?(abbr) } }

  geo_term = ''

  if term.include?('--')
    #Parsing a subject geographic term.
    segments = term.split('--')
    segments.each_with_index do |segment, index|
      if names_a_state.call(segment)
        geo_term = segments[index..-1].reverse.join(',')
      elsif abbreviates_a_state.call(segment)
        geo_term = segment
      end
    end
  elsif term.include?(' - ')
    #Other than a '--' field
    #Experimental... example: Palmer (Mass) - history or Stores (retail trade) - Palmer, Mass
    term.split(' - ').each do |segment|
      geo_term = segment if names_a_state.call(segment) || abbreviates_a_state.call(segment)
    end
  elsif names_a_state.call(term) || abbreviates_a_state.call(term)
    geo_term = term
  end

  geo_term
end
48
+
49
#Make a string in a standard format.
#
# Works on a copy, so +geo_term+ itself is never modified (frozen strings are
# accepted). Steps, in order: remove every Bplgeo::Constants::JUNK_TERMS
# entry, strip leading periods/commas and surrounding whitespace, turn
# semicolons into commas, and rewrite "Place (Qualifier)" as
# "Place, Qualifier".
#
# Returns the standardized String, or nil when +geo_term+ is nil or contains
# parentheses that do not follow the " (...)" pattern.
def self.standardize_geographic_term(geo_term)
  return nil if geo_term.nil?

  # dup rather than clone: clone keeps the frozen flag, which made the gsub!
  # below raise for frozen input.
  geo_term = geo_term.dup #Don't change original

  #Remove common junk terms
  Bplgeo::Constants::JUNK_TERMS.each { |term| geo_term.gsub!(term, '') }

  #Strip any leading periods or commas from junk terms
  geo_term = geo_term.gsub(/^[\.,]+/, '').strip

  #Replace any semicolons with commas... possible strip them?
  geo_term = geo_term.gsub(';', ',')

  #Terms in parentheses will cause some geographic parsers to freak out. Switch to commas instead.
  if geo_term.match(/[\(\)]+/)
    #Skip this as data returned likely will be unreliable for now... FIXME when use case occurs.
    return nil unless geo_term.match(/ \(+.*\)+/)

    #Attempt to fix address if something like (word)
    #Make this replacement better?
    geo_term = geo_term.gsub(/ *\((?=[\S ]+\))/,', ').gsub(')', '')
  end

  geo_term
end
79
+
80
#Attempt to dedup a list of geographic areas.
#FIXME: Horrendous first pass.
#
# Every entry is reduced to its bare words (parentheses, periods, commas,
# semicolons and Bplgeo::Constants::JUNK_TERMS removed). An entry is then
# compared with each later entry: when all of its words occur in the later
# entry and the word counts allow it, one of the two is dropped. The kept one
# is the entry with more comma-separated parts, or the clearly longer one
# provided it has no '('.
#
# geo_list  - Array of place strings; it is copied, not modified.
# aggresive - documented intent: also remove less specific matches, i.e.
#             ['Hanoi, Vietnam', 'Vietnam'] would return ['Hanoi, Vietnam'].
#             In the code it only admits a later entry with FEWER words than
#             the current one.
#
# Returns a new Array without the dropped entries.
def self.dedup_geo(geo_list, aggresive=false)
  terms = geo_list.clone

  # Bare word list for every entry, in the same order as terms.
  word_lists = terms.map do |raw_term|
    bare = raw_term.gsub('(','').gsub(')','').gsub('.','').gsub(',','').gsub(';','')
    #Remove common junk terms
    Bplgeo::Constants::JUNK_TERMS.each { |junk| bare.gsub!(junk, '') }
    bare.squish.split(' ')
  end

  doomed_indexes = []

  word_lists.each_with_index do |words, index|
    best_term = terms[index]
    best_index = index

    ((index + 1)...word_lists.size).each do |other_index|
      other_words = word_lists[other_index]

      # How many of this entry's words (repeats included) the later entry has.
      shared = words.count { |word| other_words.include?(word) }
      next if shared.zero? || shared != words.size

      same_length = other_words.size == words.size
      shorter_and_aggressive = aggresive && other_words.size < words.size
      next unless shorter_and_aggressive || same_length

      candidate = terms[other_index]
      more_parts = best_term.split(',').size < candidate.split(',').size
      clearly_longer = best_term.size + 1 < candidate.size && !candidate.include?('(')

      if more_parts || clearly_longer
        doomed_indexes << best_index
        best_term = candidate
        best_index = other_index
      else
        doomed_indexes << other_index
      end
    end
  end

  doomed_indexes.each { |doomed| terms[doomed] = nil }

  terms.compact
end
135
+
136
# Decides whether the standardized term says more than the parsed parts do,
# i.e. whether the original string is worth keeping alongside them.
#
# geo_hash - Hash with :standardized_term and optionally :street_part,
#            :coords, :neighborhood_part, :city_part, :state_part and
#            :country_part.
#
# Returns true when a street or coordinates were found; when the term has
# more comma-separated parts than the parsed pieces account for, or contains
# a digit; or, outside the United States, when neither the parsed city nor
# the parsed state occurs in the term. Returns false otherwise.
def self.parsed_and_original_check(geo_hash)
  term = geo_hash[:standardized_term]

  return true if geo_hash[:street_part].present? || geo_hash[:coords].present?

  #Keep original string if three parts at least or if there is a number in the term.
  #TODO: Make this better!
  return true if term.split(',').length >= 3 && geo_hash[:neighborhood_part].blank?
  return true if term.split(',').length >= 2 && geo_hash[:city_part].blank?
  return true if term.split(',').length >= 4 || term.match(/\d/).present?

  return false if geo_hash[:country_part] == 'United States'

  city = geo_hash[:city_part]
  state = geo_hash[:state_part]

  #Currently do nothing when neither part was parsed
  return false if city.blank? && state.blank?

  mentioned = (city.present? && term.to_ascii.downcase.include?(city.to_ascii.downcase)) ||
              (state.present? && term.to_ascii.downcase.include?(state.to_ascii.downcase))

  mentioned ? false : true
end
160
+
161
+
162
+
163
#Take LCSH subjects and make them standard.
#
# Steps, in order: trim surrounding whitespace; drop one trailing period
# unless it follows a capital letter (an initial) or ends "etc."; turn
# "- -", em dashes, en dashes and " - " into "--"; remove whitespace around
# "--"; capitalize the first character; finally hand the value to
# strip_value.
#
# value - the subject heading (nil is accepted).
#
# Returns whatever strip_value returns for the cleaned String, or nil when
# +value+ is nil.
def self.LCSHize(value)
  return nil if value.nil?

  # Trim first so the period and capitalization rules see the real first and
  # last characters (previously " boston." was left lower-case).
  value = value.to_s.strip

  #Remove ending periods ... except when an initial or etc.
  # end_with? and to_s keep this safe for strings shorter than four
  # characters, which used to raise NoMethodError on nil.
  if value.end_with?('.') && value[-2].to_s !~ /[A-Z]/ && !value.end_with?('etc.')
    value = value[0..-2]
  end

  #Fix when '- -' occurs
  value = value.gsub(/-\s-/,'--')

  #Fix for "em" dashes
  value = value.gsub("\u2014",'--')

  #Fix for "en" dashes
  value = value.gsub("\u2013",'--')

  #Fix for ' - ' combinations
  value = value.gsub(' - ','--')

  #Remove white space after and before '--'
  value = value.gsub(/\s+--/,'--')
  value = value.gsub(/--\s+/,'--')

  #Ensure first word is capitalized (nothing to do for an empty string)
  value[0] = value[0].capitalize[0] unless value.empty?

  #Strip any white space
  strip_value(value)
end
194
+
195
# Normalizes a raw field value into clean text.
#
# value - any object. Floats and Integers are first converted to their
#         integer string form (a Float loses its fractional part: 2.7
#         becomes "2").
#
# Returns nil when +value+ is blank, otherwise the result of utf8Encode.
def self.strip_value(value)
  return nil if value.blank?

  # Integer replaces the Fixnum constant, which no longer exists in Ruby 3.2+
  # and made this check raise NameError for every non-Float value.
  value = value.to_i.to_s if value.is_a?(Float) || value.is_a?(Integer)

  # Make sure it is all UTF-8 and not character encodings or HTML tags and remove any carriage returns
  utf8Encode(value)
end
207
+
208
#TODO: Better name for this. Should be part of an overall helper gem.
#
# Turns +value+ (converted with to_s) into plain text: tabs (with an optional
# preceding line break), line breaks and <br/> tags become single spaces, the
# result is passed through ActionView's full sanitizer, HTML entities are
# decoded, and surrounding whitespace is stripped.
def self.utf8Encode(value)
  single_line = value.to_s.gsub(/\r?\n?\t/, ' ').gsub(/\r?\n/, ' ').gsub(/<br[\s]*\/>/,' ')
  without_markup = ActionView::Base.full_sanitizer.sanitize(single_line)
  decoded = HTMLEntities.new.decode(without_markup)
  return decoded.strip
end
212
+ end
213
+ end