bplgeo 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. checksums.yaml +7 -0
  2. data/Rakefile +32 -0
  3. data/lib/bplgeo/constants.rb +478 -0
  4. data/lib/bplgeo/geonames.rb +146 -0
  5. data/lib/bplgeo/parser.rb +227 -0
  6. data/lib/bplgeo/standardizer.rb +213 -0
  7. data/lib/bplgeo/tgn.rb +314 -0
  8. data/lib/bplgeo/town_lookup.rb +19 -0
  9. data/lib/bplgeo/version.rb +3 -0
  10. data/lib/bplgeo.rb +35 -0
  11. data/lib/tasks/bplgeo_tasks.rake +4 -0
  12. data/test/bplgeo_test.rb +102 -0
  13. data/test/dummy/README.rdoc +28 -0
  14. data/test/dummy/Rakefile +6 -0
  15. data/test/dummy/app/assets/javascripts/application.js +13 -0
  16. data/test/dummy/app/assets/stylesheets/application.css +13 -0
  17. data/test/dummy/app/controllers/application_controller.rb +5 -0
  18. data/test/dummy/app/helpers/application_helper.rb +2 -0
  19. data/test/dummy/app/views/layouts/application.html.erb +14 -0
  20. data/test/dummy/bin/bundle +3 -0
  21. data/test/dummy/bin/rails +4 -0
  22. data/test/dummy/bin/rake +4 -0
  23. data/test/dummy/config/application.rb +23 -0
  24. data/test/dummy/config/boot.rb +5 -0
  25. data/test/dummy/config/bplgeo.yml +23 -0
  26. data/test/dummy/config/bplgeo.yml.sample +24 -0
  27. data/test/dummy/config/database.yml +25 -0
  28. data/test/dummy/config/environment.rb +5 -0
  29. data/test/dummy/config/environments/development.rb +29 -0
  30. data/test/dummy/config/environments/production.rb +80 -0
  31. data/test/dummy/config/environments/test.rb +36 -0
  32. data/test/dummy/config/initializers/backtrace_silencers.rb +7 -0
  33. data/test/dummy/config/initializers/filter_parameter_logging.rb +4 -0
  34. data/test/dummy/config/initializers/inflections.rb +16 -0
  35. data/test/dummy/config/initializers/mime_types.rb +5 -0
  36. data/test/dummy/config/initializers/secret_token.rb +12 -0
  37. data/test/dummy/config/initializers/session_store.rb +3 -0
  38. data/test/dummy/config/initializers/wrap_parameters.rb +14 -0
  39. data/test/dummy/config/locales/en.yml +23 -0
  40. data/test/dummy/config/routes.rb +56 -0
  41. data/test/dummy/config.ru +4 -0
  42. data/test/dummy/db/test.sqlite3 +0 -0
  43. data/test/dummy/log/development.log +35 -0
  44. data/test/dummy/public/404.html +58 -0
  45. data/test/dummy/public/422.html +58 -0
  46. data/test/dummy/public/500.html +57 -0
  47. data/test/dummy/public/favicon.ico +0 -0
  48. data/test/geonames_test.rb +24 -0
  49. data/test/parser_test.rb +33 -0
  50. data/test/test_helper.rb +15 -0
  51. data/test/tgn_test.rb +19 -0
  52. data/test/town_lookup_test.rb +11 -0
  53. metadata +236 -0
@@ -0,0 +1,146 @@
1
+ module Bplgeo
2
+ class Geonames
3
# Returns the bplgeo settings for the current environment.
#
# Reads config/bplgeo.yml below Rails.root (or ./test/dummy when Rails.root
# is nil), renders it through ERB, parses the YAML and keeps the section
# named after Rails.env (or 'test' when Rails.env is nil). The section is
# memoized in @bplgeo_config, so the file is only read on the first call.
#
# Returns the section converted with with_indifferent_access.
def self.bplgeo_config
  root = Rails.root || './test/dummy'
  env = Rails.env || 'test'

  @bplgeo_config ||= begin
    config_file = File.join(root, 'config', 'bplgeo.yml')
    rendered_yaml = ERB.new(IO.read(config_file)).result
    YAML::load(rendered_yaml)[env].with_indifferent_access
  end
end
8
+
9
# Returns the GeoNames account name from the bplgeo configuration, or the
# placeholder '<username>' when none is configured.
def self.geonames_username
  configured_name = bplgeo_config[:geonames_username]
  configured_name || '<username>'
end
12
+
13
# Fetches the place hierarchy and coordinates for a GeoNames id.
#
# Calls the GeoNames "hierarchy" web service. A response with code 500 is
# retried, sleeping sleep_time seconds between attempts, for at most
# max_retry attempts in total.
#
# geoname_id - the GeoNames identifier; a String or an Integer.
#
# Returns a Hash. On a non-500 response it contains:
#   :coords   - :latitude, :longitude, :combined ("lat,lng") and :box
#               (:west, :north, :east, :south), all read from the last
#               entry of the returned hierarchy.
#   :hier_geo - toponym names keyed by the downcased feature code (as a
#               Symbol), or nil when no entries were found.
# When every attempt answered with code 500 the Hash is empty.
def self.get_geonames_data(geoname_id)
  max_retry = 3
  sleep_time = 60 # In seconds

  # Interpolate the id instead of concatenating it so Integer ids work too.
  url = "http://api.geonames.org/hierarchy?username=#{self.geonames_username}&lang=en&style=FULL&geonameId=#{geoname_id}"

  geonames_response = nil
  max_retry.times do |attempt|
    sleep(sleep_time) if attempt > 0
    geonames_response = Typhoeus::Request.get(url)
    break unless geonames_response.code == 500
  end

  geonames_data = {}
  return geonames_data if geonames_response.code == 500

  parsed_xml = Nokogiri::Slop(geonames_response.body)

  hier_geo = {}
  parsed_xml.geonames.geoname.each do |geoname|
    hier_geo[geoname.fcode.text.downcase.to_sym] = geoname.toponymName.text
  end

  #FIXME: Code4Lib lazy implementation... will get last result
  geoname = parsed_xml.geonames.geoname.last
  latitude = geoname.lat.text
  longitude = geoname.lng.text

  geonames_data[:coords] = {
    :latitude => latitude,
    :longitude => longitude,
    :combined => latitude + ',' + longitude,
    :box => {
      :west => geoname.bbox.west.text,
      :north => geoname.bbox.north.text,
      :east => geoname.bbox.east.text,
      :south => geoname.bbox.south.text
    }
  }
  geonames_data[:hier_geo] = hier_geo.empty? ? nil : hier_geo

  geonames_data
end
56
+
57
+
58
# Looks up the GeoNames id that best matches a parsed geographic hash.
#
# The search string is built from the most specific place part (the
# neighborhood if present, otherwise the city) followed by the state and
# country parts. If GeoNames reports zero results, the most specific part
# is dropped and the lookup is repeated with what remains.
#
# geo_hash - Hash that may contain :neighborhood_part, :city_part,
#            :state_part and :country_part. It is cloned, never modified.
#
# Returns nil when no GeoNames username is configured, when geo_hash has no
# usable parts, or when nothing matches. Otherwise returns a Hash with:
#   :id                      - the geonameId text of the first result whose
#                              toponym equals the leading search part, or
#                              failing that the first that starts with it.
#   :original_string_differs - the value of
#                              Bplgeo::Standardizer.parsed_and_original_check(geo_hash).
#
# Raises RuntimeError when GeoNames answered with code 500 on every attempt.
def self.geonames_id_from_geo_hash(geo_hash)
  return nil if Bplgeo::Geonames.geonames_username == '<username>'
  geo_hash = geo_hash.clone

  max_retry = 3
  sleep_time = 60 # In seconds

  #Don't do both neighborhood and city!
  most_specific_key = [:neighborhood_part, :city_part].find { |key| geo_hash[key].present? }

  geonames_search_array = []
  geonames_search_array << geo_hash[most_specific_key] if most_specific_key
  geonames_search_array << geo_hash[:state_part] if geo_hash[:state_part].present?
  geonames_search_array << geo_hash[:country_part] if geo_hash[:country_part].present?

  # Nothing to search for (previously crashed calling to_ascii on nil).
  return nil if geonames_search_array.empty?

  geonames_search_string = geonames_search_array.join(', ')
  match_term = geonames_search_array.first.to_ascii.downcase.strip

  url = "http://api.geonames.org/search?username=#{self.geonames_username}&lang=en&style=FULL&q=" + CGI.escape(geonames_search_string)

  geonames_response = nil
  max_retry.times do |attempt|
    sleep(sleep_time) if attempt > 0
    geonames_response = Typhoeus::Request.get(url)
    break unless geonames_response.code == 500
  end

  if geonames_response.code == 500
    raise 'Geonames Server appears to not be responding for Geographic query: ' + geonames_search_string
  end

  parsed_xml = Nokogiri::Slop(geonames_response.body)

  if parsed_xml.geonames.totalResultsCount.text == '0'
    # Nothing more specific left to drop: there is simply no match.
    return nil unless most_specific_key

    geo_hash[most_specific_key] = nil
    return geonames_id_from_geo_hash(geo_hash)
  end

  candidates = parsed_xml.geonames.geoname
  normalized_name = lambda { |geoname| geoname.toponymName.text.to_ascii.downcase.strip }

  #Exact Match first, then Starts With
  best_match = candidates.find { |geoname| normalized_name.call(geoname) == match_term }
  best_match ||= candidates.find { |geoname| normalized_name.call(geoname).start_with?(match_term) }
  return nil if best_match.nil?

  {
    :id => best_match.geonameId.text,
    :original_string_differs => Bplgeo::Standardizer.parsed_and_original_check(geo_hash)
  }
end
145
+ end
146
+ end
@@ -0,0 +1,227 @@
1
+ module Bplgeo
2
+ class Parser
3
+
4
# Returns the bplgeo settings for the current environment.
#
# The file config/bplgeo.yml is looked up below Rails.root (./test/dummy
# when Rails.root is nil), rendered with ERB and parsed as YAML; the section
# for Rails.env ('test' when Rails.env is nil) is kept and memoized in
# @bplgeo_config so later calls do not touch the file again.
#
# Returns the section converted with with_indifferent_access.
def self.bplgeo_config
  root = Rails.root || './test/dummy'
  env = Rails.env || 'test'

  @bplgeo_config ||= begin
    yaml_source = IO.read(File.join(root, 'config', 'bplgeo.yml'))
    all_environments = YAML::load(ERB.new(yaml_source).result)
    all_environments[env].with_indifferent_access
  end
end
10
+
11
# Returns the MapQuest API key from the bplgeo configuration, or the
# placeholder '<mapquest_key>' when none is configured.
def self.mapquest_key
  configured_key = bplgeo_config[:mapquest_key]
  configured_key || '<mapquest_key>'
end
14
+
15
# Returns the Bing API key from the bplgeo configuration, or the
# placeholder '<bing_key>' when none is configured.
def self.bing_key
  configured_key = bplgeo_config[:bing_key]
  configured_key || '<bing_key>'
end
18
+
19
# Returns the :timeout entry of the bplgeo configuration (nil when it is
# not set). The value is handed to Geocoder.configure by the parse_* methods.
def self.timeout
  settings = bplgeo_config
  settings[:timeout]
end
22
+
23
+ #Note: Limited to only looking at United States places...
24
# Geocodes a term with the Bing API (United States results only).
#
# term            - the geographic string to look up.
# parse_term_flag - when true the term is first run through
#                   Bplgeo::Standardizer.parse_for_geographic_term.
#
# Returns an empty Hash when no Bing key is configured, the standardized
# term is blank or contains parentheses, the term has three or more comma
# parts and does not look like a street address, the lookup keeps failing
# with SocketError, or the best result is missing, outside the United States
# or a Neighborhood entity.
# Otherwise returns a Hash with :original_term, :standardized_term,
# :country_part, :state_part, :city_part and - when Bing returned an address
# line - :street_part, :coords and :term_differs_from_tgn.
def self.parse_bing_api(term, parse_term_flag=false)
  return_hash = {}

  #Skip if no bing_key... possibly move this elsewhere?
  return return_hash if self.bing_key == '<bing_key>'

  return_hash[:original_term] = term

  term = Bplgeo::Standardizer.parse_for_geographic_term(term) if parse_term_flag
  term = Bplgeo::Standardizer.standardize_geographic_term(term)
  return {} if term.blank?

  return_hash[:standardized_term] = term

  #Bing API does badly with parentheses...
  return {} if term =~ /[\(\)]+/

  #Sometimes with building, city, state, bing is dumb and will only return state. Example: Boston Harbor, Boston, Mass.
  #So if not a street address, pass to have google handle it for better results...
  #Example of another bad record: South Street bridge, West Bridgewater, Mass. would give a place in Holyoke
  street_like = /\d|ave\.*,|avenue\.*,|street\.*,|st\.*,|road\.*,|rd\.*,/
  return {} if term.split(',').length >= 3 && term.downcase !~ street_like

  Geocoder.configure(:lookup => :bing,:api_key => self.bing_key,:timeout => self.timeout, :always_raise => :all)

  # Retry only the network call. Retrying the whole method reset the counter
  # on every pass (an endless loop) and re-standardized the term.
  attempts_left = 3
  begin
    bing_api_result = Geocoder.search(term)
  rescue SocketError
    attempts_left -= 1
    retry if attempts_left > 0
    return {}
  end

  return {} if bing_api_result.blank?

  best_result = bing_api_result.first.data
  address = best_result["address"]

  #Use only for United States results... international results are inaccurate.
  return {} unless address["countryRegion"] == 'United States'

  #Doesn't return a city... Google handles this better.
  return {} if best_result["entityType"] == 'Neighborhood'

  if address["addressLine"].present?
    coordinates = best_result["geocodePoints"].first["coordinates"]
    latitude = coordinates.first.to_s
    longitude = coordinates.last.to_s

    return_hash[:term_differs_from_tgn] = true
    return_hash[:street_part] = address["addressLine"]
    return_hash[:coords] = {:latitude=>latitude, :longitude=>longitude, :combined=>latitude + ',' + longitude}
  end

  return_hash[:country_part] = address["countryRegion"]
  return_hash[:state_part] = Bplgeo::Constants::STATE_ABBR[address["adminDistrict"]]
  return_hash[:city_part] = address["locality"]

  return_hash
end
92
+
93
+ #Mapquest allows unlimited requests - start here?
94
# Geocodes a term with the MapQuest API (United States results only).
#
# term            - the geographic string to look up.
# parse_term_flag - when true the term is first run through
#                   Bplgeo::Standardizer.parse_for_geographic_term.
#
# Returns an empty Hash when no MapQuest key is configured, the standardized
# term is blank, the term is one MapQuest is known to get wrong, the term
# has three or more comma parts and does not look like a street address,
# the lookup keeps failing with SocketError, the best result is missing or
# not in the United States, or a multi-part term produced no city.
# Otherwise returns a Hash with :original_term, :standardized_term,
# :country_part, :state_part, :city_part and - when MapQuest returned a
# street - :street_part and :coords.
def self.parse_mapquest_api(term, parse_term_flag=false)
  return_hash = {}

  #Skip if no mapquest_key... possibly move this elsewhere?
  return return_hash if self.mapquest_key == '<mapquest_key>'

  return_hash[:original_term] = term

  term = Bplgeo::Standardizer.parse_for_geographic_term(term) if parse_term_flag
  term = Bplgeo::Standardizer.standardize_geographic_term(term)
  return {} if term.blank?

  return_hash[:standardized_term] = term

  #Mapquest returns bad data for: Manchester, Mass.
  return {} if term.include?('Manchester') || term.include?('Atlanta, MI')

  #Messed up with just neighborhoods. Example: Hyde Park (Boston, Mass.)
  #So if not a street address, pass to have google handle it for better results...
  street_like = /\d|ave\.*,|avenue\.*,|street\.*,|st\.*,|road\.*,|rd\.*,/
  return {} if term.split(',').length >= 3 && term.downcase !~ street_like

  Geocoder.configure(:lookup => :mapquest,:api_key => self.mapquest_key,:timeout => self.timeout, :always_raise => :all)

  # Retry only the network call. Retrying the whole method reset the counter
  # on every pass (an endless loop) and re-standardized the term.
  attempts_left = 3
  begin
    mapquest_api_result = Geocoder.search(term)
  rescue SocketError
    attempts_left -= 1
    retry if attempts_left > 0
    return {}
  end

  return {} if mapquest_api_result.blank?

  best_result = mapquest_api_result.first.data

  #Only return if USA for now. Google is better with stuff like: 'Long Binh, Vietnam'
  country = Country.new(best_result["adminArea1"])
  return {} unless country && country.name == 'United States'

  if best_result["street"].present?
    latitude = best_result['latLng']['lat'].to_s
    longitude = best_result['latLng']['lng'].to_s

    #return_hash[:term_differs_from_tgn] = true
    return_hash[:street_part] = best_result["street"]
    return_hash[:coords] = {:latitude=>latitude, :longitude=>longitude, :combined=>latitude + ',' + longitude}
  end

  return_hash[:country_part] = country.name
  return_hash[:state_part] = Bplgeo::Constants::STATE_ABBR[best_result["adminArea3"]] || best_result["adminArea4"]

  #Return "New York City" as "New York"... (a missing city stays nil)
  city = best_result["adminArea5"]
  return_hash[:city_part] = city.nil? ? nil : city.gsub(' City', '')

  #Also only return if there is a city if there were more than two terms passed in. Fixes: Roxbury, MA
  return {} if term.split(',').length >= 2 && return_hash[:city_part].blank?

  return_hash
end
162
+
163
+ #Final fallback is google API. The best but we are limited to 2500 requests per day unless we pay the $10k a year premium account...
164
+ #Note: If google cannot find street, it will return just city/state, like for "Salem Street and Paradise Road, Swampscott, MA, 01907"
165
+ #Seems like it sets a partial_match=>true in the data section...
166
# Geocodes a term with the Google API (the final fallback geocoder).
#
# term            - the geographic string to look up.
# parse_term_flag - when true the term is first run through
#                   Bplgeo::Standardizer.parse_for_geographic_term.
#
# When Google flags the first answer as a partial match and the term has
# several comma parts and no obvious street word, the leading part is
# dropped and the lookup is repeated with the remainder.
#
# Returns an empty Hash when the standardized term is blank or is
# 'Soviet Union'. Otherwise returns a Hash with :original_term and
# :standardized_term plus whatever could be read from the best result:
# :country_part, :state_part, :city_part, :neighborhood_part, :street_part
# (a placeholder string), :coords and :term_differs_from_tgn. A lookup that
# keeps failing with SocketError is treated like a lookup with no results.
def self.parse_google_api(term, parse_term_flag=false)
  return_hash = {}
  return_hash[:original_term] = term

  term = Bplgeo::Standardizer.parse_for_geographic_term(term) if parse_term_flag
  term = Bplgeo::Standardizer.standardize_geographic_term(term)

  #Soviet Union returns back a place in Kazakhstan
  return {} if term.blank? || term == 'Soviet Union'

  return_hash[:standardized_term] = term

  ::Geocoder.configure(:lookup => :google,:api_key => nil,:timeout => self.timeout, :always_raise => :all)

  # Runs one lookup, retrying only the network call on SocketError (at most
  # three attempts). Retrying the whole method reset the counter on every
  # pass, which looped forever. Yields [] when all attempts fail.
  search_with_retries = lambda do |query|
    attempts_left = 3
    begin
      ::Geocoder.search(query)
    rescue SocketError
      attempts_left -= 1
      retry if attempts_left > 0
      []
    end
  end

  google_api_result = search_with_retries.call(term)

  #Check if only a partial match. To avoid errors, strip out the first part and try again...
  #Need better way to check for street endings. See: http://pe.usps.gov/text/pub28/28apc_002.htm
  if google_api_result.present? && google_api_result.first.data['partial_match'] && term.split(',').length > 1
    street_words = ['street', 'st.', 'avenue', 'ave.', 'court', 'dr.']
    lowered_term = term.downcase
    if street_words.none? { |street_word| lowered_term.include?(street_word) }
      term = term.split(',').drop(1).join(',').strip
      google_api_result = search_with_retries.call(term)
    end
  end

  return return_hash if google_api_result.blank?

  best_result = google_api_result.first.data

  #Types: street_number, route, neighborhood, establishment, transit_station, bus_station
  #(Google spells the first one with an underscore; 'street number' never matched.)
  street_level_types = ['street_number', 'route', 'neighborhood', 'establishment', 'transit_station', 'bus_station']

  best_result["address_components"].each do |component|
    types = component['types']

    if !(types & street_level_types).empty?
      location = best_result['geometry']['location']
      latitude = location['lat'].to_s
      longitude = location['lng'].to_s

      #return_hash[:term_differs_from_tgn] = true
      #TODO: Not implemented for Google results right now.
      return_hash[:street_part] = 'TODO: Not Implemented for Google Results'
      return_hash[:coords] = {:latitude=>latitude, :longitude=>longitude, :combined=>latitude + ',' + longitude}
    elsif types.include?('country')
      return_hash[:country_part] = component['long_name']
    elsif types.include?('administrative_area_level_1')
      return_hash[:state_part] = component['long_name'].to_ascii
    elsif types.include?('locality')
      return_hash[:city_part] = component['long_name']
    elsif (types & ['sublocality', 'political']).length == 2
      return_hash[:neighborhood_part] = component['long_name']
    end
  end

  partial_match = best_result['partial_match']
  return_hash[:term_differs_from_tgn] ||= partial_match unless partial_match.blank?

  return_hash
end
226
+ end
227
+ end
@@ -0,0 +1,213 @@
1
+ module Bplgeo
2
+ class Standardizer
3
+
4
+ #Take a subject string and look for potential geographic terms.
5
# Scans a subject string for the piece that looks like a US place.
#
# A piece "looks geographic" when it contains a US state name, or one of the
# abbreviation markers ('Mass', or a state code preceded by a space). State
# names and codes come from Country.new('US').states.
#
# term - the subject string. Strings longer than 125 characters are treated
#        as junk and yield ''.
#
# Returns:
#   * for '--' separated terms: the segments from a state-name segment to the
#     end, reversed and joined with ',', or the bare segment when it only
#     carries an abbreviation marker (the last qualifying segment wins);
#   * for ' - ' separated terms: the last segment that looks geographic;
#   * otherwise the term itself when it looks geographic;
#   * '' when nothing qualifies.
def self.parse_for_geographic_term(term)
  #Likely too long to be an address... some fields have junk with an address string...
  return '' if term.length > 125

  state_abbr_list = ['Mass']
  state_name_list = []

  #Countries gem of https://github.com/hexorx/countries
  Country.new('US').states.each do |abbreviation, details|
    state_abbr_list << ' ' + abbreviation
    state_name_list << details["name"]
  end

  names_a_state = lambda { |text| state_name_list.any? { |state_name| text.include? state_name } }
  abbreviates_a_state = lambda { |text| state_abbr_list.any? { |marker| text.include? marker } }

  geo_term = ''

  if term.include?('--')
    #Parsing a subject geographic term.
    segments = term.split('--')
    segments.each_with_index do |segment, position|
      if names_a_state.call(segment)
        geo_term = segments[position..-1].reverse.join(',')
      elsif abbreviates_a_state.call(segment)
        geo_term = segment
      end
    end
  elsif term.include?(' - ')
    #Experimental... example: Palmer (Mass) - history or Stores (retail trade) - Palmer, Mass
    term.split(' - ').each do |segment|
      geo_term = segment if names_a_state.call(segment) || abbreviates_a_state.call(segment)
    end
  elsif names_a_state.call(term) || abbreviates_a_state.call(term)
    geo_term = term
  end

  geo_term
end
48
+
49
+ #Make a string in a standard format.
50
# Rewrites a geographic string into the form the geocoders cope with best.
#
# Removes every Bplgeo::Constants::JUNK_TERMS entry, strips leading periods
# and commas plus surrounding whitespace, turns semicolons into commas and
# rewrites a trailing " (qualifier)" as ", qualifier".
#
# geo_term - the string to standardize. It is never modified, and frozen
#            strings are accepted.
#
# Returns the standardized String, or nil when geo_term is nil or contains
# parentheses that are not in the " (qualifier)" form.
def self.standardize_geographic_term(geo_term)
  return nil if geo_term.nil?

  # dup rather than clone: clone keeps the frozen flag, which made the
  # in-place gsub! below raise on frozen input.
  geo_term = geo_term.dup #Don't change original

  #Remove common junk terms
  Bplgeo::Constants::JUNK_TERMS.each { |term| geo_term.gsub!(term, '') }

  #Strip any leading periods or commas from junk terms
  geo_term = geo_term.gsub(/^[\.,]+/, '').strip

  #Replace any semicolons with commas... possible strip them?
  geo_term = geo_term.gsub(';', ',')

  #Terms in parentheses will cause some geographic parsers to freak out. Switch to commas instead.
  if geo_term.match(/[\(\)]+/)
    #Skip this as data returned likely will be unreliable for now... FIXME when use case occurs.
    return nil unless geo_term.match(/ \(+.*\)+/)

    #Attempt to fix address if something like (word)
    #Make this replacement better?
    geo_term = geo_term.gsub(/ *\((?=[\S ]+\))/,', ')
    geo_term = geo_term.gsub(')', '')
  end

  geo_term
end
79
+
80
+ #Attempt to dedup a list of geographic areas.
81
+ #FIXME: Horrendous first pass.
82
+ #Aggresive flag removes less specific matches. IE. ['Hanoi, Vietnam' and 'Vietnam'] would return just ['Hanoi, Vietnam']
83
# Removes duplicate entries from a list of geographic strings.
#
# Terms are compared by their bare words (punctuation and
# Bplgeo::Constants::JUNK_TERMS removed). Two terms with the same number of
# words, where every word of the earlier one occurs in the later one, are
# duplicates. Of a duplicate pair the kept term is the one with more comma
# separated parts, or the clearly longer one when it has no parenthesis.
#
# geo_list  - Array of Strings; it is not modified.
# aggresive - when true, a term whose words are all contained in another
#             term is also treated as a duplicate of it, so
#             ['Hanoi, Vietnam', 'Vietnam'] becomes ['Hanoi, Vietnam'].
#             (The earlier word-count test could not succeed for terms
#             without repeated words, so this flag had no effect on the
#             documented example.)
#
# Returns a new Array with the surviving terms in their original order.
def self.dedup_geo(geo_list, aggresive=false)
  geo_list = geo_list.clone

  # Reduce each term to its list of bare words.
  word_lists = geo_list.map do |geo_term|
    bare_term = geo_term.gsub('(','').gsub(')','').gsub('.','').gsub(',','').gsub(';','')
    #Remove common junk terms
    Bplgeo::Constants::JUNK_TERMS.each { |term| bare_term.gsub!(term, '') }
    bare_term.squish.split(' ')
  end

  indexes_to_remove = []

  word_lists.each_with_index do |words, index|
    current_best_term = geo_list[index]
    current_best_term_index = index

    ((index + 1)...word_lists.size).each do |other_index|
      other_words = word_lists[other_index]

      # How many words of this term also occur in the later term.
      matched_count = words.count { |word| other_words.include?(word) }
      next if matched_count.zero?

      same_words = matched_count == words.size && other_words.size == words.size
      one_contains_other = (words - other_words).empty? || (other_words - words).empty?
      next unless same_words || (aggresive && one_contains_other)

      other_term = geo_list[other_index]
      if current_best_term.split(',').size < other_term.split(',').size || (current_best_term.size+1 < other_term.size && !other_term.include?('('))
        # The later term is the better spelling: drop the current best.
        indexes_to_remove << current_best_term_index
        current_best_term = other_term
        current_best_term_index = other_index
      else
        indexes_to_remove << other_index
      end
    end
  end

  indexes_to_remove.each { |removal_index| geo_list[removal_index] = nil }

  geo_list.compact
end
135
+
136
# Decides whether the original string carries more detail than the parsed
# parts, i.e. whether it is worth keeping alongside them.
#
# geo_hash - Hash with :standardized_term and any of :street_part, :coords,
#            :neighborhood_part, :city_part, :state_part, :country_part.
#
# Returns true when:
#   * a street part or coordinates are present;
#   * the term has 3+ comma parts and no neighborhood, 2+ comma parts and no
#     city, 4+ comma parts, or contains a digit;
#   * the country is not 'United States', a city or state was parsed, and
#     neither of them appears in the term (compared as downcased ASCII).
# Returns false otherwise.
def self.parsed_and_original_check(geo_hash)
  term = geo_hash[:standardized_term]

  return true if geo_hash[:street_part].present? || geo_hash[:coords].present?

  #Keep original string if three parts at least or if there is a number in the term.
  #TODO: Make this better!
  part_count = term.split(',').length
  return true if part_count >= 3 && geo_hash[:neighborhood_part].blank?
  return true if part_count >= 2 && geo_hash[:city_part].blank?
  return true if part_count >= 4 || term.match(/\d/).present?

  return false if geo_hash[:country_part] == 'United States'

  #Nothing to compare against: currently do nothing
  return false if geo_hash[:city_part].blank? && geo_hash[:state_part].blank?

  searchable_term = term.to_ascii.downcase
  appears_in_term = lambda do |part|
    part.present? && searchable_term.include?(part.to_ascii.downcase)
  end

  !(appears_in_term.call(geo_hash[:city_part]) || appears_in_term.call(geo_hash[:state_part]))
end
160
+
161
+
162
+
163
+ #Take LCSH subjects and make them standard.
164
# Normalizes an LCSH-style subject string.
#
# Drops a trailing period (unless it follows a capital letter, as in an
# initial, or ends 'etc.'), turns '- -', em dashes, en dashes and ' - ' into
# '--', removes whitespace around '--', capitalizes the first character and
# finally passes the result through strip_value.
#
# value - the subject String; it is not modified.
#
# Returns the normalized String, or nil when value is nil or only
# whitespace.
def self.LCSHize(value)
  return nil if value.nil? || value.strip.empty?

  #Remove ending periods ... except when an initial or etc.
  #(Length is checked first: short strings used to crash here.)
  if value.length > 1 && value.end_with?('.') && value[-2] =~ /[^A-Z]/ && !value.end_with?('etc.')
    value = value[0..-2]
  end

  #Fix when '- -' occurs
  value = value.gsub(/-\s-/,'--')

  #Fix for "em" dashes
  value = value.gsub('—','--')

  #Fix for "en" dashes
  value = value.gsub('–','--')

  #Fix for ' - ' combinations
  value = value.gsub(' - ','--')

  #Remove white space after and before '--'
  value = value.gsub(/\s+--/,'--')
  value = value.gsub(/--\s+/,'--')

  #Ensure first word is capitalized
  value = value[0].capitalize[0] + value[1..-1]

  #Strip any white space
  strip_value(value)
end
194
+
195
# Cleans a raw field value into a plain string.
#
# value - any object. Floats and Integers are reduced to their integer
#         digits (3.0 becomes "3") before cleaning.
#
# Returns nil when value is blank, otherwise the result of utf8Encode.
def self.strip_value(value)
  return nil if value.blank?

  # Integer replaces Fixnum, which no longer exists on Ruby 3.2+ and made
  # this check raise NameError for every non-Float value.
  value = value.to_i.to_s if value.is_a?(Float) || value.is_a?(Integer)

  # Make sure it is all UTF-8 and not character encodings or HTML tags and remove any carriage returns
  utf8Encode(value)
end
207
+
208
+ #TODO: Better name for this. Should be part of an overall helped gem.
209
# Turns a value into plain, single-line text.
#
# The value is converted with to_s; tabs (with any preceding line break),
# line breaks and <br/> tags are each replaced by a space; the result is run
# through ActionView's full sanitizer, HTML entities are decoded and
# surrounding whitespace is stripped.
#
# Returns the cleaned String.
def self.utf8Encode(value)
  single_line = value.to_s
  single_line = single_line.gsub(/\r?\n?\t/, ' ')
  single_line = single_line.gsub(/\r?\n/, ' ')
  single_line = single_line.gsub(/<br[\s]*\/>/,' ')

  without_markup = ActionView::Base.full_sanitizer.sanitize(single_line)
  HTMLEntities.new.decode(without_markup).strip
end
212
+ end
213
+ end