geomash 0.2.1 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/config/geomash.yml.sample +3 -0
- data/lib/geomash/autoexpire_cache_dalli.rb +55 -0
- data/lib/geomash/autoexpire_cache_redis.rb +26 -0
- data/lib/geomash/constants.rb +6 -1
- data/lib/geomash/geonames.rb +19 -14
- data/lib/geomash/parser.rb +48 -18
- data/lib/geomash/standardizer.rb +46 -17
- data/lib/geomash/tgn.rb +274 -217
- data/lib/geomash/version.rb +1 -1
- data/lib/geomash.rb +8 -18
- data/test/geomash_test.rb +58 -4
- data/test/geonames_test.rb +1 -1
- data/test/standardizer_test.rb +37 -0
- metadata +15 -14
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b181c5b1f79695f4b8ba54cf5e641c9cdaefe842
|
4
|
+
data.tar.gz: f6db8e134bf152b162765a86b688848f16d0753d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0a5e74f73c036f36d0d030c2edc57fd2b026add354ee845e6f5bf6ab847f81d62dda8afe08add796b63ae0ee2b87446471a1ca083c60114d7401bc88e8476989
|
7
|
+
data.tar.gz: c0af8abaa745856c0e416f461e230574c869f5736f3c4aa8cb416dac102fe301eac669c1a308ff91d748d3e2be78511e3c0fb96fd8d0a7851e944573e0cd2506
|
data/config/geomash.yml.sample
CHANGED
@@ -6,6 +6,7 @@ development:
|
|
6
6
|
#NOTE: Bing also fairly unreliable but mostly works...
|
7
7
|
bing_key: <bing_key>
|
8
8
|
timeout: 7
|
9
|
+
parser_cache_enabled: false #See Readme before enabling this
|
9
10
|
test: &TEST_
|
10
11
|
tgn_enabled: true
|
11
12
|
geonames_username: <username>
|
@@ -14,6 +15,7 @@ test: &TEST_
|
|
14
15
|
#NOTE: Bing also fairly unreliable but mostly works...
|
15
16
|
bing_key: <bing_key>
|
16
17
|
timeout: 7
|
18
|
+
parser_cache_enabled: false #See Readme before enabling this
|
17
19
|
production:
|
18
20
|
tgn_enabled: true
|
19
21
|
geonames_username: <username>
|
@@ -22,3 +24,4 @@ production:
|
|
22
24
|
#NOTE: Bing also fairly unreliable but mostly works...
|
23
25
|
bing_key: <bing_key>
|
24
26
|
timeout: 7
|
27
|
+
parser_cache_enabled: false #See Readme before enabling this
|
@@ -0,0 +1,55 @@
|
|
1
|
+
#Taken from: https://github.com/alexreisner/geocoder/blob/master/examples/autoexpire_cache_dalli.rb
|
2
|
+
require 'yaml'

module Geomash
  # Hash-like cache wrapper around a memcached client (written against the
  # Dalli client API: get / set / add / replace / delete).
  #
  # Values are stored as YAML strings and written with the configured TTL.
  # Because memcached cannot enumerate its keys, the set of keys written
  # through this object is tracked in a separate index entry stored under
  # 'GeocoderDalliClientKeys'.
  class AutoexpireCacheDalli
    # @param store [Object] client responding to get, set, add, replace, delete
    # @param ttl [Integer] expiry handed to the store on every value write
    #   (default 86400; presumably seconds, i.e. one day -- confirm against
    #   the store client in use)
    def initialize(store, ttl = 86400)
      @store = store
      @keys = 'GeocoderDalliClientKeys'
      @ttl = ttl
    end

    # Reads and deserializes the cached value for +url+.
    #
    # @return [Object, nil] the deserialized value; a missing or blank entry
    #   is returned as-is (normally nil)
    def [](url)
      raw = @store.get(url)
      # Only non-blank strings can hold a YAML document; hand anything else
      # back untouched rather than feeding it to the YAML parser.
      return raw unless raw.is_a?(String) && !raw.strip.empty?

      # NOTE(review): this deserializes whatever is in the store. If the
      # cache can ever contain untrusted data, switch to YAML.safe_load.
      YAML.load(raw)
    end

    # Stores +value+ under +url+; assigning nil removes the entry.
    #
    # Uses an unconditional write so that re-assigning an existing key
    # really replaces the cached value (an add-only write would silently
    # keep the stale one).
    #
    # @return [Object] the value that was passed in
    def []=(url, value)
      if value.nil?
        del(url)
      else
        key_cache_add(url) if @store.set(url, YAML.dump(value), @ttl)
      end
      value
    end

    # @return [Array] the keys recorded in the key index
    def keys
      key_cache
    end

    # Deletes +url+ from the store and, when the store reports a successful
    # delete, from the key index as well.
    def del(url)
      key_cache_delete(url) if @store.delete(url)
    end

    private

    # Loads the key index, creating an empty one in the store on first use.
    def key_cache
      serialized = @store.get(@keys)
      return YAML.load(serialized) unless serialized.nil?

      @store.add(@keys, YAML.dump([]))
      []
    end

    # Records +key+ in the index. A key that is already listed is skipped so
    # the index does not collect duplicates when an entry expires in the
    # store and is later written again.
    def key_cache_add(key)
      known = key_cache
      return if known.include?(key)

      @store.replace(@keys, YAML.dump(known << key))
    end

    # Removes +key+ from the index and writes the index back.
    def key_cache_delete(key)
      remaining = key_cache
      remaining.delete(key)
      @store.replace(@keys, YAML.dump(remaining))
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
#Taken from: https://github.com/alexreisner/geocoder/blob/master/examples/autoexpire_cache_redis.rb
|
2
|
+
module Geomash
  # Hash-like cache wrapper around a Redis-style client that gives every
  # written key an expiry.
  class AutoexpireCacheRedis
    # @param store [Object] client responding to [], []=, expire, keys, del
    #   (and optionally setex)
    # @param ttl [Integer] expiry applied to every written key (default
    #   86400; presumably seconds, i.e. one day -- confirm against the
    #   store client in use)
    def initialize(store, ttl = 86400)
      @store = store
      @ttl = ttl
    end

    # @return [Object, nil] whatever the store holds for +url+
    def [](url)
      @store[url]
    end

    # Writes +value+ under +url+ with the configured TTL.
    #
    # When the store offers +setex+, the value and its expiry are written in
    # one call, so a failure between "set" and "expire" can no longer leave
    # a key behind that never expires. Stores without +setex+ keep the
    # two-step write.
    def []=(url, value)
      if @store.respond_to?(:setex)
        @store.setex(url, @ttl, value)
      else
        @store[url] = value
        @store.expire(url, @ttl)
      end
    end

    # @return [Array] every key the store reports (not limited to keys
    #   written through this object)
    def keys
      @store.keys
    end

    # Removes +url+ from the store.
    def del(url)
      @store.del(url)
    end
  end
end
|
data/lib/geomash/constants.rb
CHANGED
@@ -72,6 +72,7 @@ module Geomash
|
|
72
72
|
}
|
73
73
|
|
74
74
|
#Terms that drive geographic parsers mad...
|
75
|
+
#Possibly check using QA against LCSH Subject topics that aren't geographical...
|
75
76
|
JUNK_TERMS = [
|
76
77
|
'Cranberries',
|
77
78
|
'History',
|
@@ -80,7 +81,11 @@ module Geomash
|
|
80
81
|
'Pictorial works.',
|
81
82
|
/[nN]ation/,
|
82
83
|
'Asia',
|
83
|
-
'(Republic)'
|
84
|
+
'(Republic)',
|
85
|
+
'Directories',
|
86
|
+
'Biography',
|
87
|
+
#Some date removal
|
88
|
+
/[\d]+th [cC]entury,/
|
84
89
|
]
|
85
90
|
|
86
91
|
|
data/lib/geomash/geonames.rb
CHANGED
@@ -89,22 +89,27 @@ module Geomash
|
|
89
89
|
end
|
90
90
|
retry_count = retry_count + 1
|
91
91
|
|
92
|
-
|
92
|
+
country_code = nil
|
93
|
+
if geo_hash[:country_part] == 'South Korea'
|
94
|
+
country_code = 'KR'
|
95
|
+
elsif geo_hash[:country_part] == 'North Korea'
|
96
|
+
country_code = 'KP'
|
97
|
+
else
|
98
|
+
country_code = Country.find_country_by_name(geo_hash[:country_part]).alpha2
|
99
|
+
end
|
100
|
+
geonames_response = Typhoeus::Request.get("http://api.geonames.org/search?username=#{self.geonames_username}&lang=en&style=FULL&q=#{CGI.escape(geonames_search_string)}&name_equals=#{CGI.escape(exact_name_term)}&country=#{country_code}")
|
93
101
|
|
94
102
|
end until (geonames_response.code != 500 || retry_count == max_retry)
|
95
103
|
|
96
104
|
unless geonames_response.code == 500
|
97
105
|
|
98
|
-
parsed_xml = Nokogiri::
|
106
|
+
parsed_xml = Nokogiri::XML(geonames_response.body)
|
99
107
|
|
100
|
-
|
101
|
-
raise "geonames status error message of: #{parsed_xml.to_s}" if parsed_xml.geonames.status
|
102
|
-
rescue
|
103
|
-
#Do nothing but FIXME to not use slop
|
104
|
-
end
|
108
|
+
raise "geonames status error message of: #{parsed_xml.to_s}" if parsed_xml.xpath("//status").present?
|
105
109
|
|
106
110
|
#This is ugly and needs to be redone to achieve better recursive...
|
107
|
-
|
111
|
+
current_count = parsed_xml.xpath("//totalResultsCount")
|
112
|
+
if current_count.blank? || current_count.first.text == '0'
|
108
113
|
if geo_hash[:neighborhood_part].present?
|
109
114
|
geo_hash_temp = geo_hash.clone
|
110
115
|
geo_hash_temp[:neighborhood_part] = nil
|
@@ -121,14 +126,14 @@ module Geomash
|
|
121
126
|
end
|
122
127
|
|
123
128
|
#Exact Match ... FIXME to not use Slop
|
124
|
-
if parsed_xml.geonames
|
125
|
-
return_hash[:id] = parsed_xml.geonames
|
126
|
-
return_hash[:rdf] = "http://sws.geonames.org/#{return_hash[:id]}/about.rdf"
|
127
|
-
elsif parsed_xml.geonames.geoname.class ==Nokogiri::XML::NodeSet
|
128
|
-
return_hash[:id] = parsed_xml.geonames.geoname.first.geonameId.text
|
129
|
+
if parsed_xml.xpath("//geonames/geoname/geonameId").present?
|
130
|
+
return_hash[:id] = parsed_xml.xpath("//geonames/geoname/geonameId").first.text
|
129
131
|
return_hash[:rdf] = "http://sws.geonames.org/#{return_hash[:id]}/about.rdf"
|
132
|
+
return_hash[:original_string_differs] = Geomash::Standardizer.parsed_and_original_check(geo_hash)
|
133
|
+
return return_hash
|
134
|
+
else
|
135
|
+
return nil
|
130
136
|
end
|
131
|
-
return_hash[:original_string_differs] = Geomash::Standardizer.parsed_and_original_check(geo_hash)
|
132
137
|
|
133
138
|
end
|
134
139
|
|
data/lib/geomash/parser.rb
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
module Geomash
|
2
2
|
class Parser
|
3
3
|
|
4
|
+
# Whether geocoder result caching is switched on for the parser.
#
# Reads :parser_cache_enabled from Geomash.config and returns it unchanged
# when it is set; an unset (nil) value means caching is off.
def self.cache_enabled
  configured = Geomash.config[:parser_cache_enabled]
  configured.nil? ? false : configured
end
|
8
|
+
|
4
9
|
def self.mapquest_key
|
5
10
|
Geomash.config[:mapquest_key] || '<mapquest_key>'
|
6
11
|
end
|
@@ -44,12 +49,21 @@ module Geomash
|
|
44
49
|
return {}
|
45
50
|
end
|
46
51
|
|
47
|
-
Geocoder.configure(:lookup => :bing,:api_key => self.bing_key,:timeout => self.timeout, :always_raise => :all)
|
48
|
-
|
52
|
+
::Geocoder.configure(:lookup => :bing,:api_key => self.bing_key,:timeout => self.timeout, :always_raise => :all)
|
53
|
+
|
54
|
+
begin
|
55
|
+
bing_api_result = Geocoder.search(term)
|
56
|
+
rescue SocketError => e
|
57
|
+
Geocoder::Lookup.get(:bing).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
|
58
|
+
retry unless (retry_count -= 1).zero?
|
59
|
+
rescue Geocoder::OverQueryLimitError => e
|
60
|
+
Geocoder::Lookup.get(:bing).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
|
61
|
+
raise e
|
62
|
+
rescue Exception => e
|
63
|
+
Geocoder::Lookup.get(:bing).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
|
64
|
+
raise e
|
65
|
+
end
|
49
66
|
|
50
|
-
rescue SocketError => e
|
51
|
-
retry unless (retry_count -= 1).zero?
|
52
|
-
else
|
53
67
|
|
54
68
|
#Use only for United States results... international results are inaccurate.
|
55
69
|
if bing_api_result.present? && bing_api_result.first.data["address"]["countryRegion"] == 'United States'
|
@@ -113,13 +127,20 @@ module Geomash
|
|
113
127
|
return {}
|
114
128
|
end
|
115
129
|
|
116
|
-
Geocoder.configure(:lookup => :mapquest,:api_key => self.mapquest_key,:timeout => self.timeout, :always_raise => :all)
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
130
|
+
::Geocoder.configure(:lookup => :mapquest,:api_key => self.mapquest_key,:timeout => self.timeout, :always_raise => :all)
|
131
|
+
|
132
|
+
begin
|
133
|
+
mapquest_api_result = Geocoder.search(term)
|
134
|
+
rescue SocketError => e
|
135
|
+
Geocoder::Lookup.get(:mapquest).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
|
136
|
+
retry unless (retry_count -= 1).zero?
|
137
|
+
rescue Geocoder::OverQueryLimitError => e
|
138
|
+
Geocoder::Lookup.get(:mapquest).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
|
139
|
+
raise e
|
140
|
+
rescue Exception => e
|
141
|
+
Geocoder::Lookup.get(:mapquest).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
|
142
|
+
raise e
|
143
|
+
end
|
123
144
|
|
124
145
|
#If this call returned a result...
|
125
146
|
if mapquest_api_result.present?
|
@@ -173,11 +194,18 @@ module Geomash
|
|
173
194
|
return_hash[:standardized_term] = term
|
174
195
|
|
175
196
|
::Geocoder.configure(:lookup => :google,:api_key => nil,:timeout => self.timeout, :always_raise => :all)
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
197
|
+
begin
|
198
|
+
google_api_result = ::Geocoder.search(term)
|
199
|
+
rescue SocketError => e
|
200
|
+
Geocoder::Lookup.get(:google).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
|
201
|
+
retry unless (retry_count -= 1).zero?
|
202
|
+
rescue Geocoder::OverQueryLimitError => e
|
203
|
+
Geocoder::Lookup.get(:google).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
|
204
|
+
raise e
|
205
|
+
rescue Exception => e
|
206
|
+
Geocoder::Lookup.get(:google).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
|
207
|
+
raise e
|
208
|
+
end
|
181
209
|
|
182
210
|
|
183
211
|
#Check if only a partial match. To avoid errors, strip out the first part and try again...
|
@@ -202,7 +230,7 @@ module Geomash
|
|
202
230
|
elsif (result['types'] & ['country']).present?
|
203
231
|
return_hash[:country_part] = result['long_name']
|
204
232
|
elsif (result['types'] & ['administrative_area_level_1']).present?
|
205
|
-
return_hash[:state_part] = result['long_name'].to_ascii
|
233
|
+
return_hash[:state_part] = result['long_name'].to_ascii.gsub('-city', '')
|
206
234
|
elsif (result['types'] & ['locality']).present?
|
207
235
|
return_hash[:city_part] = result['long_name']
|
208
236
|
elsif (result['types'] & ['sublocality', 'political']).length == 2 || result['types'].include?('neighborhood')
|
@@ -213,6 +241,8 @@ module Geomash
|
|
213
241
|
return_hash[:term_differs_from_tgn] ||= google_api_result.first.data['partial_match'] unless google_api_result.first.data['partial_match'].blank?
|
214
242
|
end
|
215
243
|
|
244
|
+
#FIXME: Google free API rate limit is 5 requests / 1 second now (used to be 10). Need a better way to handle this.
|
245
|
+
sleep(0.1)
|
216
246
|
|
217
247
|
return return_hash
|
218
248
|
end
|
data/lib/geomash/standardizer.rb
CHANGED
@@ -11,6 +11,10 @@ module Geomash
|
|
11
11
|
return ''
|
12
12
|
end
|
13
13
|
|
14
|
+
term_split_list = term.split(/[,\-\(\(]/).reject{ |e| e.empty? }
|
15
|
+
term_split_list.each{ |e| e.gsub!(/[^\w\s]/, "") } #Remove punctuation
|
16
|
+
term_split_list.each{ |e| e.strip! } #Remove any extra remaining whitespace
|
17
|
+
term_split_list.reject{ |e| e.empty? }
|
14
18
|
state_abbr_list = ['Mass']
|
15
19
|
state_name_list = []
|
16
20
|
country_name_list = []
|
@@ -24,29 +28,51 @@ module Geomash
|
|
24
28
|
Country.all.each do |country_name_abbr_pair|
|
25
29
|
country_name_list << country_name_abbr_pair.first
|
26
30
|
end
|
31
|
+
country_name_list.append('South Korea') #Listed as Korea, Republic of in the gem
|
32
|
+
country_name_list.append('North Korea') #Listed as Korea, Democratic People's Republic of in the gem
|
27
33
|
|
28
34
|
#Parsing a subject geographic term.
|
29
|
-
if
|
30
|
-
term.
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
+
if (state_name_list & term_split_list).present? || (state_abbr_list & term_split_list).present? || (country_name_list & term_split_list).present?
|
36
|
+
if term.include?('--')
|
37
|
+
term.split('--').each_with_index do |split_term, index|
|
38
|
+
if state_name_list.any? { |state| split_term.include? state } || country_name_list.any? { |country| split_term.include? country }
|
39
|
+
#Cases like Naroden Etnografski Muzeĭ (Sofia, Bulgaria)--Catalogs
|
40
|
+
if split_term.match(/\([^\)]+\)/)
|
41
|
+
geo_term = split_term.gsub('(', ',').gsub(' ,', ', ')
|
42
|
+
geo_term = geo_term.gsub(')', '')
|
43
|
+
|
44
|
+
=begin
|
45
|
+
if split_term.match(/\([^\)]+,[^\)]+\)/)
|
46
|
+
geo_term = split_term.match(/\([^\)]+\)/).to_s
|
47
|
+
geo_term = geo_term[1..geo_term.length-2]
|
48
|
+
#Abbeville (France)--History--20th century.
|
49
|
+
elsif split_term.match(/\([^\)]+\)/)
|
50
|
+
geo_term = split_term
|
51
|
+
=end
|
52
|
+
else
|
53
|
+
geo_term = term.split('--')[index..term.split('--').length-1].reverse!.join(',')
|
54
|
+
end
|
55
|
+
|
56
|
+
elsif state_abbr_list.any? { |abbr| split_term.include? abbr }
|
57
|
+
geo_term = split_term
|
58
|
+
end
|
35
59
|
end
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
60
|
+
#Other than a '--' field
|
61
|
+
#Experimental... example: Palmer (Mass) - history or Stores (retail trade) - Palmer, Mass
|
62
|
+
elsif term.include?(' - ')
|
63
|
+
term.split(' - ').each do |split_term|
|
64
|
+
if state_name_list.any? { |state| split_term.include? state } || state_abbr_list.any? { |abbr| split_term.include? abbr } || country_name_list.any? { |country| split_term.include? country }
|
65
|
+
geo_term = split_term
|
66
|
+
end
|
67
|
+
|
43
68
|
end
|
69
|
+
else
|
70
|
+
#if term_split_list.length > 1
|
71
|
+
geo_term = term.gsub('(', ',').gsub(' ,', ', ')
|
72
|
+
geo_term = geo_term.gsub(')', '')
|
73
|
+
#end
|
44
74
|
|
45
75
|
end
|
46
|
-
else
|
47
|
-
if state_name_list.any? { |state| term.include? state } || state_abbr_list.any? { |abbr| term.include? abbr } || country_name_list.any? { |country| term.include? country }
|
48
|
-
geo_term = term
|
49
|
-
end
|
50
76
|
end
|
51
77
|
|
52
78
|
return geo_term
|
@@ -83,6 +109,8 @@ module Geomash
|
|
83
109
|
end
|
84
110
|
end
|
85
111
|
|
112
|
+
geo_term = geo_term.squeeze(',')
|
113
|
+
|
86
114
|
return geo_term
|
87
115
|
end
|
88
116
|
|
@@ -224,6 +252,7 @@ module Geomash
|
|
224
252
|
|
225
253
|
def self.try_with_entered_names(geo_hash)
|
226
254
|
geo_hash_local = geo_hash.clone
|
255
|
+
geo_hash_local[:tgn] = nil
|
227
256
|
if geo_hash_local[:neighborhood_part].present?
|
228
257
|
orig_string_check = geo_hash_local[:standardized_term].gsub(',', ' ').squish.split(' ').select { |value| value.downcase.to_ascii == geo_hash_local[:neighborhood_part].downcase.to_ascii}
|
229
258
|
geo_hash_local[:neighborhood_part] = orig_string_check.first.strip if orig_string_check.present? && orig_string_check != geo_hash_local[:neighborhood_part]
|