geomash 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/config/geomash.yml.sample +3 -0
- data/lib/geomash/autoexpire_cache_dalli.rb +55 -0
- data/lib/geomash/autoexpire_cache_redis.rb +26 -0
- data/lib/geomash/constants.rb +6 -1
- data/lib/geomash/geonames.rb +19 -14
- data/lib/geomash/parser.rb +48 -18
- data/lib/geomash/standardizer.rb +46 -17
- data/lib/geomash/tgn.rb +274 -217
- data/lib/geomash/version.rb +1 -1
- data/lib/geomash.rb +8 -18
- data/test/geomash_test.rb +58 -4
- data/test/geonames_test.rb +1 -1
- data/test/standardizer_test.rb +37 -0
- metadata +15 -14
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b181c5b1f79695f4b8ba54cf5e641c9cdaefe842
|
4
|
+
data.tar.gz: f6db8e134bf152b162765a86b688848f16d0753d
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0a5e74f73c036f36d0d030c2edc57fd2b026add354ee845e6f5bf6ab847f81d62dda8afe08add796b63ae0ee2b87446471a1ca083c60114d7401bc88e8476989
|
7
|
+
data.tar.gz: c0af8abaa745856c0e416f461e230574c869f5736f3c4aa8cb416dac102fe301eac669c1a308ff91d748d3e2be78511e3c0fb96fd8d0a7851e944573e0cd2506
|
data/config/geomash.yml.sample
CHANGED
@@ -6,6 +6,7 @@ development:
|
|
6
6
|
#NOTE: Bing also fairly unreliable but mostly works...
|
7
7
|
bing_key: <bing_key>
|
8
8
|
timeout: 7
|
9
|
+
parser_cache_enabled: false #See Readme before enabling this
|
9
10
|
test: &TEST_
|
10
11
|
tgn_enabled: true
|
11
12
|
geonames_username: <username>
|
@@ -14,6 +15,7 @@ test: &TEST_
|
|
14
15
|
#NOTE: Bing also fairly unreliable but mostly works...
|
15
16
|
bing_key: <bing_key>
|
16
17
|
timeout: 7
|
18
|
+
parser_cache_enabled: false #See Readme before enabling this
|
17
19
|
production:
|
18
20
|
tgn_enabled: true
|
19
21
|
geonames_username: <username>
|
@@ -22,3 +24,4 @@ production:
|
|
22
24
|
#NOTE: Bing also fairly unreliable but mostly works...
|
23
25
|
bing_key: <bing_key>
|
24
26
|
timeout: 7
|
27
|
+
parser_cache_enabled: false #See Readme before enabling this
|
@@ -0,0 +1,55 @@
|
|
1
|
+
#Taken from: https://github.com/alexreisner/geocoder/blob/master/examples/autoexpire_cache_dalli.rb
|
2
|
+
module Geomash
|
3
|
+
class AutoexpireCacheDalli
|
4
|
+
def initialize(store, ttl = 86400)
|
5
|
+
@store = store
|
6
|
+
@keys = 'GeocoderDalliClientKeys'
|
7
|
+
@ttl = ttl
|
8
|
+
end
|
9
|
+
|
10
|
+
def [](url)
|
11
|
+
res = @store.get(url)
|
12
|
+
res = YAML::load(res) if res.present?
|
13
|
+
res
|
14
|
+
end
|
15
|
+
|
16
|
+
def []=(url, value)
|
17
|
+
if value.nil?
|
18
|
+
del(url)
|
19
|
+
else
|
20
|
+
key_cache_add(url) if @store.add(url, YAML::dump(value), @ttl)
|
21
|
+
end
|
22
|
+
value
|
23
|
+
end
|
24
|
+
|
25
|
+
def keys
|
26
|
+
key_cache
|
27
|
+
end
|
28
|
+
|
29
|
+
def del(url)
|
30
|
+
key_cache_delete(url) if @store.delete(url)
|
31
|
+
end
|
32
|
+
|
33
|
+
private
|
34
|
+
|
35
|
+
def key_cache
|
36
|
+
the_keys = @store.get(@keys)
|
37
|
+
if the_keys.nil?
|
38
|
+
@store.add(@keys, YAML::dump([]))
|
39
|
+
[]
|
40
|
+
else
|
41
|
+
YAML::load(the_keys)
|
42
|
+
end
|
43
|
+
end
|
44
|
+
|
45
|
+
def key_cache_add(key)
|
46
|
+
@store.replace(@keys, YAML::dump(key_cache << key))
|
47
|
+
end
|
48
|
+
|
49
|
+
def key_cache_delete(key)
|
50
|
+
tmp = key_cache
|
51
|
+
tmp.delete(key)
|
52
|
+
@store.replace(@keys, YAML::dump(tmp))
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
#Taken from: https://github.com/alexreisner/geocoder/blob/master/examples/autoexpire_cache_redis.rb
|
2
|
+
module Geomash
|
3
|
+
class AutoexpireCacheRedis
|
4
|
+
def initialize(store, ttl = 86400)
|
5
|
+
@store = store
|
6
|
+
@ttl = ttl
|
7
|
+
end
|
8
|
+
|
9
|
+
def [](url)
|
10
|
+
@store.[](url)
|
11
|
+
end
|
12
|
+
|
13
|
+
def []=(url, value)
|
14
|
+
@store.[]=(url, value)
|
15
|
+
@store.expire(url, @ttl)
|
16
|
+
end
|
17
|
+
|
18
|
+
def keys
|
19
|
+
@store.keys
|
20
|
+
end
|
21
|
+
|
22
|
+
def del(url)
|
23
|
+
@store.del(url)
|
24
|
+
end
|
25
|
+
end
|
26
|
+
end
|
data/lib/geomash/constants.rb
CHANGED
@@ -72,6 +72,7 @@ module Geomash
|
|
72
72
|
}
|
73
73
|
|
74
74
|
#Terms that drive geographic parsers mad...
|
75
|
+
#Possibly check using QA against LCSH Subject topics that aren't geographical...
|
75
76
|
JUNK_TERMS = [
|
76
77
|
'Cranberries',
|
77
78
|
'History',
|
@@ -80,7 +81,11 @@ module Geomash
|
|
80
81
|
'Pictorial works.',
|
81
82
|
/[nN]ation/,
|
82
83
|
'Asia',
|
83
|
-
'(Republic)'
|
84
|
+
'(Republic)',
|
85
|
+
'Directories',
|
86
|
+
'Biography',
|
87
|
+
#Some date removal
|
88
|
+
/[\d]+th [cC]entury,/
|
84
89
|
]
|
85
90
|
|
86
91
|
|
data/lib/geomash/geonames.rb
CHANGED
@@ -89,22 +89,27 @@ module Geomash
|
|
89
89
|
end
|
90
90
|
retry_count = retry_count + 1
|
91
91
|
|
92
|
-
|
92
|
+
country_code = nil
|
93
|
+
if geo_hash[:country_part] == 'South Korea'
|
94
|
+
country_code = 'KR'
|
95
|
+
elsif geo_hash[:country_part] == 'North Korea'
|
96
|
+
country_code = 'KP'
|
97
|
+
else
|
98
|
+
country_code = Country.find_country_by_name(geo_hash[:country_part]).alpha2
|
99
|
+
end
|
100
|
+
geonames_response = Typhoeus::Request.get("http://api.geonames.org/search?username=#{self.geonames_username}&lang=en&style=FULL&q=#{CGI.escape(geonames_search_string)}&name_equals=#{CGI.escape(exact_name_term)}&country=#{country_code}")
|
93
101
|
|
94
102
|
end until (geonames_response.code != 500 || retry_count == max_retry)
|
95
103
|
|
96
104
|
unless geonames_response.code == 500
|
97
105
|
|
98
|
-
parsed_xml = Nokogiri::
|
106
|
+
parsed_xml = Nokogiri::XML(geonames_response.body)
|
99
107
|
|
100
|
-
|
101
|
-
raise "geonames status error message of: #{parsed_xml.to_s}" if parsed_xml.geonames.status
|
102
|
-
rescue
|
103
|
-
#Do nothing but FIXME to not use slop
|
104
|
-
end
|
108
|
+
raise "geonames status error message of: #{parsed_xml.to_s}" if parsed_xml.xpath("//status").present?
|
105
109
|
|
106
110
|
#This is ugly and needs to be redone to achieve better recursive...
|
107
|
-
|
111
|
+
current_count = parsed_xml.xpath("//totalResultsCount")
|
112
|
+
if current_count.blank? || current_count.first.text == '0'
|
108
113
|
if geo_hash[:neighborhood_part].present?
|
109
114
|
geo_hash_temp = geo_hash.clone
|
110
115
|
geo_hash_temp[:neighborhood_part] = nil
|
@@ -121,14 +126,14 @@ module Geomash
|
|
121
126
|
end
|
122
127
|
|
123
128
|
#Exact Match ... FIXME to not use Slop
|
124
|
-
if parsed_xml.geonames
|
125
|
-
return_hash[:id] = parsed_xml.geonames
|
126
|
-
return_hash[:rdf] = "http://sws.geonames.org/#{return_hash[:id]}/about.rdf"
|
127
|
-
elsif parsed_xml.geonames.geoname.class ==Nokogiri::XML::NodeSet
|
128
|
-
return_hash[:id] = parsed_xml.geonames.geoname.first.geonameId.text
|
129
|
+
if parsed_xml.xpath("//geonames/geoname/geonameId").present?
|
130
|
+
return_hash[:id] = parsed_xml.xpath("//geonames/geoname/geonameId").first.text
|
129
131
|
return_hash[:rdf] = "http://sws.geonames.org/#{return_hash[:id]}/about.rdf"
|
132
|
+
return_hash[:original_string_differs] = Geomash::Standardizer.parsed_and_original_check(geo_hash)
|
133
|
+
return return_hash
|
134
|
+
else
|
135
|
+
return nil
|
130
136
|
end
|
131
|
-
return_hash[:original_string_differs] = Geomash::Standardizer.parsed_and_original_check(geo_hash)
|
132
137
|
|
133
138
|
end
|
134
139
|
|
data/lib/geomash/parser.rb
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
module Geomash
|
2
2
|
class Parser
|
3
3
|
|
4
|
+
def self.cache_enabled
|
5
|
+
return Geomash.config[:parser_cache_enabled] unless Geomash.config[:parser_cache_enabled].nil?
|
6
|
+
return false
|
7
|
+
end
|
8
|
+
|
4
9
|
def self.mapquest_key
|
5
10
|
Geomash.config[:mapquest_key] || '<mapquest_key>'
|
6
11
|
end
|
@@ -44,12 +49,21 @@ module Geomash
|
|
44
49
|
return {}
|
45
50
|
end
|
46
51
|
|
47
|
-
Geocoder.configure(:lookup => :bing,:api_key => self.bing_key,:timeout => self.timeout, :always_raise => :all)
|
48
|
-
|
52
|
+
::Geocoder.configure(:lookup => :bing,:api_key => self.bing_key,:timeout => self.timeout, :always_raise => :all)
|
53
|
+
|
54
|
+
begin
|
55
|
+
bing_api_result = Geocoder.search(term)
|
56
|
+
rescue SocketError => e
|
57
|
+
Geocoder::Lookup.get(:bing).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
|
58
|
+
retry unless (retry_count -= 1).zero?
|
59
|
+
rescue Geocoder::OverQueryLimitError => e
|
60
|
+
Geocoder::Lookup.get(:bing).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
|
61
|
+
raise e
|
62
|
+
rescue Exception => e
|
63
|
+
Geocoder::Lookup.get(:bing).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
|
64
|
+
raise e
|
65
|
+
end
|
49
66
|
|
50
|
-
rescue SocketError => e
|
51
|
-
retry unless (retry_count -= 1).zero?
|
52
|
-
else
|
53
67
|
|
54
68
|
#Use only for United States results... international results are inaccurate.
|
55
69
|
if bing_api_result.present? && bing_api_result.first.data["address"]["countryRegion"] == 'United States'
|
@@ -113,13 +127,20 @@ module Geomash
|
|
113
127
|
return {}
|
114
128
|
end
|
115
129
|
|
116
|
-
Geocoder.configure(:lookup => :mapquest,:api_key => self.mapquest_key,:timeout => self.timeout, :always_raise => :all)
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
130
|
+
::Geocoder.configure(:lookup => :mapquest,:api_key => self.mapquest_key,:timeout => self.timeout, :always_raise => :all)
|
131
|
+
|
132
|
+
begin
|
133
|
+
mapquest_api_result = Geocoder.search(term)
|
134
|
+
rescue SocketError => e
|
135
|
+
Geocoder::Lookup.get(:mapquest).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
|
136
|
+
retry unless (retry_count -= 1).zero?
|
137
|
+
rescue Geocoder::OverQueryLimitError => e
|
138
|
+
Geocoder::Lookup.get(:mapquest).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
|
139
|
+
raise e
|
140
|
+
rescue Exception => e
|
141
|
+
Geocoder::Lookup.get(:mapquest).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
|
142
|
+
raise e
|
143
|
+
end
|
123
144
|
|
124
145
|
#If this call returned a result...
|
125
146
|
if mapquest_api_result.present?
|
@@ -173,11 +194,18 @@ module Geomash
|
|
173
194
|
return_hash[:standardized_term] = term
|
174
195
|
|
175
196
|
::Geocoder.configure(:lookup => :google,:api_key => nil,:timeout => self.timeout, :always_raise => :all)
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
197
|
+
begin
|
198
|
+
google_api_result = ::Geocoder.search(term)
|
199
|
+
rescue SocketError => e
|
200
|
+
Geocoder::Lookup.get(:google).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
|
201
|
+
retry unless (retry_count -= 1).zero?
|
202
|
+
rescue Geocoder::OverQueryLimitError => e
|
203
|
+
Geocoder::Lookup.get(:google).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
|
204
|
+
raise e
|
205
|
+
rescue Exception => e
|
206
|
+
Geocoder::Lookup.get(:google).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
|
207
|
+
raise e
|
208
|
+
end
|
181
209
|
|
182
210
|
|
183
211
|
#Check if only a partial match. To avoid errors, strip out the first part and try again...
|
@@ -202,7 +230,7 @@ module Geomash
|
|
202
230
|
elsif (result['types'] & ['country']).present?
|
203
231
|
return_hash[:country_part] = result['long_name']
|
204
232
|
elsif (result['types'] & ['administrative_area_level_1']).present?
|
205
|
-
return_hash[:state_part] = result['long_name'].to_ascii
|
233
|
+
return_hash[:state_part] = result['long_name'].to_ascii.gsub('-city', '')
|
206
234
|
elsif (result['types'] & ['locality']).present?
|
207
235
|
return_hash[:city_part] = result['long_name']
|
208
236
|
elsif (result['types'] & ['sublocality', 'political']).length == 2 || result['types'].include?('neighborhood')
|
@@ -213,6 +241,8 @@ module Geomash
|
|
213
241
|
return_hash[:term_differs_from_tgn] ||= google_api_result.first.data['partial_match'] unless google_api_result.first.data['partial_match'].blank?
|
214
242
|
end
|
215
243
|
|
244
|
+
#FIXME: Google free API rate limit is 5 requests / 1 second now (used to be 10). Need a better way to handle this.
|
245
|
+
sleep(0.1)
|
216
246
|
|
217
247
|
return return_hash
|
218
248
|
end
|
data/lib/geomash/standardizer.rb
CHANGED
@@ -11,6 +11,10 @@ module Geomash
|
|
11
11
|
return ''
|
12
12
|
end
|
13
13
|
|
14
|
+
term_split_list = term.split(/[,\-\(\(]/).reject{ |e| e.empty? }
|
15
|
+
term_split_list.each{ |e| e.gsub!(/[^\w\s]/, "") } #Remove punctuation
|
16
|
+
term_split_list.each{ |e| e.strip! } #Remove any extra remaining whitespace
|
17
|
+
term_split_list.reject{ |e| e.empty? }
|
14
18
|
state_abbr_list = ['Mass']
|
15
19
|
state_name_list = []
|
16
20
|
country_name_list = []
|
@@ -24,29 +28,51 @@ module Geomash
|
|
24
28
|
Country.all.each do |country_name_abbr_pair|
|
25
29
|
country_name_list << country_name_abbr_pair.first
|
26
30
|
end
|
31
|
+
country_name_list.append('South Korea') #Listed as Korea, Republic of in the gem
|
32
|
+
country_name_list.append('North Korea') #Listed as Korea, Democratic People's Republic Of of in the gem
|
27
33
|
|
28
34
|
#Parsing a subject geographic term.
|
29
|
-
if
|
30
|
-
term.
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
+
if (state_name_list & term_split_list).present? || (state_abbr_list & term_split_list).present? || (country_name_list & term_split_list).present?
|
36
|
+
if term.include?('--')
|
37
|
+
term.split('--').each_with_index do |split_term, index|
|
38
|
+
if state_name_list.any? { |state| split_term.include? state } || country_name_list.any? { |country| split_term.include? country }
|
39
|
+
#Cases like Naroden Etnografski Muzeĭ (Sofia, Bulgaria)--Catalogs
|
40
|
+
if split_term.match(/\([^\)]+\)/)
|
41
|
+
geo_term = split_term.gsub('(', ',').gsub(' ,', ', ')
|
42
|
+
geo_term = geo_term.gsub(')', '')
|
43
|
+
|
44
|
+
=begin
|
45
|
+
if split_term.match(/\([^\)]+,[^\)]+\)/)
|
46
|
+
geo_term = split_term.match(/\([^\)]+\)/).to_s
|
47
|
+
geo_term = geo_term[1..geo_term.length-2]
|
48
|
+
#Abbeville (France)--History--20th century.
|
49
|
+
elsif split_term.match(/\([^\)]+\)/)
|
50
|
+
geo_term = split_term
|
51
|
+
=end
|
52
|
+
else
|
53
|
+
geo_term = term.split('--')[index..term.split('--').length-1].reverse!.join(',')
|
54
|
+
end
|
55
|
+
|
56
|
+
elsif state_abbr_list.any? { |abbr| split_term.include? abbr }
|
57
|
+
geo_term = split_term
|
58
|
+
end
|
35
59
|
end
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
60
|
+
#Other than a '--' field
|
61
|
+
#Experimental... example: Palmer (Mass) - history or Stores (retail trade) - Palmer, Mass
|
62
|
+
elsif term.include?(' - ')
|
63
|
+
term.split(' - ').each do |split_term|
|
64
|
+
if state_name_list.any? { |state| split_term.include? state } || state_abbr_list.any? { |abbr| split_term.include? abbr } || country_name_list.any? { |country| split_term.include? country }
|
65
|
+
geo_term = split_term
|
66
|
+
end
|
67
|
+
|
43
68
|
end
|
69
|
+
else
|
70
|
+
#if term_split_list.length > 1
|
71
|
+
geo_term = term.gsub('(', ',').gsub(' ,', ', ')
|
72
|
+
geo_term = geo_term.gsub(')', '')
|
73
|
+
#end
|
44
74
|
|
45
75
|
end
|
46
|
-
else
|
47
|
-
if state_name_list.any? { |state| term.include? state } || state_abbr_list.any? { |abbr| term.include? abbr } || country_name_list.any? { |country| term.include? country }
|
48
|
-
geo_term = term
|
49
|
-
end
|
50
76
|
end
|
51
77
|
|
52
78
|
return geo_term
|
@@ -83,6 +109,8 @@ module Geomash
|
|
83
109
|
end
|
84
110
|
end
|
85
111
|
|
112
|
+
geo_term = geo_term.squeeze(',')
|
113
|
+
|
86
114
|
return geo_term
|
87
115
|
end
|
88
116
|
|
@@ -224,6 +252,7 @@ module Geomash
|
|
224
252
|
|
225
253
|
def self.try_with_entered_names(geo_hash)
|
226
254
|
geo_hash_local = geo_hash.clone
|
255
|
+
geo_hash_local[:tgn] = nil
|
227
256
|
if geo_hash_local[:neighborhood_part].present?
|
228
257
|
orig_string_check = geo_hash_local[:standardized_term].gsub(',', ' ').squish.split(' ').select { |value| value.downcase.to_ascii == geo_hash_local[:neighborhood_part].downcase.to_ascii}
|
229
258
|
geo_hash_local[:neighborhood_part] = orig_string_check.first.strip if orig_string_check.present? && orig_string_check != geo_hash_local[:neighborhood_part]
|