geomash 0.2.1 → 0.3.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
- metadata.gz: 9de5bd2cdc879dc6103df0df9d7b31f195c02e73
- data.tar.gz: 6bc1355c2a93d2886c1d77a00e705894b8720418
+ metadata.gz: b181c5b1f79695f4b8ba54cf5e641c9cdaefe842
+ data.tar.gz: f6db8e134bf152b162765a86b688848f16d0753d
  SHA512:
- metadata.gz: 6622f04afaae5560a36930441afe000b3b45cb53221b6f7fddf449b5b892bdcc59e35cf1088a50eb11b811edff662ecf43234ef1b348f48f1f6cfac18d4d5486
- data.tar.gz: 22d728689b333961bcc8e37628188797af5c1773732a6d6a5fff2120284ddf1b18fb5eac9e8fd3bfadbf648d97aac5cb827021b796cf2efb8f600cd406ff7d83
+ metadata.gz: 0a5e74f73c036f36d0d030c2edc57fd2b026add354ee845e6f5bf6ab847f81d62dda8afe08add796b63ae0ee2b87446471a1ca083c60114d7401bc88e8476989
+ data.tar.gz: c0af8abaa745856c0e416f461e230574c869f5736f3c4aa8cb416dac102fe301eac669c1a308ff91d748d3e2be78511e3c0fb96fd8d0a7851e944573e0cd2506
@@ -6,6 +6,7 @@ development:
  #NOTE: Bing also fairly unreliable but mostly works...
  bing_key: <bing_key>
  timeout: 7
+ parser_cache_enabled: false #See Readme before enabling this
  test: &TEST_
  tgn_enabled: true
  geonames_username: <username>
@@ -14,6 +15,7 @@ test: &TEST_
  #NOTE: Bing also fairly unreliable but mostly works...
  bing_key: <bing_key>
  timeout: 7
+ parser_cache_enabled: false #See Readme before enabling this
  production:
  tgn_enabled: true
  geonames_username: <username>
@@ -22,3 +24,4 @@ production:
  #NOTE: Bing also fairly unreliable but mostly works...
  bing_key: <bing_key>
  timeout: 7
+ parser_cache_enabled: false #See Readme before enabling this
@@ -0,0 +1,55 @@
+ #Taken from: https://github.com/alexreisner/geocoder/blob/master/examples/autoexpire_cache_dalli.rb
+ module Geomash
+   class AutoexpireCacheDalli
+     def initialize(store, ttl = 86400)
+       @store = store
+       @keys = 'GeocoderDalliClientKeys'
+       @ttl = ttl
+     end
+
+     def [](url)
+       res = @store.get(url)
+       res = YAML::load(res) if res.present?
+       res
+     end
+
+     def []=(url, value)
+       if value.nil?
+         del(url)
+       else
+         key_cache_add(url) if @store.add(url, YAML::dump(value), @ttl)
+       end
+       value
+     end
+
+     def keys
+       key_cache
+     end
+
+     def del(url)
+       key_cache_delete(url) if @store.delete(url)
+     end
+
+     private
+
+     def key_cache
+       the_keys = @store.get(@keys)
+       if the_keys.nil?
+         @store.add(@keys, YAML::dump([]))
+         []
+       else
+         YAML::load(the_keys)
+       end
+     end
+
+     def key_cache_add(key)
+       @store.replace(@keys, YAML::dump(key_cache << key))
+     end
+
+     def key_cache_delete(key)
+       tmp = key_cache
+       tmp.delete(key)
+       @store.replace(@keys, YAML::dump(tmp))
+     end
+   end
+ end
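
The Dalli-backed cache above follows the linked geocoder example. A minimal wiring sketch, assuming the dalli gem and a local memcached; the server address and TTL are illustrative, and Geocoder's :cache option is the standard way to hand it a cache store:

require 'dalli'
require 'geocoder'

# Sketch only: point the store at memcached and let cached lookups expire after one hour.
Geocoder.configure(
  cache: Geomash::AutoexpireCacheDalli.new(Dalli::Client.new('localhost:11211'), 3600)
)
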
@@ -0,0 +1,26 @@
+ #Taken from: https://github.com/alexreisner/geocoder/blob/master/examples/autoexpire_cache_redis.rb
+ module Geomash
+   class AutoexpireCacheRedis
+     def initialize(store, ttl = 86400)
+       @store = store
+       @ttl = ttl
+     end
+
+     def [](url)
+       @store.[](url)
+     end
+
+     def []=(url, value)
+       @store.[]=(url, value)
+       @store.expire(url, @ttl)
+     end
+
+     def keys
+       @store.keys
+     end
+
+     def del(url)
+       @store.del(url)
+     end
+   end
+ end
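
Likewise for the Redis-backed variant; a minimal sketch assuming the redis gem with a default local server. Each cached lookup URL written through the store receives an EXPIRE of ttl seconds:

require 'redis'
require 'geocoder'

# Sketch only: Redis connection and TTL values are illustrative.
Geocoder.configure(
  cache: Geomash::AutoexpireCacheRedis.new(Redis.new, 86400)
)
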
@@ -72,6 +72,7 @@ module Geomash
  }

  #Terms that drive geographic parsers mad...
+ #Possibly check using QA against LCSH Subject topics that aren't geographical...
  JUNK_TERMS = [
  'Cranberries',
  'History',
@@ -80,7 +81,11 @@ module Geomash
  'Pictorial works.',
  /[nN]ation/,
  'Asia',
- '(Republic)'
+ '(Republic)',
+ 'Directories',
+ 'Biography',
+ #Some date removal
+ /[\d]+th [cC]entury,/
  ]


@@ -89,22 +89,27 @@ module Geomash
  end
  retry_count = retry_count + 1

- geonames_response = Typhoeus::Request.get("http://api.geonames.org/search?username=#{self.geonames_username}&lang=en&style=FULL&q=#{CGI.escape(geonames_search_string)}&name_equals=#{CGI.escape(exact_name_term)}&country=#{Country.find_country_by_name(geo_hash[:country_part]).alpha2}")
+ country_code = nil
+ if geo_hash[:country_part] == 'South Korea'
+ country_code = 'KR'
+ elsif geo_hash[:country_part] == 'North Korea'
+ country_code = 'KP'
+ else
+ country_code = Country.find_country_by_name(geo_hash[:country_part]).alpha2
+ end
+ geonames_response = Typhoeus::Request.get("http://api.geonames.org/search?username=#{self.geonames_username}&lang=en&style=FULL&q=#{CGI.escape(geonames_search_string)}&name_equals=#{CGI.escape(exact_name_term)}&country=#{country_code}")

  end until (geonames_response.code != 500 || retry_count == max_retry)

  unless geonames_response.code == 500

- parsed_xml = Nokogiri::Slop(geonames_response.body)
+ parsed_xml = Nokogiri::XML(geonames_response.body)

- begin
- raise "geonames status error message of: #{parsed_xml.to_s}" if parsed_xml.geonames.status
- rescue
- #Do nothing but FIXME to not use slop
- end
+ raise "geonames status error message of: #{parsed_xml.to_s}" if parsed_xml.xpath("//status").present?

  #This is ugly and needs to be redone to achieve better recursive...
- if parsed_xml.geonames.totalResultsCount.text == '0'
+ current_count = parsed_xml.xpath("//totalResultsCount")
+ if current_count.blank? || current_count.first.text == '0'
  if geo_hash[:neighborhood_part].present?
  geo_hash_temp = geo_hash.clone
  geo_hash_temp[:neighborhood_part] = nil
@@ -121,14 +126,14 @@ module Geomash
  end

  #Exact Match ... FIXME to not use Slop
- if parsed_xml.geonames.geoname.class == Nokogiri::XML::Element
- return_hash[:id] = parsed_xml.geonames.geoname.geonameId.text
- return_hash[:rdf] = "http://sws.geonames.org/#{return_hash[:id]}/about.rdf"
- elsif parsed_xml.geonames.geoname.class ==Nokogiri::XML::NodeSet
- return_hash[:id] = parsed_xml.geonames.geoname.first.geonameId.text
+ if parsed_xml.xpath("//geonames/geoname/geonameId").present?
+ return_hash[:id] = parsed_xml.xpath("//geonames/geoname/geonameId").first.text
  return_hash[:rdf] = "http://sws.geonames.org/#{return_hash[:id]}/about.rdf"
+ return_hash[:original_string_differs] = Geomash::Standardizer.parsed_and_original_check(geo_hash)
+ return return_hash
+ else
+ return nil
  end
- return_hash[:original_string_differs] = Geomash::Standardizer.parsed_and_original_check(geo_hash)

  end

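
The hunk above replaces Nokogiri's Slop mode with plain Nokogiri::XML plus XPath over the GeoNames search response. A self-contained sketch of that parsing pattern against a made-up response body (the element names match those queried in the diff; the sample values are only illustrative, and Array#empty? stands in here for ActiveSupport's present?/blank?):

require 'nokogiri'

# Invented GeoNames-style search response, for illustration only.
body = <<~XML
  <geonames style="FULL">
    <totalResultsCount>1</totalResultsCount>
    <geoname>
      <name>Boston</name>
      <geonameId>4930956</geonameId>
    </geoname>
  </geonames>
XML

parsed_xml = Nokogiri::XML(body)
count = parsed_xml.xpath("//totalResultsCount")
unless count.empty? || count.first.text == '0'
  id = parsed_xml.xpath("//geonames/geoname/geonameId").first.text
  puts "http://sws.geonames.org/#{id}/about.rdf"
end
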
@@ -1,6 +1,11 @@
  module Geomash
  class Parser

+ def self.cache_enabled
+ return Geomash.config[:parser_cache_enabled] unless Geomash.config[:parser_cache_enabled].nil?
+ return false
+ end
+
  def self.mapquest_key
  Geomash.config[:mapquest_key] || '<mapquest_key>'
  end
@@ -44,12 +49,21 @@ module Geomash
  return {}
  end

- Geocoder.configure(:lookup => :bing,:api_key => self.bing_key,:timeout => self.timeout, :always_raise => :all)
- bing_api_result = Geocoder.search(term)
+ ::Geocoder.configure(:lookup => :bing,:api_key => self.bing_key,:timeout => self.timeout, :always_raise => :all)
+
+ begin
+ bing_api_result = Geocoder.search(term)
+ rescue SocketError => e
+ Geocoder::Lookup.get(:bing).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
+ retry unless (retry_count -= 1).zero?
+ rescue Geocoder::OverQueryLimitError => e
+ Geocoder::Lookup.get(:bing).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
+ raise e
+ rescue Exception => e
+ Geocoder::Lookup.get(:bing).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
+ raise e
+ end

- rescue SocketError => e
- retry unless (retry_count -= 1).zero?
- else

  #Use only for United States results... international results are inaccurate.
  if bing_api_result.present? && bing_api_result.first.data["address"]["countryRegion"] == 'United States'
@@ -113,13 +127,20 @@ module Geomash
  return {}
  end

- Geocoder.configure(:lookup => :mapquest,:api_key => self.mapquest_key,:timeout => self.timeout, :always_raise => :all)
-
- mapquest_api_result = Geocoder.search(term)
- rescue SocketError => e
- retry unless (retry_count -= 1).zero?
- else
-
+ ::Geocoder.configure(:lookup => :mapquest,:api_key => self.mapquest_key,:timeout => self.timeout, :always_raise => :all)
+
+ begin
+ mapquest_api_result = Geocoder.search(term)
+ rescue SocketError => e
+ Geocoder::Lookup.get(:mapquest).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
+ retry unless (retry_count -= 1).zero?
+ rescue Geocoder::OverQueryLimitError => e
+ Geocoder::Lookup.get(:mapquest).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
+ raise e
+ rescue Exception => e
+ Geocoder::Lookup.get(:mapquest).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
+ raise e
+ end

  #If this call returned a result...
  if mapquest_api_result.present?
@@ -173,11 +194,18 @@ module Geomash
  return_hash[:standardized_term] = term

  ::Geocoder.configure(:lookup => :google,:api_key => nil,:timeout => self.timeout, :always_raise => :all)
-
- google_api_result = ::Geocoder.search(term)
- rescue SocketError => e
- retry unless (retry_count -= 1).zero?
- else
+ begin
+ google_api_result = ::Geocoder.search(term)
+ rescue SocketError => e
+ Geocoder::Lookup.get(:google).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
+ retry unless (retry_count -= 1).zero?
+ rescue Geocoder::OverQueryLimitError => e
+ Geocoder::Lookup.get(:google).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
+ raise e
+ rescue Exception => e
+ Geocoder::Lookup.get(:google).cache.expire(Geocoder::Query.new(term).url) if self.cache_enabled #Expire this url
+ raise e
+ end


  #Check if only a partial match. To avoid errors, strip out the first part and try again...
@@ -202,7 +230,7 @@ module Geomash
  elsif (result['types'] & ['country']).present?
  return_hash[:country_part] = result['long_name']
  elsif (result['types'] & ['administrative_area_level_1']).present?
- return_hash[:state_part] = result['long_name'].to_ascii
+ return_hash[:state_part] = result['long_name'].to_ascii.gsub('-city', '')
  elsif (result['types'] & ['locality']).present?
  return_hash[:city_part] = result['long_name']
  elsif (result['types'] & ['sublocality', 'political']).length == 2 || result['types'].include?('neighborhood')
@@ -213,6 +241,8 @@ module Geomash
  return_hash[:term_differs_from_tgn] ||= google_api_result.first.data['partial_match'] unless google_api_result.first.data['partial_match'].blank?
  end

+ #FIXME: Google free API rate limit is 5 requests / 1 second now (used to be 10). Need a better way to handle this.
+ sleep(0.1)

  return return_hash
  end
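
The added sleep(0.1) only spaces calls by 100 ms, which by itself still allows roughly ten requests per second; the FIXME asks for something better. A minimal sketch of a stricter rolling-window throttle (the class and its placement are assumptions, not part of the gem):

# Hypothetical helper: allow at most `limit` calls per rolling `period` seconds.
class SimpleThrottle
  def initialize(limit: 5, period: 1.0)
    @limit = limit
    @period = period
    @timestamps = []
  end

  def wait!
    now = Process.clock_gettime(Process::CLOCK_MONOTONIC)
    @timestamps.reject! { |t| now - t > @period }
    if @timestamps.size >= @limit
      # Sleep until the oldest call in the window ages out, then re-trim.
      sleep(@period - (now - @timestamps.first))
      now = Process.clock_gettime(Process::CLOCK_MONOTONIC)
      @timestamps.reject! { |t| now - t > @period }
    end
    @timestamps << now
  end
end

# Usage sketch: THROTTLE = SimpleThrottle.new; call THROTTLE.wait! before each ::Geocoder.search.
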
@@ -11,6 +11,10 @@ module Geomash
  return ''
  end

+ term_split_list = term.split(/[,\-\(\(]/).reject{ |e| e.empty? }
+ term_split_list.each{ |e| e.gsub!(/[^\w\s]/, "") } #Remove punctuation
+ term_split_list.each{ |e| e.strip! } #Remove any extra remaining whitespace
+ term_split_list.reject{ |e| e.empty? }
  state_abbr_list = ['Mass']
  state_name_list = []
  country_name_list = []
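
A quick trace of what the new pre-split produces, using an invented sample term and the exact steps from the hunk above. Note that the final reject has no bang, so it returns a new array that is discarded and does not modify term_split_list:

term = "Boston (Mass.)--History"

term_split_list = term.split(/[,\-\(\(]/).reject { |e| e.empty? }
# => ["Boston ", "Mass.)", "History"]
term_split_list.each { |e| e.gsub!(/[^\w\s]/, "") }   # punctuation removed in place
term_split_list.each { |e| e.strip! }                 # whitespace trimmed in place
# => ["Boston", "Mass", "History"]
term_split_list.reject { |e| e.empty? }               # non-destructive; return value unused
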
@@ -24,29 +28,51 @@ module Geomash
  Country.all.each do |country_name_abbr_pair|
  country_name_list << country_name_abbr_pair.first
  end
+ country_name_list.append('South Korea') #Listed as Korea, Republic of in the gem
+ country_name_list.append('North Korea') #Listed as Korea, Democratic People's Republic Of of in the gem

  #Parsing a subject geographic term.
- if term.include?('--')
- term.split('--').each_with_index do |split_term, index|
- if state_name_list.any? { |state| split_term.include? state } || country_name_list.any? { |country| split_term.include? country }
- geo_term = term.split('--')[index..term.split('--').length-1].reverse!.join(',')
- elsif state_abbr_list.any? { |abbr| split_term.include? abbr }
- geo_term = split_term
+ if (state_name_list & term_split_list).present? || (state_abbr_list & term_split_list).present? || (country_name_list & term_split_list).present?
+ if term.include?('--')
+ term.split('--').each_with_index do |split_term, index|
+ if state_name_list.any? { |state| split_term.include? state } || country_name_list.any? { |country| split_term.include? country }
+ #Cases like Naroden Etnografski Muzeĭ (Sofia, Bulgaria)--Catalogs
+ if split_term.match(/\([^\)]+\)/)
+ geo_term = split_term.gsub('(', ',').gsub(' ,', ', ')
+ geo_term = geo_term.gsub(')', '')
+
+ =begin
+ if split_term.match(/\([^\)]+,[^\)]+\)/)
+ geo_term = split_term.match(/\([^\)]+\)/).to_s
+ geo_term = geo_term[1..geo_term.length-2]
+ #Abbeville (France)--History--20th century.
+ elsif split_term.match(/\([^\)]+\)/)
+ geo_term = split_term
+ =end
+ else
+ geo_term = term.split('--')[index..term.split('--').length-1].reverse!.join(',')
+ end
+
+ elsif state_abbr_list.any? { |abbr| split_term.include? abbr }
+ geo_term = split_term
+ end
  end
- end
- #Other than a '--' field
- #Experimental... example: Palmer (Mass) - history or Stores (retail trade) - Palmer, Mass
- elsif term.include?(' - ')
- term.split(' - ').each do |split_term|
- if state_name_list.any? { |state| split_term.include? state } || state_abbr_list.any? { |abbr| split_term.include? abbr } || country_name_list.any? { |country| split_term.include? country }
- geo_term = split_term
+ #Other than a '--' field
+ #Experimental... example: Palmer (Mass) - history or Stores (retail trade) - Palmer, Mass
+ elsif term.include?(' - ')
+ term.split(' - ').each do |split_term|
+ if state_name_list.any? { |state| split_term.include? state } || state_abbr_list.any? { |abbr| split_term.include? abbr } || country_name_list.any? { |country| split_term.include? country }
+ geo_term = split_term
+ end
+
  end
+ else
+ #if term_split_list.length > 1
+ geo_term = term.gsub('(', ',').gsub(' ,', ', ')
+ geo_term = geo_term.gsub(')', '')
+ #end

  end
- else
- if state_name_list.any? { |state| term.include? state } || state_abbr_list.any? { |abbr| term.include? abbr } || country_name_list.any? { |country| term.include? country }
- geo_term = term
- end

  end

  return geo_term
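
Following the comment's own sample subject, a worked trace of the new parenthetical handling; this is just the gsub chain from the hunk applied by hand:

split_term = "Naroden Etnografski Muzeĭ (Sofia, Bulgaria)"

geo_term = split_term.gsub('(', ',').gsub(' ,', ', ')
geo_term = geo_term.gsub(')', '')
# => "Naroden Etnografski Muzeĭ, Sofia, Bulgaria"

A later hunk adds geo_term.squeeze(',') to collapse any doubled commas this substitution can leave behind (see the example there).
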
@@ -83,6 +109,8 @@ module Geomash
  end
  end

+ geo_term = geo_term.squeeze(',')
+
  return geo_term
  end

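
A small example of what that squeeze(',') guards against, assuming a term where a comma already precedes the parenthesis:

geo_term = "Palmer, (Mass.)".gsub('(', ',').gsub(' ,', ', ').gsub(')', '')
# => "Palmer,, Mass."
geo_term.squeeze(',')
# => "Palmer, Mass."
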
@@ -224,6 +252,7 @@ module Geomash

  def self.try_with_entered_names(geo_hash)
  geo_hash_local = geo_hash.clone
+ geo_hash_local[:tgn] = nil
  if geo_hash_local[:neighborhood_part].present?
  orig_string_check = geo_hash_local[:standardized_term].gsub(',', ' ').squish.split(' ').select { |value| value.downcase.to_ascii == geo_hash_local[:neighborhood_part].downcase.to_ascii}
  geo_hash_local[:neighborhood_part] = orig_string_check.first.strip if orig_string_check.present? && orig_string_check != geo_hash_local[:neighborhood_part]