geo_coder 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile +12 -0
- data/Gemfile.lock +32 -0
- data/History.txt +6 -0
- data/Makefile +13 -0
- data/Manifest.txt +18 -0
- data/README.rdoc +197 -0
- data/Rakefile +53 -0
- data/TODO.txt +8 -0
- data/VERSION +1 -0
- data/bin/build_indexes +8 -0
- data/bin/rebuild_cluster +22 -0
- data/bin/rebuild_metaphones +23 -0
- data/bin/tiger_import +59 -0
- data/demos/demo/app/ext/geocodewrap.rb +84 -0
- data/demos/demo/app/views/index.builder +13 -0
- data/demos/demo/app/views/index.erb +71 -0
- data/demos/demo/config.ru +12 -0
- data/demos/demo/config/bootstraps.rb +130 -0
- data/demos/demo/config/geoenvironment.rb +25 -0
- data/demos/demo/geocoder_helper.rb +12 -0
- data/demos/demo/geocom_geocode.rb +10 -0
- data/demos/demo/main.rb +3 -0
- data/demos/demo/rakefile.rb +17 -0
- data/demos/demo/tmp/restart.txt +0 -0
- data/demos/simpledemo/views/index.builder +13 -0
- data/demos/simpledemo/views/index.erb +69 -0
- data/demos/simpledemo/ws.rb +83 -0
- data/doc/Makefile +7 -0
- data/doc/html4css1.css +279 -0
- data/doc/lookup.rst +193 -0
- data/doc/parsing.rst +125 -0
- data/doc/voidspace.css +147 -0
- data/geo_coder.gemspec +172 -0
- data/lib/geocoder/us.rb +21 -0
- data/lib/geocoder/us/address.rb +290 -0
- data/lib/geocoder/us/constants.rb +670 -0
- data/lib/geocoder/us/database.rb +745 -0
- data/lib/geocoder/us/import.rb +181 -0
- data/lib/geocoder/us/import/tiger.rb +13 -0
- data/lib/geocoder/us/numbers.rb +58 -0
- data/navteq/README +4 -0
- data/navteq/convert.sql +37 -0
- data/navteq/navteq_import +39 -0
- data/navteq/prepare.sql +92 -0
- data/sql/cluster.sql +16 -0
- data/sql/convert.sql +80 -0
- data/sql/create.sql +37 -0
- data/sql/index.sql +12 -0
- data/sql/place.csv +104944 -0
- data/sql/place.sql +104948 -0
- data/sql/setup.sql +78 -0
- data/src/Makefile +13 -0
- data/src/README +14 -0
- data/src/liblwgeom/Makefile +75 -0
- data/src/liblwgeom/box2d.c +54 -0
- data/src/liblwgeom/lex.yy.c +4799 -0
- data/src/liblwgeom/liblwgeom.h +1405 -0
- data/src/liblwgeom/lwalgorithm.c +946 -0
- data/src/liblwgeom/lwalgorithm.h +52 -0
- data/src/liblwgeom/lwcircstring.c +759 -0
- data/src/liblwgeom/lwcollection.c +541 -0
- data/src/liblwgeom/lwcompound.c +118 -0
- data/src/liblwgeom/lwcurvepoly.c +86 -0
- data/src/liblwgeom/lwgeom.c +886 -0
- data/src/liblwgeom/lwgeom_api.c +2201 -0
- data/src/liblwgeom/lwgparse.c +1219 -0
- data/src/liblwgeom/lwgunparse.c +1054 -0
- data/src/liblwgeom/lwline.c +525 -0
- data/src/liblwgeom/lwmcurve.c +125 -0
- data/src/liblwgeom/lwmline.c +137 -0
- data/src/liblwgeom/lwmpoint.c +138 -0
- data/src/liblwgeom/lwmpoly.c +141 -0
- data/src/liblwgeom/lwmsurface.c +129 -0
- data/src/liblwgeom/lwpoint.c +439 -0
- data/src/liblwgeom/lwpoly.c +579 -0
- data/src/liblwgeom/lwsegmentize.c +1047 -0
- data/src/liblwgeom/lwutil.c +369 -0
- data/src/liblwgeom/measures.c +861 -0
- data/src/liblwgeom/postgis_config.h +93 -0
- data/src/liblwgeom/ptarray.c +847 -0
- data/src/liblwgeom/vsprintf.c +179 -0
- data/src/liblwgeom/wktparse.h +126 -0
- data/src/liblwgeom/wktparse.lex +74 -0
- data/src/liblwgeom/wktparse.tab.c +2353 -0
- data/src/liblwgeom/wktparse.tab.h +145 -0
- data/src/liblwgeom/wktparse.y +385 -0
- data/src/libsqlite3_geocoder/Makefile +22 -0
- data/src/libsqlite3_geocoder/Makefile.nix +15 -0
- data/src/libsqlite3_geocoder/Makefile.redhat +15 -0
- data/src/libsqlite3_geocoder/extension.c +121 -0
- data/src/libsqlite3_geocoder/extension.h +13 -0
- data/src/libsqlite3_geocoder/levenshtein.c +42 -0
- data/src/libsqlite3_geocoder/metaphon.c +278 -0
- data/src/libsqlite3_geocoder/util.c +37 -0
- data/src/libsqlite3_geocoder/wkb_compress.c +54 -0
- data/src/metaphone/Makefile +7 -0
- data/src/metaphone/README +49 -0
- data/src/metaphone/extension.c +37 -0
- data/src/metaphone/metaphon.c +251 -0
- data/src/shp2sqlite/Makefile +37 -0
- data/src/shp2sqlite/Makefile.nix +36 -0
- data/src/shp2sqlite/Makefile.redhat +35 -0
- data/src/shp2sqlite/dbfopen.c +1595 -0
- data/src/shp2sqlite/getopt.c +695 -0
- data/src/shp2sqlite/getopt.h +127 -0
- data/src/shp2sqlite/shapefil.h +500 -0
- data/src/shp2sqlite/shp2sqlite.c +1974 -0
- data/src/shp2sqlite/shpopen.c +1894 -0
- data/tests/address.rb +236 -0
- data/tests/benchmark.rb +20 -0
- data/tests/constants.rb +57 -0
- data/tests/data/address-sample.csv +52 -0
- data/tests/data/db-test.csv +57 -0
- data/tests/data/locations.csv +4 -0
- data/tests/database.rb +137 -0
- data/tests/generate.rb +34 -0
- data/tests/numbers.rb +46 -0
- data/tests/run.rb +11 -0
- metadata +237 -0
|
@@ -0,0 +1,745 @@
|
|
|
1
|
+
require 'rubygems'
|
|
2
|
+
require 'sqlite3'
|
|
3
|
+
require 'text'
|
|
4
|
+
|
|
5
|
+
require 'set'
|
|
6
|
+
require 'pp'
|
|
7
|
+
require 'time'
|
|
8
|
+
require 'thread'
|
|
9
|
+
|
|
10
|
+
require 'geocoder/us/address'
|
|
11
|
+
|
|
12
|
+
# Top-level namespace; declared here so Geocoder::US can be opened below.
module Geocoder; end
|
|
14
|
+
|
|
15
|
+
module Geocoder::US
|
|
16
|
+
# Provides an interface to a Geocoder::US database.
|
|
17
|
+
class Database
|
|
18
|
+
# Scoring weights used by score_candidates!: Street_Weight and City_Weight
# scale the street-name and city-name similarity; Number_Weight and
# Parity_Weight are credited (and added to the denominator) only when the
# query carries a building number.
Street_Weight = 3.0
|
|
19
|
+
Number_Weight = 2.0
|
|
20
|
+
Parity_Weight = 1.25
|
|
21
|
+
City_Weight = 1.0
|
|
22
|
+
# Class-wide lock taken by #synchronize when the :threadsafe option is false.
@@mutex = Mutex.new
|
|
23
|
+
|
|
24
|
+
# Takes the path of an SQLite 3 database prepared for Geocoder::US
|
|
25
|
+
# as the sole mandatory argument. The helper argument points to the
|
|
26
|
+
# Geocoder::US SQLite plugin; the module looks for this in the same
|
|
27
|
+
# directory as database.rb by default. The cache_size argument is
|
|
28
|
+
# measured in kilobytes and is used to set the SQLite cache size; larger
|
|
29
|
+
# values will trade memory for speed in long-running processes.
|
|
30
|
+
def initialize(filename, options = {})
  # Recognised options: :debug, :cache_size, :helper, :threadsafe, :create.
  # Caller-supplied values override these defaults.
  defaults = {:debug => false, :cache_size => 50000,
              :helper => "sqlite3.so", :threadsafe => false,
              :create => false}
  options = defaults.merge(options)
  # File.exist? is used instead of File.exists?, which was deprecated and
  # then removed from Ruby; with :create the file is allowed to be missing.
  unless options[:create] || File.exist?(filename)
    raise ArgumentError, "can't find database #{filename}"
  end
  @db = SQLite3::Database.new(filename)
  @st = {}  # prepared statements, keyed by their SQL text (see #prepare)
  @debug = options[:debug]
  @threadsafe = options[:threadsafe]
  tune options[:helper], options[:cache_size]
end
|
|
43
|
+
|
|
44
|
+
# Run the given block, holding the class-wide mutex unless the database
# was opened with :threadsafe => true.
def synchronize
  return yield if @threadsafe
  @@mutex.synchronize { yield }
end
|
|
51
|
+
|
|
52
|
+
#private
|
|
53
|
+
|
|
54
|
+
# Load the SQLite extension and tune the database settings.
|
|
55
|
+
# q.v. http://web.utk.edu/~jplyon/sqlite/SQLite_optimization_FAQ.html
|
|
56
|
+
def tune(helper, cache_size)
  # NOTE: +helper+ is currently unused; loading the native SQLite extension
  # is disabled below and the SQL functions are implemented in Ruby instead.
  synchronize do
    # levenshtein(a, b): edit distance between the two values (non-word
    # characters stripped, case-folded) divided by the longer length.
    @db.create_function("levenshtein", 2) do |func, word1, word2|
      test1, test2 = [word1, word2].map { |w| w.to_s.gsub(/\W/, "").downcase }
      longest = [test1.length, test2.length].max
      # Two empty strings are identical; guard the 0.0/0 case, which would
      # otherwise produce NaN.
      func.result =
        if longest.zero?
          0.0
        else
          Text::Levenshtein.distance(test1, test2).to_f / longest
        end
    end
    # metaphone(string, len): leading digits, or a lone "w"/"y", are passed
    # through as-is; anything else is metaphone-encoded. The result is
    # truncated to +len+ characters.
    @db.create_function("metaphone", 2) do |func, string, len|
      test = string.to_s.gsub(/\W/, "")
      if test =~ /^(\d+)/
        mph = $1
      elsif test =~ /^([wy])$/i
        mph = $1
      else
        mph = Text::Metaphone.metaphone test
      end
      func.result = mph[0...len.to_i]
    end
    # nondigit_prefix(string): everything before a trailing run of digits,
    # or "" when the value does not end in digits.
    @db.create_function("nondigit_prefix", 1) do |func, string|
      string.to_s =~ /^(.*\D)?(\d+)$/
      func.result = ($1 || "")
    end
    # digit_suffix(string): the trailing run of digits, or "" if none.
    @db.create_function("digit_suffix", 1) do |func, string|
      string.to_s =~ /^(.*\D)?(\d+)$/
      func.result = ($2 || "")
    end
    #@db.enable_load_extension(1)
    #@db.load_extension(helper)
    #@db.enable_load_extension(0)
    @db.cache_size = cache_size
    @db.temp_store = "memory"
    @db.synchronous = "off"
  end
end
|
|
93
|
+
|
|
94
|
+
# Return a cached SQLite statement object, preparing it first if
|
|
95
|
+
# it's not already in the cache.
|
|
96
|
+
def prepare(sql)
  $stderr.print "SQL : #{sql}\n" if @debug
  # Compile the statement only on a cache miss; the cache key is the SQL text.
  synchronize do
    @st[sql] = @db.prepare(sql) unless @st[sql]
  end
  @st[sql]
end
|
|
103
|
+
|
|
104
|
+
# Forget every cached prepared statement by replacing the cache with a
# fresh, empty hash.
def flush_statements
  @st = Hash.new
end
|
|
107
|
+
|
|
108
|
+
# Generate enough SQL placeholders for a list of objects.
|
|
109
|
+
def placeholders_for(list)
  # One "?" per element, comma-separated; an empty list yields "".
  Array.new(list.length, "?").join(",")
end
|
|
112
|
+
|
|
113
|
+
# Generate enough metaphone(?,5) SQL placeholders for a list of objects.
|
|
114
|
+
def metaphone_placeholders_for(list)
  # One "metaphone(?,5)" call per element, comma-separated.
  Array.new(list.length, "metaphone(?,5)").join(",")
end
|
|
117
|
+
|
|
118
|
+
# Execute an SQL statement, bind a list of parameters, and
|
|
119
|
+
# return the result as a list of hashes.
|
|
120
|
+
def execute(sql, *params)
  # Look up (or compile) the cached statement, then run it with the binds.
  execute_statement(prepare(sql), *params)
end
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# Execute an SQLite statement object, bind the parameters,
|
|
127
|
+
# map the column names to symbols, and return the rows
|
|
128
|
+
# as a list of hashes.
|
|
129
|
+
def execute_statement(st, *params)
  started_at = nil
  if @debug
    started_at = Time.now
    $stderr.print "EXEC: #{params.inspect}\n" unless params.empty?
  end

  rows = []
  synchronize do
    result = st.execute(*params)
    names = result.columns.map { |name| name.to_sym }
    result.each do |values|
      record = {}
      names.zip(values).each { |name, value| record[name] = value }
      rows << record
    end
  end

  if @debug
    runtime = format("%.3f", Time.now - started_at)
    $stderr.print "ROWS: #{rows.length} (#{runtime}s)\n"
  end
  # Rows are handed back in the reverse of the order the statement
  # produced them.
  rows.reverse!
end
|
|
148
|
+
|
|
149
|
+
# Look up the place row for a ZIP code, scoring its city name against
# the given city string with levenshtein().
def places_by_zip(city, zip)
  sql = "SELECT *, levenshtein(?, city) AS city_score
    FROM place WHERE zip = ? order by priority desc LIMIT 1;"
  execute(sql, city, zip)
end
|
|
153
|
+
|
|
154
|
+
# Query the place table by city and optional state.
|
|
155
|
+
# The metaphone index on the place table is used to match
|
|
156
|
+
# city names.
|
|
157
|
+
def places_by_city(city, tokens, state)
  city = "" if city.nil?
  # Bind order: city (for levenshtein), one value per metaphone
  # placeholder, then the optional state filter.
  args = [city] + tokens.clone
  and_state = ""
  unless state.nil? || state.empty?
    and_state = "AND state = ?"
    args << state
  end
  metaphones = metaphone_placeholders_for(tokens)
  execute("SELECT *, levenshtein(?, city) AS city_score
    FROM place WHERE city_phone IN (#{metaphones}) #{and_state} order by priority desc LIMIT 1;", *args)
end
|
|
172
|
+
|
|
173
|
+
# Generate an SQL query and set of parameters against the feature and range
|
|
174
|
+
# tables for a street name and optional building number. The SQL is
|
|
175
|
+
# used by candidate_records and more_candidate_records to filter results
|
|
176
|
+
# by ZIP code.
|
|
177
|
+
def features_by_street(street, tokens)
  # One metaphone(?,5) placeholder per token for the street_phone match.
  metaphones = Array.new(tokens.length, "metaphone(?,5)").join(",")
  sql = "
    SELECT feature.*, levenshtein(?, street) AS street_score
      FROM feature
      WHERE street_phone IN (#{metaphones})"
  # The first bind feeds levenshtein(); the rest feed the placeholders.
  [sql, [street] + tokens]
end
|
|
186
|
+
|
|
187
|
+
# Query the feature and range tables for a set of ranges, given a
|
|
188
|
+
# building number, street name, and list of candidate ZIP codes.
|
|
189
|
+
# The metaphone and ZIP code indexes on the feature table are
|
|
190
|
+
# used to match results.
|
|
191
|
+
def features_by_street_and_zip(street, tokens, zips)
  base_sql, base_params = features_by_street(street, tokens)
  # Restrict the street match to the candidate ZIP codes.
  sql = base_sql + " AND feature.zip IN (#{placeholders_for zips})"
  execute(sql, *(base_params + zips))
end
|
|
198
|
+
|
|
199
|
+
# Query the feature and range tables for a set of ranges, given a
|
|
200
|
+
# building number, street name, and list of candidate ZIP codes.
|
|
201
|
+
# The ZIP codes are reduced to a set of 3-digit prefixes, broadening
|
|
202
|
+
# the search area.
|
|
203
|
+
def more_features_by_street_and_zip(street, tokens, zips)
  sql, params = features_by_street(street, tokens)
  if !zips.empty? && !zips[0].nil?
    # Diagnostic only: previously printed unconditionally on stdout.
    $stderr.print "zip results 2\n" if @debug
    # Widen each ZIP to its 3-digit prefix and match with LIKE.
    zip3s = zips.map { |z| z[0..2] + '%' }.uniq
    like_list = zip3s.map { "feature.zip LIKE ?" }.join(" OR ")
    sql += " AND (#{like_list})"
    params += zip3s
  end
  # Go through #execute (like features_by_street_and_zip) so the statement
  # is cached, logged in debug mode, and prepared under the mutex instead
  # of being compiled ad hoc and never released.
  execute sql, *params
end
|
|
215
|
+
|
|
216
|
+
# Fetch address ranges for the given feature ids, optionally filtered by
# number prefix, ordered by how close either range end is to +number+.
# At most four ranges per feature id are requested.
def ranges_by_feature(fids, number, prenum)
  pieces = ["
    SELECT feature_edge.fid AS fid, range.*
      FROM feature_edge, range
      WHERE fid IN (#{placeholders_for fids})
      AND feature_edge.tlid = range.tlid"]
  params = fids.clone
  unless prenum.nil?
    pieces << " AND prenum = ?"
    params << prenum
  end
  pieces << "
    ORDER BY min(abs(fromhn - ?), abs(tohn - ?))
    LIMIT #{4 * fids.length};"
  params.push(number, number)
  execute pieces.join, *params
end
|
|
235
|
+
|
|
236
|
+
# Query the edge table for a list of edges matching a list of edge IDs.
|
|
237
|
+
def edges(edge_ids)
  # One bind placeholder per requested edge id.
  execute("SELECT edge.* FROM edge WHERE edge.tlid IN (#{placeholders_for edge_ids})", *edge_ids)
end
|
|
242
|
+
|
|
243
|
+
# Query the range table for all ranges associated with the given
|
|
244
|
+
# list of edge IDs.
|
|
245
|
+
def range_ends(edge_ids)
  in_list = placeholders_for edge_ids
  sql = "SELECT tlid, side,
                min(fromhn) > min(tohn) AS flipped,
                min(fromhn) AS from0, max(tohn) AS to0,
                min(tohn) AS from1, max(fromhn) AS to1
           FROM range WHERE tlid IN (#{in_list})
           GROUP BY tlid, side;"
  execute(sql, *edge_ids).map do |r|
    # The comparison column arrives as "0"/"1" from drivers that return
    # text and as 0/1 from drivers that return native integers; compare
    # numerically so both are handled (the old == "0" test treated an
    # Integer 0 as flipped).
    if r[:flipped].to_i == 0
      r[:flipped] = false
      r[:fromhn], r[:tohn] = r[:from0], r[:to0]
    else
      r[:flipped] = true
      r[:fromhn], r[:tohn] = r[:from1], r[:to1]
    end
    # Drop the helper columns once the effective ends have been chosen.
    [:from0, :to0, :from1, :to1].each { |k| r.delete k }
    r
  end
end
|
|
265
|
+
|
|
266
|
+
# Find pairs of features from +fids+ whose edges share an end point.
# Works through a temporary table of (fid, point) rows, where point is the
# first or last 8 bytes of each edge geometry. The temporary table is
# always dropped again, even when the lookup raises.
def intersections_by_fid(fids)
  in_list = placeholders_for fids
  create_sql = "
    CREATE TEMPORARY TABLE intersection AS
      SELECT fid, substr(geometry,1,8) AS point
          FROM feature_edge, edge
          WHERE feature_edge.tlid = edge.tlid
          AND fid IN (#{in_list})
      UNION
      SELECT fid, substr(geometry,length(geometry)-7,8) AS point
          FROM feature_edge, edge
          WHERE feature_edge.tlid = edge.tlid
          AND fid IN (#{in_list});"
  execute create_sql, *(fids + fids)
  begin
    # Issued on its own: a prepared statement compiles only the first
    # statement in its SQL text, so the index was never created while it
    # shared a string with CREATE TABLE.
    execute "CREATE INDEX intersect_pt_idx ON intersection (point);"
    # the a.fid < b.fid inequality guarantees consistent ordering of street
    # names in the output
    execute "
      SELECT a.fid AS fid1, b.fid AS fid2, a.point
          FROM intersection a, intersection b, feature f1, feature f2
          WHERE a.point = b.point AND a.fid < b.fid
          AND f1.fid = a.fid AND f2.fid = b.fid
          AND f1.zip = f2.zip
          AND f1.paflag = 'P' AND f2.paflag = 'P';"
  ensure
    execute "DROP TABLE intersection;"
    flush_statements # the CREATE/DROP TABLE invalidates prepared statements
  end
end
|
|
294
|
+
|
|
295
|
+
# Query the place table for notional "primary" place names for each of a
|
|
296
|
+
# list of ZIP codes. Since the place table shipped with this code is
|
|
297
|
+
# bespoke, and constructed from a variety of public domain sources,
|
|
298
|
+
# the primary name for a ZIP is not always the "right" one.
|
|
299
|
+
def primary_places(zips)
  # One bind placeholder per ZIP code.
  execute("SELECT * FROM place WHERE zip IN (#{placeholders_for zips}) order by priority desc;", *zips)
end
|
|
304
|
+
|
|
305
|
+
# Given a list of rows, find the unique values for a given key.
|
|
306
|
+
def unique_values(rows, key)
  # Distinct values in first-seen order (nil counts as a value).
  rows.map { |row| row[key] }.uniq
end
|
|
309
|
+
|
|
310
|
+
# Convert a list of rows into a hash keyed by the given keys.
|
|
311
|
+
def rows_to_h(rows, *keys)
  # Each hash key is the ARRAY of the row's values for +keys+ (even for a
  # single key); each hash value is the list of rows sharing that key.
  rows.group_by { |row| row.values_at(*keys) }
end
|
|
316
|
+
|
|
317
|
+
# Merge the values in the list of rows given in src into the
|
|
318
|
+
# list of rows in dest, matching rows on the given list of keys.
|
|
319
|
+
# May generate more than one row in dest for each input dest row.
|
|
320
|
+
def merge_rows!(dest, src, *keys)
  # Index the source rows by their key values (same shape as rows_to_h).
  by_key = src.group_by { |row| row.values_at(*keys) }
  dest.map! do |row|
    matches = by_key[row.values_at(*keys)]
    # A destination row with no match is kept as-is; one with several
    # matches expands into one merged row per match.
    matches ? matches.map { |extra| row.merge(extra) } : [row]
  end
  dest.flatten!
end
|
|
332
|
+
|
|
333
|
+
# Find candidate records for an address: place rows when the address has
# no street, otherwise street features merged with their place rows.
# Returns [] when no place matches. Overwrites address.city with the
# matched city names.
def find_candidates(address)
  places = []

  # Use the shortest parsed city / street variant as the string to score.
  city = address.city.sort { |a, b| a.length <=> b.length }[0]
  # Check for nil before calling empty?; the old order raised on a nil zip.
  if address.zip && !address.zip.empty?
    places = places_by_zip city, address.zip
  end
  places = places_by_city(city, address.city_parts, address.state) if places.empty?
  return [] if places.empty?

  address.city = unique_values places, :city
  return places if address.street.empty?

  zips = unique_values places, :zip
  street = address.street.sort { |a, b| a.length <=> b.length }[0]
  candidates = features_by_street_and_zip street, address.street_parts, zips

  # Nothing in the exact ZIPs: retry with the 3-digit ZIP prefixes.
  if candidates.empty?
    candidates = more_features_by_street_and_zip street, address.street_parts, zips
  end

  merge_rows! candidates, places, :zip
  candidates
end
|
|
359
|
+
|
|
360
|
+
# Given a query hash and a list of candidates, assign :number
|
|
361
|
+
# and :precision values to each candidate. If the query building
|
|
362
|
+
# number is inside the candidate range, set the number on the result
|
|
363
|
+
# and set the precision to :range; otherwise, find the closest
|
|
364
|
+
# corner and set precision to :street.
|
|
365
|
+
def assign_number!(hn, candidates)
  hn ||= 0
  candidates.each do |candidate|
    fromhn = candidate[:fromhn].to_i
    tohn = candidate[:tohn].to_i
    # Ranges may run in either direction, so test against the ordered ends.
    low, high = [fromhn, tohn].minmax
    if hn.between?(low, high)
      candidate[:number] = hn.to_s
      candidate[:precision] = :range
    else
      # Outside the range: snap to whichever end is strictly nearer,
      # preferring :tohn on a tie.
      nearer_from = (hn - fromhn).abs < (hn - tohn).abs
      candidate[:number] = (nearer_from ? candidate[:fromhn] : candidate[:tohn]).to_s
      candidate[:precision] = :street
    end
  end
end
|
|
379
|
+
|
|
380
|
+
# Attach address ranges to the candidates and assign each a number and
# precision. Ranges are first looked up with the address's number prefix;
# if that finds nothing, the lookup is repeated without a prefix.
def add_ranges!(address, candidates)
  number = address.number.to_i
  fids = unique_values(candidates, :fid)
  ranges = ranges_by_feature(fids, number, address.prenum)
  ranges = ranges_by_feature(fids, number, nil) if ranges.empty?
  merge_rows!(candidates, ranges, :fid)
  assign_number!(number, candidates)
end
|
|
388
|
+
|
|
389
|
+
# Merge edge records into the candidates by :tlid and drop candidates
# that have no :tlid. Returns the edge ids that were looked up.
def merge_edges!(candidates)
  edge_ids = unique_values(candidates, :tlid)
  merge_rows!(candidates, edges(edge_ids), :tlid)
  candidates.reject! { |record| record[:tlid].nil? }
  edge_ids
end
|
|
396
|
+
|
|
397
|
+
# Merge edge data into the candidates, then merge the overall range ends
# for each (edge, side) pair.
def extend_ranges!(candidates)
  full_ranges = range_ends(merge_edges!(candidates))
  merge_rows!(candidates, full_ranges, :tlid, :side)
end
|
|
402
|
+
|
|
403
|
+
# Score a list of candidates. For each candidate:
|
|
404
|
+
# * For each item in the query:
|
|
405
|
+
# ** if the query item is blank but the candidate is not, score 0.15;
|
|
406
|
+
# otherwise, if both are blank, score 1.0.
|
|
407
|
+
# ** If both items are set, compute the scaled Levenshtein-Damerau distance
|
|
408
|
+
# between them, and add that value (between 0.0 and 1.0) to the score.
|
|
409
|
+
# * Add 0.5 to the score for each numbered end of the range that matches
|
|
410
|
+
# the parity of the query number.
|
|
411
|
+
# * Add 1.0 if the query number is in the candidate range, otherwise
|
|
412
|
+
# add a fractional value for the notional distance between the
|
|
413
|
+
# closest candidate corner and the query.
|
|
414
|
+
# * Finally, divide the score by the total number of comparisons.
|
|
415
|
+
# The result should be between 0.0 and 1.0, with 1.0 indicating a
|
|
416
|
+
# perfect match.
|
|
417
|
+
def score_candidates!(address, candidates)
  candidates.each do |candidate|
    components = candidate[:components] = {}
    compare = [:prenum, :state, :zip]
    denominator = compare.length + Street_Weight + City_Weight

    # :street_score / :city_score are distances (0.0 = identical), so
    # invert them before weighting.
    street_score = (1.0 - candidate[:street_score].to_f) * Street_Weight
    components[:street] = street_score
    city_score = (1.0 - candidate[:city_score].to_f) * City_Weight
    components[:city] = city_score
    score = street_score + city_score

    # Exact, case-insensitive match on each remaining field; nil is
    # treated as the empty string on both sides.
    compare.each do |key|
      src = address.send(key)
      src = src ? src.downcase : ""
      dest = candidate[key]
      dest = dest ? dest.downcase : ""
      item_score = (src == dest) ? 1 : 0
      components[key] = item_score
      score += item_score
    end

    number_given = address.number && !address.number.empty?
    if number_given
      fromhn = candidate[:fromhn].to_i
      tohn = candidate[:tohn].to_i
      assigned = candidate[:number].to_i
      hn = address.number.to_i

      subscore = 0.0
      if candidate[:precision] == :range
        subscore += Number_Weight
      elsif assigned > 0
        # only credit number subscore if assigned
        subscore += Number_Weight / (assigned - hn).abs.to_f
      end
      components[:number] = subscore

      parity = 0.0
      if hn > 0 && assigned > 0
        # only credit parity if a number was given *and* assigned;
        # each range end with matching parity earns half the weight
        [fromhn, tohn].each do |end_hn|
          parity += Parity_Weight / 2.0 if end_hn % 2 == hn % 2
        end
      end
      components[:parity] = parity

      score += subscore + parity
      denominator += Number_Weight + Parity_Weight
    end

    components[:total] = score.to_f
    components[:denominator] = denominator
    candidate[:score] = score.to_f / denominator
  end
end
|
|
465
|
+
|
|
466
|
+
# Find the candidates in a list of candidates that are tied for the
|
|
467
|
+
# top score and prune the remainder from the list.
|
|
468
|
+
def best_candidates!(candidates)
  candidates.sort! { |x, y| y[:score] <=> x[:score] }
  # After the descending sort the first entry carries the top score;
  # everything strictly below it is pruned.
  top_score = candidates.empty? ? nil : candidates[0][:score]
  candidates.delete_if { |record| record[:score] < top_score }
end
|
|
473
|
+
|
|
474
|
+
# Compute the fractional interpolation distance for a query number along an
|
|
475
|
+
# edge, given all of the ranges for the same side of that edge.
|
|
476
|
+
def interpolation_distance(candidate)
  fromhn, tohn, number = candidate.values_at(:fromhn, :tohn, :number).map { |x| x.to_i }
  $stderr.print "NUM : #{fromhn} < #{number} < #{tohn} (flipped? #{candidate[:flipped]})\n" if @debug
  # Normalise so that fromhn <= tohn.
  fromhn, tohn = tohn, fromhn if fromhn > tohn
  if fromhn > number
    0.0
  elsif tohn < number
    1.0
  elsif tohn == fromhn
    # Single-number range matched exactly: the division below would be
    # 0 / 0.0 (NaN), so place the address at the middle of the edge.
    0.5
  else
    (number - fromhn) / (tohn - fromhn).to_f
  end
end
|
|
489
|
+
|
|
490
|
+
# Unpack an array of little-endian 4-byte ints, and convert them into
|
|
491
|
+
# signed floats by dividing by 10^6, inverting the process used by the
|
|
492
|
+
# compress_wkb_line() function in the SQLite helper extension.
|
|
493
|
+
def unpack_geometry(geom)
  return [] if geom.nil?
  # Layout as worked out from the database (see the original note by Pete):
  # | 1 byte type | 4 byte SRID | 4 byte element count | 8 byte doubles... |
  # This replaced the older packed-integer format described above.
  _type, _srid, _count, *coords = geom.unpack('CVVD*')
  # Pair the coordinates up; a trailing unpaired value is paired with nil.
  coords.each_slice(2).map { |x, y| [x, y] }
end
|
|
513
|
+
|
|
514
|
+
# Calculate the longitude scaling for the average of two latitudes.
|
|
515
|
+
def scale_lon(lat1, lat2)
  # Approximation used in place of lookup.rst (10e) and (10g): scale
  # longitude distances by the cosine of the mean of the two latitudes
  # (given in degrees).
  mean_lat = (lat1 + lat2) / 2
  Math.cos(mean_lat * Math::PI / 180)
end
|
|
522
|
+
|
|
523
|
+
# Simple Euclidean distances between two 2-D coordinate pairs, scaled
|
|
524
|
+
# along the longitudinal axis by scale_lon.
|
|
525
|
+
def distance(a, b)
  # Points are [lon, lat]; only the longitude delta is scaled.
  lon_scale = scale_lon(a[1], b[1])
  dx = (b[0] - a[0]) * lon_scale
  dy = b[1] - a[1]
  Math.sqrt(dx ** 2 + dy ** 2)
end
|
|
530
|
+
|
|
531
|
+
# Find an interpolated point along a list of linestring vertices
|
|
532
|
+
# proportional to the given fractional distance along the line.
|
|
533
|
+
# Returns the [lon, lat] point lying +fraction+ (0.0..1.0) of the way along
# the polyline +points+, rounded to 6 decimal places. Distances are measured
# with #distance. A fraction of exactly 0.0 or 1.0 returns the first or last
# vertex unrounded. If the walk runs off the end of the line (fewer than two
# points, or rounding error in the running total) the last vertex is
# returned; previously the method returned the loop's Range object.
def interpolate(points, fraction)
  $stderr.print "POINTS: #{points.inspect}" if @debug
  return points[0] if fraction == 0.0
  return points[-1] if fraction == 1.0

  segments = (1...points.length).map { |n| [points[n - 1], points[n]] }
  steps = segments.map { |from, to| distance(from, to) }
  target = steps.inject(0.0) { |sum, step| sum + step } * fraction

  segments.zip(steps).each do |(from, to), step|
    if step < target
      target -= step
      next
    end
    # A zero-length segment has no direction; stay on its start vertex
    # instead of dividing by zero.
    ratio = step.zero? ? 0.0 : target / step
    # +ratio+ is a unitless share of this segment, so it applies to the raw
    # coordinate deltas. The longitude scale is already accounted for in
    # +step+; multiplying dx by it again pulled the point off the segment.
    found = [from[0] + (to[0] - from[0]) * ratio,
             from[1] + (to[1] - from[1]) * ratio]
    return found.map { |x| format("%.6f", x).to_f }
  end
  points[-1]
end
|
|
554
|
+
|
|
555
|
+
# Find and replace the city, state, and county information
|
|
556
|
+
# in a list of candidates with the primary place information
|
|
557
|
+
# for the ZIP codes in the candidate list.
|
|
558
|
+
def canonicalize_places!(candidates)
  zips_used = unique_values(candidates, :zip)
  # rows_to_h keys are arrays of key values, hence the [[zip]] lookup below.
  pri_places = rows_to_h primary_places(zips_used), :zip
  candidates.map! do |record|
    current_places = pri_places[[record[:zip]]]
    # FIXME: this should never happen!
    # Keep the record unchanged when its ZIP has no place rows. The old
    # `return []` here aborted map! part-way, leaving +candidates+ as a mix
    # of rewritten arrays and untouched records.
    next [record] unless current_places
    # The lowest :priority value is treated as the primary place.
    top_priority = current_places.map { |p| p[:priority] }.min
    current_places.select { |p| p[:priority] == top_priority }.map do |p|
      record.merge({
        :city => p[:city],
        :state => p[:state],
        :fips_county => p[:fips_county]
      })
    end
  end
  candidates.flatten!
end
|
|
576
|
+
|
|
577
|
+
# Clean up a candidate record by formatting the score, replacing nil
|
|
578
|
+
# values with empty strings, and deleting artifacts from database
|
|
579
|
+
# queries.
|
|
580
|
+
def clean_record!(record)
  unless record[:score].nil?
    record[:score] = format("%.3f", record[:score]).to_f
  end
  record.keys.each { |k| record[k] = "" if record[k].nil? } # clean up nils
  record.delete :components unless @debug
  # Drop integer keys and the internal query columns. Integer replaces
  # Fixnum, which no longer exists in current Ruby versions.
  record.delete_if do |k, _v|
    k.is_a?(Integer) ||
      [:geometry, :side, :tlid, :fid, :fid1, :fid2, :street_phone,
       :city_phone, :fromhn, :tohn, :paflag, :flipped, :street_score,
       :city_score, :priority, :fips_class, :fips_place, :status].include?(k)
  end
end
|
|
590
|
+
|
|
591
|
+
# Score the place rows against the address, keep the top-scoring ones,
# optionally canonicalize them, and return one cleaned record per
# (city, state) with :precision set to :zip or :city.
def best_places(address, places, canonicalize=false)
  return [] if places.empty?
  score_candidates! address, places
  best_candidates! places
  canonicalize_places! places if canonicalize

  # uniqify places: one record per (city, state), lowest ZIP first
  by_name = rows_to_h(places, :city, :state)
  by_name.values.each do |group|
    begin
      group.sort! { |a, b| a[:zip] <=> b[:zip] }
    rescue ArgumentError, NoMethodError
      # Best effort: ZIPs that cannot be compared (e.g. nil next to a
      # string) leave this group in its existing order. Other errors are no
      # longer swallowed, and one bad group no longer stops the rest from
      # being sorted.
    end
  end
  places = by_name.map { |_name, group| group[0] }

  places.each { |record| clean_record! record }
  places.each do |record|
    record[:precision] = (record[:zip] == address.zip ? :zip : :city)
  end
  places
end
|
|
617
|
+
|
|
618
|
+
# Given an Address object, return a list of possible geocodes by place
|
|
619
|
+
# name. If canonicalize is true, attempt to return the "primary" postal
|
|
620
|
+
# place name for the given city, state, or ZIP.
|
|
621
|
+
def geocode_place(address, canonicalize=false)
  places = []
  zip = address.zip
  # Query by ZIP only when one is present. The old test
  # (!zip.empty? or !zip.nil?) raised on a nil zip and still ran the query
  # for an empty one.
  places = places_by_zip(address.text, zip) if zip && !zip.empty?
  # Fall back to the city-name lookup when the ZIP gave nothing.
  places = places_by_city(address.text, address.city_parts, address.state) if places.empty?
  best_places address, places, canonicalize
end
|
|
627
|
+
|
|
628
|
+
# Geocode an address that names a street intersection.
#
# Candidates come from find_candidates. When there are none, [] is
# returned; when the first candidate has no :street value the work is
# delegated to best_places. Otherwise the candidates are grouped by :fid,
# the crossing rows returned by intersections_by_fid are merged with their
# first feature, and each row receives :street1, :street2, :lon, :lat,
# :precision (:intersection) and an averaged :street_score. After scoring
# and filtering, the rows are grouped by (:lon, :lat) and the first row of
# each group is kept, optionally canonicalized, cleaned and returned.
def geocode_intersection(address, canonical_place = false)
  found = find_candidates(address)
  return [] if found.empty?
  return best_places(address, found, canonical_place) if found[0][:street].nil?

  by_fid = rows_to_h(found, :fid)
  crossings = intersections_by_fid(by_fid.keys.flatten)
  crossings.map! do |crossing|
    # rows_to_h keys are looked up as one-element arrays here
    first_feat, second_feat = crossing.values_at(:fid1, :fid2).map do |fid|
      by_fid[[fid]][0]
    end
    crossing.merge!(first_feat)
    crossing[:street1] = crossing.delete(:street)
    crossing[:street2] = second_feat[:street]
    point = unpack_geometry(crossing.delete(:point))[0]
    crossing[:lon], crossing[:lat] = point
    crossing[:precision] = :intersection
    score_sum = first_feat[:street_score].to_f + second_feat[:street_score].to_f
    crossing[:street_score] = score_sum / 2
    crossing
  end

  score_candidates!(address, crossings)
  best_candidates!(crossings)

  # keep one row per distinct coordinate pair
  unique = rows_to_h(crossings, :lon, :lat).values.map { |group| group[0] }

  canonicalize_places!(unique) if canonical_place
  unique.each { |row| clean_record!(row) }
  unique
end
|
|
657
|
+
|
|
658
|
+
# Given an Address object, return a list of possible geocodes by address
# range interpolation. If canonical_place is true, canonicalize_places! is
# applied to the final candidates before they are cleaned.
#
# Returns [] when find_candidates yields nothing, and delegates to
# best_places when the first candidate has no :street value. Otherwise
# each returned record has :lon and :lat assigned from interpolate.
def geocode_address(address, canonical_place = false)
  matches = find_candidates(address)
  return [] if matches.empty?
  return best_places(address, matches, canonical_place) if matches[0][:street].nil?

  # first pass: rank on the data already fetched
  score_candidates!(address, matches)
  best_candidates!(matches)

  # second pass: rank again once add_ranges! has run
  add_ranges!(address, matches)
  score_candidates!(address, matches)
  best_candidates!(matches)

  # sometimes multiple fids match the same tlid
  matches = rows_to_h(matches, :tlid).values.map { |group| group[0] }

  if address.number.empty?
    # if no number is assigned in the query, only return one
    # result for each street/zip combo
    matches = rows_to_h(matches, :street, :zip).values.map { |group| group[0] }
    merge_edges!(matches)
  else
    extend_ranges!(matches)
  end

  matches.each do |row|
    offset = interpolation_distance(row)
    $stderr.print "DIST: #{offset}\n" if @debug
    line = unpack_geometry(row[:geometry])
    line.reverse! if row[:flipped]
    row[:lon], row[:lat] = interpolate(line, offset)
  end

  canonicalize_places!(matches) if canonical_place

  matches.each { |row| clean_record!(row) }
  matches
end
|
|
702
|
+
|
|
703
|
+
public
|
|
704
|
+
|
|
705
|
+
# Geocode a given address or place name string. info_to_geocode is handed
# to Address.new for parsing. If canonical_place is true it is forwarded to
# the lookup helpers, which then attempt to return the "primary" street and
# place names, if they are different from the ones given.
#
# Returns possible candidate matches as a list of hashes, or [] when the
# parsed address has neither a city nor a ZIP.
#
# Lookups are tried in this order:
#   1. place lookup, for a PO box that has a ZIP;
#   2. intersection lookup, for an intersection with a street and no
#      number (its result replaces whatever step 1 produced);
#   3. address lookup, when nothing was found yet and a street is present;
#   4. place lookup, when nothing was found yet.
#
# * The :lat and :lon values of each hash store the range-interpolated
#   address coordinates as latitude and longitude in the WGS84 spheroid.
# * The :precision value may be one of :city, :zip, :street, or :range, in
#   order of increasing precision; intersection matches are tagged
#   :intersection.
# * The :score value will be a float between 0.0 and 1.0 representing
#   the approximate "goodness" of the candidate match.
# * The other values in the hash will represent various structured
#   components of the address and place name.
#
# When @debug is set, the parsed address and the elapsed time are written
# to $stderr.
def geocode(info_to_geocode, canonical_place = false)
  address = Address.new(info_to_geocode)
  $stderr.print "ADDR: #{address.inspect}\n" if @debug
  return [] if address.city.empty? && address.zip.empty?

  started_at = Time.now if @debug
  results = []

  results = geocode_place(address, canonical_place) if address.po_box? && !address.zip.empty?

  if address.intersection? && !address.street.empty? && address.number.empty?
    results = geocode_intersection(address, canonical_place)
  end

  results = geocode_address(address, canonical_place) if results.empty? && !address.street.empty?

  # last resort: fall back to a place-level match
  results = geocode_place(address, canonical_place) if results.empty?

  if @debug
    elapsed = format("%.3f", Time.now - started_at)
    $stderr.print "DONE: #{elapsed}s\n"
  end
  results
end
|
|
744
|
+
end
|
|
745
|
+
end
|