geo_coder 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (119) hide show
  1. data/Gemfile +12 -0
  2. data/Gemfile.lock +32 -0
  3. data/History.txt +6 -0
  4. data/Makefile +13 -0
  5. data/Manifest.txt +18 -0
  6. data/README.rdoc +197 -0
  7. data/Rakefile +53 -0
  8. data/TODO.txt +8 -0
  9. data/VERSION +1 -0
  10. data/bin/build_indexes +8 -0
  11. data/bin/rebuild_cluster +22 -0
  12. data/bin/rebuild_metaphones +23 -0
  13. data/bin/tiger_import +59 -0
  14. data/demos/demo/app/ext/geocodewrap.rb +84 -0
  15. data/demos/demo/app/views/index.builder +13 -0
  16. data/demos/demo/app/views/index.erb +71 -0
  17. data/demos/demo/config.ru +12 -0
  18. data/demos/demo/config/bootstraps.rb +130 -0
  19. data/demos/demo/config/geoenvironment.rb +25 -0
  20. data/demos/demo/geocoder_helper.rb +12 -0
  21. data/demos/demo/geocom_geocode.rb +10 -0
  22. data/demos/demo/main.rb +3 -0
  23. data/demos/demo/rakefile.rb +17 -0
  24. data/demos/demo/tmp/restart.txt +0 -0
  25. data/demos/simpledemo/views/index.builder +13 -0
  26. data/demos/simpledemo/views/index.erb +69 -0
  27. data/demos/simpledemo/ws.rb +83 -0
  28. data/doc/Makefile +7 -0
  29. data/doc/html4css1.css +279 -0
  30. data/doc/lookup.rst +193 -0
  31. data/doc/parsing.rst +125 -0
  32. data/doc/voidspace.css +147 -0
  33. data/geo_coder.gemspec +172 -0
  34. data/lib/geocoder/us.rb +21 -0
  35. data/lib/geocoder/us/address.rb +290 -0
  36. data/lib/geocoder/us/constants.rb +670 -0
  37. data/lib/geocoder/us/database.rb +745 -0
  38. data/lib/geocoder/us/import.rb +181 -0
  39. data/lib/geocoder/us/import/tiger.rb +13 -0
  40. data/lib/geocoder/us/numbers.rb +58 -0
  41. data/navteq/README +4 -0
  42. data/navteq/convert.sql +37 -0
  43. data/navteq/navteq_import +39 -0
  44. data/navteq/prepare.sql +92 -0
  45. data/sql/cluster.sql +16 -0
  46. data/sql/convert.sql +80 -0
  47. data/sql/create.sql +37 -0
  48. data/sql/index.sql +12 -0
  49. data/sql/place.csv +104944 -0
  50. data/sql/place.sql +104948 -0
  51. data/sql/setup.sql +78 -0
  52. data/src/Makefile +13 -0
  53. data/src/README +14 -0
  54. data/src/liblwgeom/Makefile +75 -0
  55. data/src/liblwgeom/box2d.c +54 -0
  56. data/src/liblwgeom/lex.yy.c +4799 -0
  57. data/src/liblwgeom/liblwgeom.h +1405 -0
  58. data/src/liblwgeom/lwalgorithm.c +946 -0
  59. data/src/liblwgeom/lwalgorithm.h +52 -0
  60. data/src/liblwgeom/lwcircstring.c +759 -0
  61. data/src/liblwgeom/lwcollection.c +541 -0
  62. data/src/liblwgeom/lwcompound.c +118 -0
  63. data/src/liblwgeom/lwcurvepoly.c +86 -0
  64. data/src/liblwgeom/lwgeom.c +886 -0
  65. data/src/liblwgeom/lwgeom_api.c +2201 -0
  66. data/src/liblwgeom/lwgparse.c +1219 -0
  67. data/src/liblwgeom/lwgunparse.c +1054 -0
  68. data/src/liblwgeom/lwline.c +525 -0
  69. data/src/liblwgeom/lwmcurve.c +125 -0
  70. data/src/liblwgeom/lwmline.c +137 -0
  71. data/src/liblwgeom/lwmpoint.c +138 -0
  72. data/src/liblwgeom/lwmpoly.c +141 -0
  73. data/src/liblwgeom/lwmsurface.c +129 -0
  74. data/src/liblwgeom/lwpoint.c +439 -0
  75. data/src/liblwgeom/lwpoly.c +579 -0
  76. data/src/liblwgeom/lwsegmentize.c +1047 -0
  77. data/src/liblwgeom/lwutil.c +369 -0
  78. data/src/liblwgeom/measures.c +861 -0
  79. data/src/liblwgeom/postgis_config.h +93 -0
  80. data/src/liblwgeom/ptarray.c +847 -0
  81. data/src/liblwgeom/vsprintf.c +179 -0
  82. data/src/liblwgeom/wktparse.h +126 -0
  83. data/src/liblwgeom/wktparse.lex +74 -0
  84. data/src/liblwgeom/wktparse.tab.c +2353 -0
  85. data/src/liblwgeom/wktparse.tab.h +145 -0
  86. data/src/liblwgeom/wktparse.y +385 -0
  87. data/src/libsqlite3_geocoder/Makefile +22 -0
  88. data/src/libsqlite3_geocoder/Makefile.nix +15 -0
  89. data/src/libsqlite3_geocoder/Makefile.redhat +15 -0
  90. data/src/libsqlite3_geocoder/extension.c +121 -0
  91. data/src/libsqlite3_geocoder/extension.h +13 -0
  92. data/src/libsqlite3_geocoder/levenshtein.c +42 -0
  93. data/src/libsqlite3_geocoder/metaphon.c +278 -0
  94. data/src/libsqlite3_geocoder/util.c +37 -0
  95. data/src/libsqlite3_geocoder/wkb_compress.c +54 -0
  96. data/src/metaphone/Makefile +7 -0
  97. data/src/metaphone/README +49 -0
  98. data/src/metaphone/extension.c +37 -0
  99. data/src/metaphone/metaphon.c +251 -0
  100. data/src/shp2sqlite/Makefile +37 -0
  101. data/src/shp2sqlite/Makefile.nix +36 -0
  102. data/src/shp2sqlite/Makefile.redhat +35 -0
  103. data/src/shp2sqlite/dbfopen.c +1595 -0
  104. data/src/shp2sqlite/getopt.c +695 -0
  105. data/src/shp2sqlite/getopt.h +127 -0
  106. data/src/shp2sqlite/shapefil.h +500 -0
  107. data/src/shp2sqlite/shp2sqlite.c +1974 -0
  108. data/src/shp2sqlite/shpopen.c +1894 -0
  109. data/tests/address.rb +236 -0
  110. data/tests/benchmark.rb +20 -0
  111. data/tests/constants.rb +57 -0
  112. data/tests/data/address-sample.csv +52 -0
  113. data/tests/data/db-test.csv +57 -0
  114. data/tests/data/locations.csv +4 -0
  115. data/tests/database.rb +137 -0
  116. data/tests/generate.rb +34 -0
  117. data/tests/numbers.rb +46 -0
  118. data/tests/run.rb +11 -0
  119. metadata +237 -0
@@ -0,0 +1,745 @@
1
+ require 'rubygems'
2
+ require 'sqlite3'
3
+ require 'text'
4
+
5
+ require 'set'
6
+ require 'pp'
7
+ require 'time'
8
+ require 'thread'
9
+
10
+ require 'geocoder/us/address'
11
+
12
+ module Geocoder
13
+ end
14
+
15
+ module Geocoder::US
16
+ # Provides an interface to a Geocoder::US database.
17
+ class Database
18
+ Street_Weight = 3.0
19
+ Number_Weight = 2.0
20
+ Parity_Weight = 1.25
21
+ City_Weight = 1.0
22
+ @@mutex = Mutex.new
23
+
24
# Open a Geocoder::US database.
#
# filename - path of an SQLite 3 database prepared for Geocoder::US.
# options  - optional Hash merged over these defaults:
#            :debug      - trace SQL, parameters and timings to $stderr (false)
#            :cache_size - SQLite cache size, handed to tune; the original
#                          notes describe it as kilobytes (50000)
#            :helper     - path of the Geocoder::US SQLite plugin, handed
#                          to tune ("sqlite3.so")
#            :threadsafe - when false, database access is serialized through
#                          a class-wide mutex (false)
#            :create     - allow a database file that does not exist yet (false)
#
# Raises ArgumentError when the file is missing and :create is not set.
def initialize(filename, options = {})
  defaults = { :debug => false, :cache_size => 50000,
               :helper => "sqlite3.so", :threadsafe => false,
               :create => false }
  options = defaults.merge(options)
  # File.exists? was deprecated and later removed from Ruby; File.exist?
  # is available on every version.
  unless options[:create] || File.exist?(filename)
    raise ArgumentError, "can't find database #{filename}"
  end
  @db = SQLite3::Database.new(filename)
  @st = {}  # cache of prepared statements, keyed by SQL text
  @debug = options[:debug]
  @threadsafe = options[:threadsafe]
  tune options[:helper], options[:cache_size]
end
43
+
44
# Run the given block. Unless the instance was opened with a truthy
# :threadsafe option, the block runs while holding the class-wide mutex.
def synchronize
  return yield if @threadsafe
  @@mutex.synchronize { yield }
end
51
+
52
+ #private
53
+
54
# Register the SQL helper functions and tune the database settings.
# q.v. http://web.utk.edu/~jplyon/sqlite/SQLite_optimization_FAQ.html
#
# helper     - path of the native SQLite plugin. Currently unused: loading
#              the extension is disabled and pure-Ruby equivalents of its
#              functions are registered instead.
# cache_size - value assigned to the connection's cache_size setting.
def tune(helper, cache_size)
  synchronize do
    # levenshtein(a, b): edit distance between the two strings (non-word
    # characters removed, lower-cased), divided by the longer length.
    @db.create_function("levenshtein", 2) do |func, word1, word2|
      test1, test2 = [word1, word2].map { |w| w.to_s.gsub(/\W/, "").downcase }
      longest = [test1.length, test2.length].max
      # Two empty strings are identical; dividing would give 0.0/0 = NaN.
      func.result =
        if longest.zero?
          0.0
        else
          Text::Levenshtein.distance(test1, test2).to_f / longest
        end
    end
    # metaphone(string, len): first len characters of the metaphone key.
    # Leading digits and a lone "w"/"y" are passed through unchanged.
    @db.create_function("metaphone", 2) do |func, string, len|
      test = string.to_s.gsub(/\W/, "")
      mph = if test =~ /^(\d+)/
              $1
            elsif test =~ /^([wy])$/i
              $1
            else
              Text::Metaphone.metaphone test
            end
      func.result = mph[0...len.to_i]
    end
    # nondigit_prefix(string): everything before a trailing run of digits.
    @db.create_function("nondigit_prefix", 1) do |func, string|
      string.to_s =~ /^(.*\D)?(\d+)$/
      func.result = ($1 || "")
    end
    # digit_suffix(string): the trailing run of digits, or "".
    @db.create_function("digit_suffix", 1) do |func, string|
      string.to_s =~ /^(.*\D)?(\d+)$/
      func.result = ($2 || "")
    end
    # Native helper loading, kept for reference:
    #@db.enable_load_extension(1)
    #@db.load_extension(helper)
    #@db.enable_load_extension(0)
    @db.cache_size = cache_size
    @db.temp_store = "memory"
    @db.synchronous = "off"
  end
end
93
+
94
# Look up the prepared statement for the given SQL text in the statement
# cache, compiling and storing it on first use. In debug mode the SQL is
# echoed to $stderr.
def prepare(sql)
  $stderr.print "SQL : #{sql}\n" if @debug
  synchronize { @st[sql] ||= @db.prepare(sql) }
  @st[sql]
end
103
+
104
# Forget every cached prepared statement by replacing the cache with
# a fresh, empty hash.
def flush_statements
  @st = Hash.new
end
107
+
108
# Build a comma-separated string of "?" bind markers, one per
# element of list.
def placeholders_for(list)
  Array.new(list.length, "?").join(",")
end
112
+
113
# Build a comma-separated string of "metaphone(?,5)" expressions, one per
# element of list, so each bound value is passed through the SQL
# metaphone function.
def metaphone_placeholders_for(list)
  Array.new(list.length) { "metaphone(?,5)" }.join(",")
end
117
+
118
# Run an SQL string with the given bind parameters and return the rows
# as a list of hashes. The statement comes from the prepared-statement
# cache (see prepare).
def execute(sql, *params)
  execute_statement(prepare(sql), *params)
end
124
+
125
+
126
# Run a prepared statement with the given bind parameters and return
# its rows as a list of hashes keyed by column name (as symbols).
#
# Note that the list is reversed before it is returned, so rows come
# back in the opposite order from the one the statement produced.
# In debug mode the parameters, row count and run time go to $stderr.
def execute_statement(st, *params)
  started = nil
  if @debug
    started = Time.now
    $stderr.print "EXEC: #{params.inspect}\n" unless params.empty?
  end
  rows = []
  synchronize do
    result = st.execute(*params)
    names = result.columns.map { |c| c.to_sym }
    result.each do |row|
      pairs = names.zip(row).flatten
      rows << Hash[*pairs]
    end
  end
  if @debug
    elapsed = format("%.3f", Time.now - started)
    $stderr.print "ROWS: #{rows.length} (#{elapsed}s)\n"
  end
  rows.reverse!
end
148
+
149
# Fetch at most one place row for a ZIP code: of the rows with that ZIP,
# the first when ordered by descending priority. The row carries a
# city_score column, levenshtein(city argument, place city).
def places_by_zip(city, zip)
  sql = "SELECT *, levenshtein(?, city) AS city_score
    FROM place WHERE zip = ? order by priority desc LIMIT 1;"
  execute(sql, city, zip)
end
153
+
154
# Query the place table by city name and optional state.
#
# city   - city text scored against each place via levenshtein (nil is
#          treated as "").
# tokens - candidate city tokens; a place matches when its city_phone
#          equals the 5-character metaphone of any token.
# state  - state to restrict to; ignored when nil or empty.
#
# Returns at most one row (highest priority first) with a city_score column.
def places_by_city(city, tokens, state)
  city = "" if city.nil?
  args = [city] + tokens.clone
  if state.nil? || state.empty?
    and_state = ""
  else
    and_state = "AND state = ?"
    args << state
  end
  metaphones = metaphone_placeholders_for tokens
  execute("SELECT *, levenshtein(?, city) AS city_score
    FROM place WHERE city_phone IN (#{metaphones}) #{and_state} order by priority desc LIMIT 1;", *args)
end
172
+
173
# Build the base query against the feature table for a street name.
# A feature matches when its street_phone equals the 5-character
# metaphone of any token; each row gets a street_score column,
# levenshtein(street argument, feature street).
#
# Nothing is executed here. features_by_street_and_zip and
# more_features_by_street_and_zip append their ZIP filters to the result.
#
# Returns a two-element list: the SQL text and its bind parameters.
def features_by_street(street, tokens)
  metaphones = Array.new(tokens.length, "metaphone(?,5)").join(",")
  sql = "
    SELECT feature.*, levenshtein(?, street) AS street_score
    FROM feature
    WHERE street_phone IN (#{metaphones})"
  [sql, [street] + tokens]
end
186
+
187
# Query the feature table for streets matching the given name/tokens
# (see features_by_street) whose ZIP code is one of zips.
def features_by_street_and_zip(street, tokens, zips)
  base_sql, base_params = features_by_street(street, tokens)
  sql = base_sql + " AND feature.zip IN (#{placeholders_for zips})"
  execute sql, *(base_params + zips)
end
198
+
199
# Broader variant of features_by_street_and_zip: the candidate ZIP codes
# are reduced to their distinct 3-digit prefixes and features are matched
# with "zip LIKE 'NNN%'", widening the search area. When no usable ZIP is
# given, the street query runs without any ZIP filter.
#
# Changes from the earlier version: a leftover unconditional `puts` was
# replaced by a debug-only trace on $stderr, nil ZIPs are skipped instead
# of raising, and the query goes through execute like the sibling queries
# (synchronized, cached prepare).
def more_features_by_street_and_zip(street, tokens, zips)
  sql, params = features_by_street(street, tokens)
  zip3s = zips.compact.map { |z| z[0..2] + '%' }.uniq
  unless zip3s.empty?
    $stderr.print "ZIP3: #{zip3s.inspect}\n" if @debug
    like_list = (["feature.zip LIKE ?"] * zip3s.length).join(" OR ")
    sql += " AND (#{like_list})"
    params += zip3s
  end
  execute sql, *params
end
215
+
216
# Query the range table (joined through feature_edge) for the address
# ranges belonging to the given feature IDs.
#
# fids   - feature IDs to look up.
# number - building number; rows are ordered by how close either end of
#          the range is to it.
# prenum - when not nil, only ranges with this prenum value are returned.
#
# At most four rows per feature ID are requested.
def ranges_by_feature(fids, number, prenum)
  params = fids.clone
  sql = "
    SELECT feature_edge.fid AS fid, range.*
    FROM feature_edge, range
    WHERE fid IN (#{placeholders_for fids})
    AND feature_edge.tlid = range.tlid"
  unless prenum.nil?
    sql << " AND prenum = ?"
    params.push(prenum)
  end
  sql << "
    ORDER BY min(abs(fromhn - ?), abs(tohn - ?))
    LIMIT #{4 * fids.length};"
  params.push(number, number)
  execute sql, *params
end
235
+
236
# Fetch the edge rows whose tlid is in the given list of edge IDs.
def edges(edge_ids)
  markers = placeholders_for edge_ids
  execute "SELECT edge.* FROM edge WHERE edge.tlid IN (#{markers})", *edge_ids
end
242
+
243
# Query the range table for the overall address range on each side of
# each of the given edges.
#
# For every (tlid, side) group the query computes the extreme house
# numbers and a "flipped" flag (min(fromhn) > min(tohn)). Each returned
# row has :flipped as a real boolean and :fromhn/:tohn taken from the
# matching pair of extremes; the helper columns are removed.
def range_ends(edge_ids)
  in_list = placeholders_for edge_ids
  sql = "SELECT tlid, side,
    min(fromhn) > min(tohn) AS flipped,
    min(fromhn) AS from0, max(tohn) AS to0,
    min(tohn) AS from1, max(fromhn) AS to1
    FROM range WHERE tlid IN (#{in_list})
    GROUP BY tlid, side;"
  execute(sql, *edge_ids).map do |r|
    # Older SQLite bindings return every column as a String ("0"), newer
    # ones return native Integers (0). Comparing against "0" alone would
    # mark every range as flipped on the latter.
    flipped = ![0, "0"].include?(r[:flipped])
    ends = flipped ? [:from1, :to1] : [:from0, :to0]
    r[:flipped] = flipped
    r[:fromhn], r[:tohn] = r.values_at(*ends)
    [:from0, :to0, :from1, :to1].each { |k| r.delete k }
    r
  end
end
265
+
266
# Find the points at which the given features meet.
#
# The first and last 8 bytes of each edge geometry are collected per
# feature into a temporary "intersection" table, which is then
# self-joined on equal points. Only pairs of features in the same ZIP
# with paflag 'P' are returned, as rows of :fid1, :fid2 and :point.
#
# The temporary table is dropped and the statement cache flushed in an
# ensure clause, so a failing query no longer leaves the table behind to
# break the next call.
#
# NOTE(review): the first SQL string holds two statements; SQLite
# prepares only the first one of a string, so the CREATE INDEX probably
# never runs -- confirm.
def intersections_by_fid(fids)
  in_list = placeholders_for fids
  sql = "
    CREATE TEMPORARY TABLE intersection AS
    SELECT fid, substr(geometry,1,8) AS point
    FROM feature_edge, edge
    WHERE feature_edge.tlid = edge.tlid
    AND fid IN (#{in_list})
    UNION
    SELECT fid, substr(geometry,length(geometry)-7,8) AS point
    FROM feature_edge, edge
    WHERE feature_edge.tlid = edge.tlid
    AND fid IN (#{in_list});
    CREATE INDEX intersect_pt_idx ON intersection (point);"
  begin
    execute sql, *(fids + fids)
    # the a.fid < b.fid inequality guarantees consistent ordering of street
    # names in the output
    execute "
      SELECT a.fid AS fid1, b.fid AS fid2, a.point
      FROM intersection a, intersection b, feature f1, feature f2
      WHERE a.point = b.point AND a.fid < b.fid
      AND f1.fid = a.fid AND f2.fid = b.fid
      AND f1.zip = f2.zip
      AND f1.paflag = 'P' AND f2.paflag = 'P';"
  ensure
    execute "DROP TABLE IF EXISTS intersection;"
    flush_statements # the CREATE/DROP TABLE invalidates prepared statements
  end
end
294
+
295
# Fetch every place row for the given ZIP codes, ordered by descending
# priority in the query. Callers use these as the notional "primary"
# place names for a ZIP. The place table is bespoke, built from several
# public domain sources, so the primary name is not always the "right" one.
def primary_places(zips)
  markers = placeholders_for zips
  execute "SELECT * FROM place WHERE zip IN (#{markers}) order by priority desc;", *zips
end
304
+
305
# Collect the distinct values stored under key across a list of rows,
# in order of first appearance.
def unique_values(rows, key)
  rows.map { |row| row[key] }.uniq
end
309
+
310
# Group a list of rows by the values they hold for the given keys.
# The result maps each distinct list of key values (always an Array,
# even for a single key) to the list of rows sharing it.
def rows_to_h(rows, *keys)
  rows.group_by { |row| row.values_at(*keys) }
end
316
+
317
# Merge the rows of src into the rows of dest in place, matching rows
# that hold equal values for the given keys. A dest row with several
# matching src rows is replaced by one merged row per match; a dest row
# without a match is kept unchanged.
def merge_rows!(dest, src, *keys)
  lookup = src.group_by { |row| row.values_at(*keys) }
  dest.map! do |row|
    matches = lookup[row.values_at(*keys)]
    matches ? matches.map { |other| row.merge(other) } : [row]
  end
  dest.flatten!
end
332
+
333
# Find candidate rows for an Address.
#
# Places are looked up by ZIP first and by city tokens/state otherwise.
# With no street in the address the place rows themselves are returned.
# Otherwise features matching the street are looked up within the places'
# ZIP codes (falling back to the 3-digit ZIP prefix search) and merged
# with the place rows on :zip.
#
# Side effect: address.city is replaced by the distinct city names of the
# places found.
def find_candidates(address)
  places = []

  city = address.city.sort { |a, b| a.length <=> b.length }[0]
  # nil has to be ruled out before calling empty?; the earlier order
  # raised NoMethodError on a nil ZIP.
  zip = address.zip
  places = places_by_zip(city, zip) if zip && !zip.empty?
  places = places_by_city(city, address.city_parts, address.state) if places.empty?
  return [] if places.empty?

  address.city = unique_values places, :city
  return places if address.street.empty?

  zips = unique_values places, :zip
  street = address.street.sort { |a, b| a.length <=> b.length }[0]
  candidates = features_by_street_and_zip street, address.street_parts, zips
  if candidates.empty?
    candidates = more_features_by_street_and_zip street, address.street_parts, zips
  end

  merge_rows! candidates, places, :zip
  candidates
end
359
+
360
# Assign :number and :precision to every candidate, given the queried
# building number hn (nil counts as 0).
#
# When hn lies within the candidate's :fromhn..:tohn range (in either
# direction), :number becomes hn and :precision becomes :range.
# Otherwise :number becomes whichever end of the range is numerically
# closer to hn and :precision becomes :street. :number is always stored
# as a string.
def assign_number!(hn, candidates)
  hn ||= 0
  candidates.each do |candidate|
    low, high = candidate[:fromhn].to_i, candidate[:tohn].to_i
    if (low <= hn && hn <= high) || (high <= hn && hn <= low)
      candidate[:number] = hn.to_s
      candidate[:precision] = :range
    else
      nearest = (hn - low).abs < (hn - high).abs ? :fromhn : :tohn
      candidate[:number] = candidate[nearest].to_s
      candidate[:precision] = :street
    end
  end
end
379
+
380
# Attach address ranges to the candidates: look up the ranges for the
# candidates' feature IDs (first restricted to the address's prenum,
# then without that restriction if nothing was found), merge them in on
# :fid, and assign :number/:precision from the address's building number.
def add_ranges!(address, candidates)
  number = address.number.to_i
  fids = unique_values candidates, :fid
  ranges = ranges_by_feature fids, number, address.prenum
  ranges = ranges_by_feature(fids, number, nil) if ranges.empty?
  merge_rows! candidates, ranges, :fid
  assign_number! number, candidates
end
388
+
389
# Merge the edge rows for the candidates' :tlid values into the
# candidates and drop candidates that have no :tlid.
#
# Returns the list of distinct edge IDs collected before the merge.
def merge_edges!(candidates)
  edge_ids = unique_values candidates, :tlid
  merge_rows! candidates, edges(edge_ids), :tlid
  candidates.reject! { |record| record[:tlid].nil? }
  edge_ids
end
396
+
397
# Merge edge data into the candidates, then overlay each candidate with
# the full address range of its edge side (matched on :tlid and :side).
def extend_ranges!(candidates)
  full_ranges = range_ends(merge_edges!(candidates))
  merge_rows! candidates, full_ranges, :tlid, :side
end
402
+
403
# Score a list of candidates against the queried address. For each
# candidate:
# * The street and city components are (1.0 - the candidate's
#   :street_score / :city_score) times Street_Weight / City_Weight.
# * :prenum, :state and :zip each score 1 when the lower-cased query and
#   candidate values are equal (nil counts as ""), otherwise 0.
# * If the query has a building number:
#   ** the number component is Number_Weight when the candidate's
#      precision is :range, otherwise Number_Weight divided by the
#      distance between the assigned and queried numbers (only when a
#      number was assigned);
#   ** the parity component adds half of Parity_Weight for each end of
#      the range whose parity matches the query number (only when a
#      number was given *and* assigned);
#   ** both weights are added to the denominator.
# * :score is the total divided by the denominator; the individual
#   values are recorded under :components.
def score_candidates!(address, candidates)
  exact_keys = [:prenum, :state, :zip]
  candidates.each do |candidate|
    components = candidate[:components] = {}
    denominator = exact_keys.length + Street_Weight + City_Weight

    street_score = (1.0 - candidate[:street_score].to_f) * Street_Weight
    components[:street] = street_score
    city_score = (1.0 - candidate[:city_score].to_f) * City_Weight
    components[:city] = city_score
    score = street_score + city_score

    exact_keys.each do |key|
      wanted = address.send(key)
      wanted = wanted ? wanted.downcase : ""
      found = candidate[key]
      found = found ? found.downcase : ""
      item_score = wanted == found ? 1 : 0
      components[key] = item_score
      score += item_score
    end

    if address.number && !address.number.empty?
      parity = subscore = 0.0
      fromhn, tohn, assigned, hn =
        [candidate[:fromhn], candidate[:tohn], candidate[:number], address.number].map { |s| s.to_i }
      if candidate[:precision] == :range
        subscore += Number_Weight
      elsif assigned > 0
        # only credit number subscore if assigned
        subscore += Number_Weight / (assigned - hn).abs.to_f
      end
      components[:number] = subscore
      if hn > 0 && assigned > 0
        # only credit parity if a number was given *and* assigned
        parity += Parity_Weight / 2.0 if fromhn % 2 == hn % 2
        parity += Parity_Weight / 2.0 if tohn % 2 == hn % 2
      end
      components[:parity] = parity
      score += subscore + parity
      denominator += Number_Weight + Parity_Weight
    end

    components[:total] = score.to_f
    components[:denominator] = denominator
    candidate[:score] = score.to_f / denominator
  end
end
465
+
466
# Sort the candidates by descending :score, then remove every candidate
# scoring below the best one, leaving only those tied for the top score.
# Works in place and returns the list.
def best_candidates!(candidates)
  candidates.sort! { |a, b| b[:score] <=> a[:score] }
  top = candidates.empty? ? nil : candidates[0][:score]
  candidates.delete_if { |record| record[:score] < top }
end
473
+
474
# Compute how far along an edge the candidate's :number lies, as a
# fraction between 0.0 and 1.0 of its :fromhn..:tohn range (the two ends
# are swapped first if they are in descending order).
#
# Numbers below the range give 0.0 and numbers above it give 1.0. A range
# with equal ends also gives 0.0, the same value as any number sitting on
# the lower end; dividing there would produce 0/0.0 = NaN and NaN
# coordinates downstream.
def interpolation_distance(candidate)
  fromhn, tohn, number = candidate.values_at(:fromhn, :tohn, :number).map { |x| x.to_i }
  $stderr.print "NUM : #{fromhn} < #{number} < #{tohn} (flipped? #{candidate[:flipped]})\n" if @debug
  fromhn, tohn = tohn, fromhn if fromhn > tohn
  if fromhn > number
    0.0
  elsif tohn < number
    1.0
  elsif tohn == fromhn
    0.0
  else
    (number - fromhn) / (tohn - fromhn).to_f
  end
end
489
+
490
# Decode a geometry blob into a list of [x, y] coordinate pairs.
#
# Pete's notes on the format, worked out from the data because it differs
# from what the code originally expected:
#   | 1 byte Type | 4 byte SRID | 4 byte element count | 8 byte double coordinates *
# The three header fields are read and discarded. The older packed
# format (little-endian 4-byte ints scaled by 10^6, as written by
# compress_wkb_line() in the SQLite helper) is no longer read.
#
# The doubles are decoded explicitly as little-endian ("E"), matching the
# little-endian header fields, instead of in the host's native byte order.
#
# Returns [] for a nil geometry.
def unpack_geometry(geom)
  return [] if geom.nil?
  coords = geom.unpack('CVVE*').drop(3)
  coords.each_slice(2).to_a
end
513
+
514
# Longitude scaling factor for a pair of latitudes given in degrees:
# the cosine of their mean. This is an approximation used in place of
# lookup.rst (10e) and (10g); the original author questions whether it
# is needed at all.
def scale_lon(lat1, lat2)
  mean_latitude = (lat1 + lat2) / 2
  Math.cos(mean_latitude * Math::PI / 180)
end
522
+
523
# Euclidean distance between two 2-D coordinate pairs, with the
# difference in the first coordinate scaled by scale_lon of the two
# second coordinates.
def distance(a, b)
  east = (b[0] - a[0]) * scale_lon(a[1], b[1])
  north = b[1] - a[1]
  Math.sqrt(east ** 2 + north ** 2)
end
530
+
531
# Find the point lying the given fraction of the way along a linestring.
#
# points   - list of [x, y] vertices.
# fraction - 0.0 returns the first vertex and 1.0 the last, unrounded;
#            anything else walks the segments until the target length is
#            reached and returns that point rounded to six decimals.
#
# If the walk runs off the end (rounding, or fewer than two vertices),
# the last vertex is returned (nil for an empty list) instead of the
# loop's Range. A zero-length segment no longer produces NaN. The debug
# trace now ends with a newline like the other traces.
#
# NOTE(review): the x offset is multiplied by scale_lon again after the
# segment lengths were already scaled -- confirm this is intended.
def interpolate(points, fraction)
  $stderr.print "POINTS: #{points.inspect}\n" if @debug
  return points[0] if fraction == 0.0
  return points[-1] if fraction == 1.0
  steps = (1...points.length).map { |n| distance(points[n-1], points[n]) }
  target = steps.inject(0.0) { |sum, step| sum + step } * fraction
  steps.each_with_index do |step, i|
    if step < target
      target -= step
      next
    end
    from, to = points[i], points[i+1]
    ratio = step.zero? ? 0.0 : target / step
    scale = scale_lon(to[1], from[1])
    dx = (to[0] - from[0]) * ratio * scale
    dy = (to[1] - from[1]) * ratio
    return [from[0] + dx, from[1] + dy].map { |x| format("%.6f", x).to_f }
  end
  points[-1]
end
554
+
555
# Replace the city, state, and county information of each candidate with
# that of the primary place rows for the candidate's ZIP code. The
# primary rows of a ZIP are those holding the minimum :priority among the
# rows returned by primary_places; a candidate with several such rows is
# expanded into one record per row.
#
# A candidate whose ZIP has no place row is now kept unchanged. The
# earlier version returned from inside map! at that point, which left the
# list half converted (some entries replaced by arrays) for callers that
# carry on using it.
def canonicalize_places!(candidates)
  zips_used = unique_values(candidates, :zip)
  pri_places = rows_to_h primary_places(zips_used), :zip
  candidates.map! do |record|
    current_places = pri_places[[record[:zip]]]
    if current_places
      top_priority = current_places.map { |p| p[:priority] }.min
      current_places.select { |p| p[:priority] == top_priority }.map do |p|
        record.merge({
          :city => p[:city],
          :state => p[:state],
          :fips_county => p[:fips_county]
        })
      end
    else
      # FIXME: this should never happen!
      [record]
    end
  end
  candidates.flatten!
end
576
+
577
# Tidy a candidate record in place before it is handed to the caller:
# * round :score to three decimals,
# * replace nil values with empty strings,
# * drop :components unless debugging,
# * drop integer keys and the columns that are only artifacts of the
#   database queries.
#
# Integer is used for the key test: Fixnum was folded into Integer and
# later removed from Ruby, and Integer also covers it on older versions.
#
# Returns the record.
def clean_record!(record)
  unless record[:score].nil?
    record[:score] = format("%.3f", record[:score]).to_f
  end
  record.keys.each { |k| record[k] = "" if record[k].nil? } # clean up nils
  record.delete :components unless @debug
  artifacts = [:geometry, :side, :tlid, :fid, :fid1, :fid2, :street_phone,
               :city_phone, :fromhn, :tohn, :paflag, :flipped, :street_score,
               :city_score, :priority, :fips_class, :fips_place, :status]
  record.delete_if { |k, _v| k.is_a?(Integer) || artifacts.include?(k) }
end
590
+
591
# Score a list of place rows against the address and return the best
# ones, one per distinct city/state.
#
# address      - the queried Address.
# places       - candidate place rows (scored and pruned in place).
# canonicalize - when true, replace place names with the primary names
#                for their ZIP codes first.
#
# Among places sharing a city and state, the one with the lowest ZIP
# (compared as strings, nil as "") is kept. The earlier version sorted
# with <=> inside a bare rescue, so a nil ZIP silently aborted the
# ordering. Each result is cleaned and given :precision :zip when its ZIP
# equals the queried one, otherwise :city.
def best_places(address, places, canonicalize=false)
  return [] if places.empty?
  score_candidates! address, places
  best_candidates! places
  canonicalize_places! places if canonicalize

  # uniqify places
  by_name = rows_to_h(places, :city, :state)
  places = by_name.values.map { |group| group.min_by { |r| r[:zip].to_s } }

  places.each do |record|
    clean_record! record
    record[:precision] = (record[:zip] == address.zip ? :zip : :city)
  end
  places
end
617
+
618
# Given an Address object, return a list of possible geocodes by place
# name. If canonicalize is true, attempt to return the "primary" postal
# place name for the given city, state, or ZIP.
#
# The place is looked up by ZIP when the address has a non-empty one and
# by city tokens/state otherwise. The earlier guard
# (!zip.empty? or !zip.nil?) raised on a nil ZIP and was true for an
# empty one.
def geocode_place(address, canonicalize=false)
  places = []
  zip = address.zip
  places = places_by_zip(address.text, zip) if zip && !zip.empty?
  places = places_by_city(address.text, address.city_parts, address.state) if places.empty?
  best_places address, places, canonicalize
end
627
+
628
# Given an Address object describing a street intersection, return a
# list of possible geocodes. If canonical_place is true, the place names
# are replaced by the primary names for their ZIP codes.
#
# When the candidates carry no street (only places were found), the
# result of best_places is returned instead. Otherwise each intersection
# record is built from its first feature, gets :street1/:street2, the
# point's :lon/:lat, :precision :intersection and the mean of the two
# street scores; the records are scored, pruned to the best, and reduced
# to one per distinct point.
#
# NOTE(review): :point is an 8-byte slice of the geometry blob, while
# unpack_geometry skips a three-field header first -- confirm this still
# yields coordinates.
def geocode_intersection(address, canonical_place=false)
  candidates = find_candidates address
  return [] if candidates.empty?
  return best_places(address, candidates, canonical_place) if candidates[0][:street].nil?

  features = rows_to_h candidates, :fid
  intersects = intersections_by_fid features.keys.flatten
  intersects.each do |record|
    feat1, feat2 = record.values_at(:fid1, :fid2).map { |fid| features[[fid]][0] }
    record.merge! feat1
    record[:street1] = record.delete(:street)
    record[:street2] = feat2[:street]
    record[:lon], record[:lat] = unpack_geometry(record.delete(:point))[0]
    record[:precision] = :intersection
    record[:street_score] = (feat1[:street_score].to_f + feat2[:street_score].to_f) / 2
  end

  score_candidates! address, intersects
  best_candidates! intersects

  by_point = rows_to_h(intersects, :lon, :lat)
  candidates = by_point.values.map { |records| records[0] }

  canonicalize_places! candidates if canonical_place
  candidates.each { |record| clean_record! record }
end
657
+
658
# Given an Address object, return a list of possible geocodes by address
# range interpolation. If canonical_place is true, the place names are
# replaced by the primary names for their ZIP codes.
#
# Steps: find candidates (falling back to best_places when only places
# were found); score and prune; attach address ranges, then score and
# prune again; keep one candidate per edge; attach edge geometry; and
# interpolate :lon/:lat along each edge.
def geocode_address(address, canonical_place=false)
  candidates = find_candidates address
  return [] if candidates.empty?
  return best_places(address, candidates, canonical_place) if candidates[0][:street].nil?

  score_candidates! address, candidates
  best_candidates! candidates

  # ranges change the number/parity components, so score a second time
  add_ranges! address, candidates
  score_candidates! address, candidates
  best_candidates! candidates

  # sometimes multiple fids match the same tlid
  candidates = rows_to_h(candidates, :tlid).values.map { |records| records[0] }

  if address.number.empty?
    # if no number is assigned in the query, only return one
    # result for each street/zip combo
    candidates = rows_to_h(candidates, :street, :zip).values.map { |records| records[0] }
    merge_edges! candidates
  else
    extend_ranges! candidates
  end

  candidates.each do |record|
    dist = interpolation_distance record
    $stderr.print "DIST: #{dist}\n" if @debug
    points = unpack_geometry record[:geometry]
    points.reverse! if record[:flipped]
    record[:lon], record[:lat] = interpolate points, dist
  end

  canonicalize_places! candidates if canonical_place

  candidates.each { |record| clean_record! record }
end
702
+
703
+ public
704
+
705
# Geocode a given address or place name. The argument is handed to
# Address.new. If canonical_place is true, attempt to return the
# "primary" place names, if they are different from the ones given.
#
# Strategies are tried in this order: place lookup for PO boxes with a
# ZIP; intersection lookup for intersections with a street and no number
# (its result replaces the previous one); address interpolation when
# there is a street and nothing was found yet; and finally a plain place
# lookup. An address with neither city nor ZIP yields [].
#
# Returns possible candidate matches as a list of hashes.
#
# * The :lat and :lon values of each hash store the coordinates as
#   latitude and longitude in the WGS84 spheroid.
# * The :precision value may be one of :city, :zip, :street, :range, or
#   :intersection.
# * The :score value is a float representing the approximate "goodness"
#   of the candidate match.
# * The other values in the hash represent various structured
#   components of the address and place name.
def geocode(info_to_geocode, canonical_place=false)
  address = Address.new info_to_geocode
  $stderr.print "ADDR: #{address.inspect}\n" if @debug
  return [] if address.city.empty? && address.zip.empty?

  start_time = Time.now if @debug
  results = []
  if address.po_box? && !address.zip.empty?
    results = geocode_place address, canonical_place
  end
  if address.intersection? && !address.street.empty? && address.number.empty?
    results = geocode_intersection address, canonical_place
  end
  if results.empty? && !address.street.empty?
    results = geocode_address address, canonical_place
  end
  results = geocode_place(address, canonical_place) if results.empty?

  if @debug
    elapsed = format("%.3f", Time.now - start_time)
    $stderr.print "DONE: #{elapsed}s\n"
  end
  results
end
744
+ end
745
+ end