geo_coder 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. data/Gemfile +12 -0
  2. data/Gemfile.lock +32 -0
  3. data/History.txt +6 -0
  4. data/Makefile +13 -0
  5. data/Manifest.txt +18 -0
  6. data/README.rdoc +197 -0
  7. data/Rakefile +53 -0
  8. data/TODO.txt +8 -0
  9. data/VERSION +1 -0
  10. data/bin/build_indexes +8 -0
  11. data/bin/rebuild_cluster +22 -0
  12. data/bin/rebuild_metaphones +23 -0
  13. data/bin/tiger_import +59 -0
  14. data/demos/demo/app/ext/geocodewrap.rb +84 -0
  15. data/demos/demo/app/views/index.builder +13 -0
  16. data/demos/demo/app/views/index.erb +71 -0
  17. data/demos/demo/config.ru +12 -0
  18. data/demos/demo/config/bootstraps.rb +130 -0
  19. data/demos/demo/config/geoenvironment.rb +25 -0
  20. data/demos/demo/geocoder_helper.rb +12 -0
  21. data/demos/demo/geocom_geocode.rb +10 -0
  22. data/demos/demo/main.rb +3 -0
  23. data/demos/demo/rakefile.rb +17 -0
  24. data/demos/demo/tmp/restart.txt +0 -0
  25. data/demos/simpledemo/views/index.builder +13 -0
  26. data/demos/simpledemo/views/index.erb +69 -0
  27. data/demos/simpledemo/ws.rb +83 -0
  28. data/doc/Makefile +7 -0
  29. data/doc/html4css1.css +279 -0
  30. data/doc/lookup.rst +193 -0
  31. data/doc/parsing.rst +125 -0
  32. data/doc/voidspace.css +147 -0
  33. data/geo_coder.gemspec +172 -0
  34. data/lib/geocoder/us.rb +21 -0
  35. data/lib/geocoder/us/address.rb +290 -0
  36. data/lib/geocoder/us/constants.rb +670 -0
  37. data/lib/geocoder/us/database.rb +745 -0
  38. data/lib/geocoder/us/import.rb +181 -0
  39. data/lib/geocoder/us/import/tiger.rb +13 -0
  40. data/lib/geocoder/us/numbers.rb +58 -0
  41. data/navteq/README +4 -0
  42. data/navteq/convert.sql +37 -0
  43. data/navteq/navteq_import +39 -0
  44. data/navteq/prepare.sql +92 -0
  45. data/sql/cluster.sql +16 -0
  46. data/sql/convert.sql +80 -0
  47. data/sql/create.sql +37 -0
  48. data/sql/index.sql +12 -0
  49. data/sql/place.csv +104944 -0
  50. data/sql/place.sql +104948 -0
  51. data/sql/setup.sql +78 -0
  52. data/src/Makefile +13 -0
  53. data/src/README +14 -0
  54. data/src/liblwgeom/Makefile +75 -0
  55. data/src/liblwgeom/box2d.c +54 -0
  56. data/src/liblwgeom/lex.yy.c +4799 -0
  57. data/src/liblwgeom/liblwgeom.h +1405 -0
  58. data/src/liblwgeom/lwalgorithm.c +946 -0
  59. data/src/liblwgeom/lwalgorithm.h +52 -0
  60. data/src/liblwgeom/lwcircstring.c +759 -0
  61. data/src/liblwgeom/lwcollection.c +541 -0
  62. data/src/liblwgeom/lwcompound.c +118 -0
  63. data/src/liblwgeom/lwcurvepoly.c +86 -0
  64. data/src/liblwgeom/lwgeom.c +886 -0
  65. data/src/liblwgeom/lwgeom_api.c +2201 -0
  66. data/src/liblwgeom/lwgparse.c +1219 -0
  67. data/src/liblwgeom/lwgunparse.c +1054 -0
  68. data/src/liblwgeom/lwline.c +525 -0
  69. data/src/liblwgeom/lwmcurve.c +125 -0
  70. data/src/liblwgeom/lwmline.c +137 -0
  71. data/src/liblwgeom/lwmpoint.c +138 -0
  72. data/src/liblwgeom/lwmpoly.c +141 -0
  73. data/src/liblwgeom/lwmsurface.c +129 -0
  74. data/src/liblwgeom/lwpoint.c +439 -0
  75. data/src/liblwgeom/lwpoly.c +579 -0
  76. data/src/liblwgeom/lwsegmentize.c +1047 -0
  77. data/src/liblwgeom/lwutil.c +369 -0
  78. data/src/liblwgeom/measures.c +861 -0
  79. data/src/liblwgeom/postgis_config.h +93 -0
  80. data/src/liblwgeom/ptarray.c +847 -0
  81. data/src/liblwgeom/vsprintf.c +179 -0
  82. data/src/liblwgeom/wktparse.h +126 -0
  83. data/src/liblwgeom/wktparse.lex +74 -0
  84. data/src/liblwgeom/wktparse.tab.c +2353 -0
  85. data/src/liblwgeom/wktparse.tab.h +145 -0
  86. data/src/liblwgeom/wktparse.y +385 -0
  87. data/src/libsqlite3_geocoder/Makefile +22 -0
  88. data/src/libsqlite3_geocoder/Makefile.nix +15 -0
  89. data/src/libsqlite3_geocoder/Makefile.redhat +15 -0
  90. data/src/libsqlite3_geocoder/extension.c +121 -0
  91. data/src/libsqlite3_geocoder/extension.h +13 -0
  92. data/src/libsqlite3_geocoder/levenshtein.c +42 -0
  93. data/src/libsqlite3_geocoder/metaphon.c +278 -0
  94. data/src/libsqlite3_geocoder/util.c +37 -0
  95. data/src/libsqlite3_geocoder/wkb_compress.c +54 -0
  96. data/src/metaphone/Makefile +7 -0
  97. data/src/metaphone/README +49 -0
  98. data/src/metaphone/extension.c +37 -0
  99. data/src/metaphone/metaphon.c +251 -0
  100. data/src/shp2sqlite/Makefile +37 -0
  101. data/src/shp2sqlite/Makefile.nix +36 -0
  102. data/src/shp2sqlite/Makefile.redhat +35 -0
  103. data/src/shp2sqlite/dbfopen.c +1595 -0
  104. data/src/shp2sqlite/getopt.c +695 -0
  105. data/src/shp2sqlite/getopt.h +127 -0
  106. data/src/shp2sqlite/shapefil.h +500 -0
  107. data/src/shp2sqlite/shp2sqlite.c +1974 -0
  108. data/src/shp2sqlite/shpopen.c +1894 -0
  109. data/tests/address.rb +236 -0
  110. data/tests/benchmark.rb +20 -0
  111. data/tests/constants.rb +57 -0
  112. data/tests/data/address-sample.csv +52 -0
  113. data/tests/data/db-test.csv +57 -0
  114. data/tests/data/locations.csv +4 -0
  115. data/tests/database.rb +137 -0
  116. data/tests/generate.rb +34 -0
  117. data/tests/numbers.rb +46 -0
  118. data/tests/run.rb +11 -0
  119. metadata +237 -0
@@ -0,0 +1,745 @@
1
+ require 'rubygems'
2
+ require 'sqlite3'
3
+ require 'text'
4
+
5
+ require 'set'
6
+ require 'pp'
7
+ require 'time'
8
+ require 'thread'
9
+
10
+ require 'geocoder/us/address'
11
+
12
+ module Geocoder
13
+ end
14
+
15
+ module Geocoder::US
16
+ # Provides an interface to a Geocoder::US database.
17
+ class Database
18
+ Street_Weight = 3.0
19
+ Number_Weight = 2.0
20
+ Parity_Weight = 1.25
21
+ City_Weight = 1.0
22
+ @@mutex = Mutex.new
23
+
24
+ # Takes the path of an SQLite 3 database prepared for Geocoder::US
25
+ # as the sole mandatory argument. The helper argument points to the
26
+ # Geocoder::US SQLite plugin; the module looks for this in the same
27
+ # directory as database.rb by default. The cache_size argument is
28
+ # measured in kilobytes and is used to set the SQLite cache size; larger
29
+ # values will trade memory for speed in long-running processes.
30
def initialize(filename, options = {})
  # Recognised option keys: :debug, :cache_size, :helper, :threadsafe and
  # :create. Anything the caller omits falls back to the defaults below.
  defaults = {:debug => false, :cache_size => 50000,
              :helper => "sqlite3.so", :threadsafe => false,
              :create => false}
  options = defaults.merge options
  # File.exist? is used because File.exists? was deprecated and then
  # removed in Ruby 3.2.
  unless options[:create] or File.exist? filename
    raise ArgumentError, "can't find database #{filename}"
  end
  @db = SQLite3::Database.new(filename)
  @st = {}  # prepared statements, keyed by their SQL text (filled by prepare)
  @debug = options[:debug]
  @threadsafe = options[:threadsafe]
  tune options[:helper], options[:cache_size]
end
43
+
44
# Runs the given block and returns its value. When the :threadsafe option
# was left false, the block runs while holding the class-wide mutex;
# otherwise it is simply yielded to.
def synchronize
  return yield if @threadsafe
  @@mutex.synchronize { yield }
end
51
+
52
+ #private
53
+
54
+ # Load the SQLite extension and tune the database settings.
55
+ # q.v. http://web.utk.edu/~jplyon/sqlite/SQLite_optimization_FAQ.html
56
def tune(helper, cache_size)
  # NOTE: +helper+ is accepted but not used here, because the
  # load_extension calls below are commented out; the SQL functions are
  # registered from Ruby instead.
  synchronize do
    # levenshtein(a, b): edit distance between the two arguments (non-word
    # characters stripped, lowercased), divided by the longer length.
    @db.create_function("levenshtein", 2) do |func, word1, word2|
      test1, test2 = [word1, word2].map {|w|
        w.to_s.gsub(/\W/o, "").downcase
      }
      longest = [test1.length, test2.length].max
      if longest.zero?
        # Both strings are empty after cleaning: they are identical, and
        # dividing by zero here would produce NaN.
        result = 0.0
      else
        result = Text::Levenshtein.distance(test1, test2).to_f / longest
      end
      func.set_result result
    end
    # metaphone(string, len): leading digits or a lone w/y are passed
    # through unchanged, anything else is metaphone-encoded; the result is
    # truncated to len characters.
    @db.create_function("metaphone", 2) do |func, string, len|
      test = string.to_s.gsub(/\W/o, "")
      if test =~ /^(\d+)/o
        mph = $1
      elsif test =~ /^([wy])$/io
        mph = $1
      else
        mph = Text::Metaphone.metaphone test
      end
      func.result = mph[0...len.to_i]
    end
    # nondigit_prefix / digit_suffix split a value that ends in digits into
    # the part before the trailing digits and the trailing digits.
    @db.create_function("nondigit_prefix", 1) do |func, string|
      string.to_s =~ /^(.*\D)?(\d+)$/o
      func.result = ($1 || "")
    end
    @db.create_function("digit_suffix", 1) do |func, string|
      string.to_s =~ /^(.*\D)?(\d+)$/o
      func.result = ($2 || "")
    end
    #@db.enable_load_extension(1)
    #@db.load_extension(helper)
    #@db.enable_load_extension(0)
    @db.cache_size = cache_size
    @db.temp_store = "memory"
    @db.synchronous = "off"
  end
end
93
+
94
+ # Return a cached SQLite statement object, preparing it first if
95
+ # it's not already in the cache.
96
def prepare(sql)
  # The SQL text itself is the cache key, so identical strings share one
  # statement object.
  $stderr.print "SQL : #{sql}\n" if @debug
  synchronize do
    @st[sql] = @db.prepare(sql) unless @st[sql]
  end
  @st[sql]
end
103
+
104
# Drops every cached prepared statement by replacing the cache with an
# empty hash; later prepare calls compile their SQL again.
def flush_statements
  @st = Hash.new
end
107
+
108
+ # Generate enough SQL placeholders for a list of objects.
109
def placeholders_for(list)
  # one "?" per element, comma separated; an empty list gives ""
  Array.new(list.length, "?").join(",")
end
112
+
113
+ # Generate enough metaphone(?,5) SQL placeholders for a list of objects.
114
def metaphone_placeholders_for(list)
  # one "metaphone(?,5)" call per element, comma separated
  Array.new(list.length) { "metaphone(?,5)" }.join(",")
end
117
+
118
+ # Execute an SQL statement, bind a list of parameters, and
119
+ # return the result as a list of hashes.
120
def execute(sql, *params)
  # prepare supplies the cached statement; execute_statement binds and runs it
  execute_statement(prepare(sql), *params)
end
124
+
125
+
126
+ # Execute an SQLite statement object, bind the parameters,
127
+ # map the column names to symbols, and return the rows
128
+ # as a list of hashes.
129
def execute_statement(st, *params)
  started = nil
  if @debug
    started = Time.now
    $stderr.print "EXEC: #{params.inspect}\n" unless params.empty?
  end
  rows = []
  synchronize do
    result = st.execute(*params)
    names = result.columns.map { |name| name.to_sym }
    result.each do |row|
      rows << Hash[*(names.zip(row).flatten)]
    end
  end
  if @debug
    elapsed = format("%.3f", Time.now - started)
    $stderr.print "ROWS: #{rows.length} (#{elapsed}s)\n"
  end
  # Rows are handed back in the reverse of the order the statement
  # produced them.
  rows.reverse!
end
148
+
149
# Looks up the single highest-priority place row for a ZIP code. The row
# also carries city_score, the levenshtein() result between the given
# city and the row's city.
def places_by_zip(city, zip)
  sql = "SELECT *, levenshtein(?, city) AS city_score
          FROM place WHERE zip = ? order by priority desc LIMIT 1;"
  execute(sql, city, zip)
end
153
+
154
+ # Query the place table by city and optional state.
155
+ # The metaphone index on the place table is used to match
156
+ # city names.
157
def places_by_city(city, tokens, state)
  city = "" if city.nil?
  # Bind order follows the SQL: the city for levenshtein(), one value per
  # metaphone placeholder, then the state when one was given.
  args = [city] + tokens.clone
  if state.nil? or state.empty?
    and_state = ""
  else
    and_state = "AND state = ?"
    args << state
  end
  metaphones = metaphone_placeholders_for tokens
  execute("SELECT *, levenshtein(?, city) AS city_score
            FROM place WHERE city_phone IN (#{metaphones}) #{and_state} order by priority desc LIMIT 1;", *args)
end
172
+
173
+ # Generate an SQL query and set of parameters against the feature and range
174
+ # tables for a street name and optional building number. The SQL is
175
+ # used by candidate_records and more_candidate_records to filter results
176
+ # by ZIP code.
177
def features_by_street(street, tokens)
  # Returns [sql, params] without running anything; callers append their
  # own ZIP filter to both halves.
  phone_list = Array.new(tokens.length, "metaphone(?,5)").join(",")
  sql = "
    SELECT feature.*, levenshtein(?, street) AS street_score
      FROM feature
      WHERE street_phone IN (#{phone_list})"
  [sql, [street] + tokens]
end
186
+
187
+ # Query the feature and range tables for a set of ranges, given a
188
+ # building number, street name, and list of candidate ZIP codes.
189
+ # The metaphone and ZIP code indexes on the feature table are
190
+ # used to match results.
191
def features_by_street_and_zip(street, tokens, zips)
  # Start from the street query and narrow it to the exact candidate ZIPs.
  base_sql, base_params = features_by_street(street, tokens)
  zip_list = placeholders_for zips
  execute("#{base_sql} AND feature.zip IN (#{zip_list})", *(base_params + zips))
end
198
+
199
+ # Query the feature and range tables for a set of ranges, given a
200
+ # building number, street name, and list of candidate ZIP codes.
201
+ # The ZIP codes are reduced to a set of 3-digit prefixes, broadening
202
+ # the search area.
203
def more_features_by_street_and_zip(street, tokens, zips)
  sql, params = features_by_street(street, tokens)
  if !zips.empty? and !zips[0].nil?
    # Collapse the ZIPs to distinct 3-digit prefixes for LIKE matching.
    zip3s = zips.map {|z| z[0..2] + '%'}.uniq
    # Diagnostics go to stderr and only in debug mode, like the rest of
    # this class; nothing is printed on stdout.
    $stderr.print "ZIP3: #{zip3s.inspect}\n" if @debug
    like_list = zip3s.map { "feature.zip LIKE ?" }.join(" OR ")
    sql += " AND (#{like_list})"
    params += zip3s
  end
  # This statement is prepared directly rather than through the statement
  # cache, so it is closed here once its rows have been read.
  st = @db.prepare sql
  begin
    execute_statement st, *params
  ensure
    st.close
  end
end
215
+
216
# Fetches range rows joined to feature_edge for the given feature ids,
# optionally restricted to a prenum value, ordered by how close either end
# of the range is to +number+ and capped at four rows per feature id.
def ranges_by_feature(fids, number, prenum)
  row_cap = 4 * fids.length
  bind = fids.clone
  sql = "
    SELECT feature_edge.fid AS fid, range.*
      FROM feature_edge, range
      WHERE fid IN (#{placeholders_for fids})
      AND feature_edge.tlid = range.tlid"
  if !prenum.nil?
    sql += " AND prenum = ?"
    bind += [prenum]
  end
  sql += "
    ORDER BY min(abs(fromhn - ?), abs(tohn - ?))
    LIMIT #{row_cap};"
  bind += [number, number]
  execute sql, *bind
end
235
+
236
+ # Query the edge table for a list of edges matching a list of edge IDs.
237
def edges(edge_ids)
  # one bound placeholder per requested tlid
  id_list = placeholders_for(edge_ids)
  execute("SELECT edge.* FROM edge WHERE edge.tlid IN (#{id_list})", *edge_ids)
end
242
+
243
+ # Query the range table for all ranges associated with the given
244
+ # list of edge IDs.
245
def range_ends(edge_ids)
  in_list = placeholders_for edge_ids
  sql = "SELECT tlid, side,
            min(fromhn) > min(tohn) AS flipped,
            min(fromhn) AS from0, max(tohn) AS to0,
            min(tohn) AS from1, max(fromhn) AS to1
          FROM range WHERE tlid IN (#{in_list})
          GROUP BY tlid, side;"
  execute(sql, *edge_ids).map {|r|
    # The comparison result may arrive as the string "0" or as the
    # Integer 0 depending on how the driver types results, so compare its
    # text form. Anything else (including NULL) counts as flipped.
    if r[:flipped].to_s == "0"
      r[:flipped] = false
      r[:fromhn], r[:tohn] = r[:from0], r[:to0]
    else
      r[:flipped] = true
      r[:fromhn], r[:tohn] = r[:from1], r[:to1]
    end
    # the helper columns are only needed to pick fromhn/tohn
    [:from0, :to0, :from1, :to1].each {|k| r.delete k}
    r
  }
end
265
+
266
# Finds pairs of features (from the given feature ids) whose edges share
# an end point. Returns rows with :fid1, :fid2 and :point.
#
# Works in three steps: build a temporary "intersection" table holding the
# first and last 8 bytes of every matching edge geometry, self-join it on
# equal points, then drop the table again.
def intersections_by_fid (fids)
  in_list = placeholders_for fids
  # NOTE(review): this string holds two statements (CREATE TABLE ... and
  # CREATE INDEX ...). Whether the trailing CREATE INDEX actually runs
  # depends on how @db.prepare treats multi-statement SQL -- confirm.
  # NOTE(review): unpack_geometry in this file reads geometry as a 9-byte
  # header followed by 8-byte doubles; an 8-byte substr would not hold a
  # full coordinate pair in that layout -- confirm against the data.
  sql = "
    CREATE TEMPORARY TABLE intersection AS
      SELECT fid, substr(geometry,1,8) AS point
        FROM feature_edge, edge
        WHERE feature_edge.tlid = edge.tlid
        AND fid IN (#{in_list})
      UNION
      SELECT fid, substr(geometry,length(geometry)-7,8) AS point
        FROM feature_edge, edge
        WHERE feature_edge.tlid = edge.tlid
        AND fid IN (#{in_list});
    CREATE INDEX intersect_pt_idx ON intersection (point);"
  # the id list is bound twice, once for each half of the UNION
  execute sql, *(fids + fids)
  # the a.fid < b.fid inequality guarantees consistent ordering of street
  # names in the output
  results = execute "
    SELECT a.fid AS fid1, b.fid AS fid2, a.point
      FROM intersection a, intersection b, feature f1, feature f2
      WHERE a.point = b.point AND a.fid < b.fid
      AND f1.fid = a.fid AND f2.fid = b.fid
      AND f1.zip = f2.zip
      AND f1.paflag = 'P' AND f2.paflag = 'P';"
  # If either query above raises, the DROP below is skipped and the
  # temporary table is left in place on this connection.
  execute "DROP TABLE intersection;"
  flush_statements # the CREATE/DROP TABLE invalidates prepared statements
  results
end
294
+
295
+ # Query the place table for notional "primary" place names for each of a
296
+ # list of ZIP codes. Since the place table shipped with this code is
297
+ # bespoke, and constructed from a variety of public domain sources,
298
+ # the primary name for a ZIP is not always the "right" one.
299
def primary_places(zips)
  # every place row for the given ZIPs, ordered by priority descending in SQL
  zip_list = placeholders_for(zips)
  execute("SELECT * FROM place WHERE zip IN (#{zip_list}) order by priority desc;", *zips)
end
304
+
305
+ # Given a list of rows, find the unique values for a given key.
306
def unique_values(rows, key)
  # first occurrence wins, so the original row order is kept
  rows.map { |row| row[key] }.uniq
end
309
+
310
+ # Convert a list of rows into a hash keyed by the given keys.
311
def rows_to_h(rows, *keys)
  # Each hash key is the ARRAY of values for +keys+ (even for a single
  # key); each hash value is the list of rows sharing those values.
  rows.group_by { |row| row.values_at(*keys) }
end
316
+
317
+ # Merge the values in the list of rows given in src into the
318
+ # list of rows in dest, matching rows on the given list of keys.
319
+ # May generate more than one row in dest for each input dest row.
320
def merge_rows!(dest, src, *keys)
  lookup = rows_to_h(src, *keys)
  dest.map! do |row|
    matches = lookup[row.values_at(*keys)]
    # one merged copy per matching src row; unmatched rows pass through
    matches ? matches.map { |extra| row.merge extra } : [row]
  end
  dest.flatten!
end
332
+
333
# Collects candidate rows for an address. Places are looked up by ZIP when
# one is present, otherwise (or when that finds nothing) by city and
# state. Returns [] when no place matches, the place rows themselves when
# the address has no street, and otherwise street feature rows merged
# with their place rows on :zip.
#
# Side effect: address.city is replaced by the city names of the matched
# places.
def find_candidates(address)
  places = []

  # the shortest city spelling is the one scored against the database
  city = address.city.sort {|a,b| a.length <=> b.length}[0]
  zip = address.zip
  # nil is checked before empty? so a missing ZIP cannot raise
  places = places_by_zip(city, zip) unless zip.nil? or zip.empty?
  places = places_by_city city, address.city_parts, address.state if places.empty?
  return [] if places.empty?

  address.city = unique_values places, :city
  return places if address.street.empty?

  zips = unique_values places, :zip
  street = address.street.sort {|a,b| a.length <=> b.length}[0]
  candidates = features_by_street_and_zip street, address.street_parts, zips

  if candidates.empty?
    # nothing in the exact ZIPs: widen the search to 3-digit ZIP prefixes
    candidates = more_features_by_street_and_zip street, address.street_parts, zips
  end

  merge_rows! candidates, places, :zip
  candidates
end
359
+
360
+ # Given a query hash and a list of candidates, assign :number
361
+ # and :precision values to each candidate. If the query building
362
+ # number is inside the candidate range, set the number on the result
363
+ # and set the precision to :range; otherwise, find the closest
364
+ # corner and set precision to :street.
365
def assign_number!(hn, candidates)
  hn ||= 0
  candidates.each do |candidate|
    low, high = candidate[:fromhn].to_i, candidate[:tohn].to_i
    # the range may be stored in either direction
    if hn.between?(low, high) || hn.between?(high, low)
      candidate[:number] = hn.to_s
      candidate[:precision] = :range
    else
      # outside the range: snap to whichever end is strictly nearer,
      # preferring tohn on a tie
      nearest = (hn - low).abs < (hn - high).abs ? candidate[:fromhn] : candidate[:tohn]
      candidate[:number] = nearest.to_s
      candidate[:precision] = :street
    end
  end
end
379
+
380
# Merges address ranges into the candidates (matched on :fid) and then
# assigns each candidate a :number and :precision. Ranges are first
# requested with the address prenum; if that finds none, the lookup is
# repeated without a prenum restriction.
def add_ranges!(address, candidates)
  number = address.number.to_i
  fids = unique_values(candidates, :fid)
  ranges = ranges_by_feature(fids, number, address.prenum)
  ranges = ranges_by_feature(fids, number, nil) if ranges.empty?
  merge_rows!(candidates, ranges, :fid)
  assign_number!(number, candidates)
end
388
+
389
# Merges edge rows into the candidates (matched on :tlid), removes
# candidates whose :tlid is nil, and returns the list of edge ids that
# was looked up.
def merge_edges!(candidates)
  edge_ids = unique_values(candidates, :tlid)
  merge_rows!(candidates, edges(edge_ids), :tlid)
  candidates.reject! { |record| record[:tlid].nil? }
  edge_ids
end
396
+
397
# Merges edge rows into the candidates, then merges the per-edge, per-side
# range extents from range_ends into them, matched on :tlid and :side.
def extend_ranges!(candidates)
  looked_up = merge_edges!(candidates)
  merge_rows!(candidates, range_ends(looked_up), :tlid, :side)
end
402
+
403
+ # Score a list of candidates. For each candidate:
404
+ # * For each item in the query:
405
+ # ** if the query item is blank but the candidate is not, score 0.15;
406
+ # otherwise, if both are blank, score 1.0.
407
+ # ** If both items are set, compute the scaled Levenshtein-Damerau distance
408
+ # between them, and add that value (between 0.0 and 1.0) to the score.
409
+ # * Add 0.5 to the score for each numbered end of the range that matches
410
+ # the parity of the query number.
411
+ # * Add 1.0 if the query number is in the candidate range, otherwise
412
+ # add a fractional value for the notional distance between the
413
+ # closest candidate corner and the query.
414
+ # * Finally, divide the score by the total number of comparisons.
415
+ # The result should be between 0.0 and 1.0, with 1.0 indicating a
416
+ # perfect match.
417
def score_candidates!(address, candidates)
  # As implemented here: street and city contribute (1 - their *_score)
  # times their weight; prenum, state and zip each contribute 1 on a
  # case-insensitive exact match and 0 otherwise; number and parity are
  # only counted (in both score and denominator) when the query has a
  # building number.
  exact_keys = [:prenum, :state, :zip]
  candidates.each do |candidate|
    parts = candidate[:components] = {}
    denominator = exact_keys.length + Street_Weight + City_Weight

    parts[:street] = (1.0 - candidate[:street_score].to_f) * Street_Weight
    parts[:city] = (1.0 - candidate[:city_score].to_f) * City_Weight
    score = parts[:street] + parts[:city]

    exact_keys.each do |key|
      wanted = address.send(key)
      wanted = wanted ? wanted.downcase : ""
      found = candidate[key]
      found = found ? found.downcase : ""
      parts[key] = (wanted == found) ? 1 : 0
      score += parts[key]
    end

    if address.number and !address.number.empty?
      fromhn, tohn, assigned, hn =
        [candidate[:fromhn], candidate[:tohn], candidate[:number], address.number].map { |s| s.to_i }

      subscore = 0.0
      if candidate[:precision] == :range
        subscore += Number_Weight
      elsif assigned > 0
        # only credit number subscore if assigned
        subscore += Number_Weight / (assigned - hn).abs.to_f
      end
      parts[:number] = subscore

      parity = 0.0
      if hn > 0 and assigned > 0
        # only credit parity if a number was given *and* assigned
        parity += Parity_Weight / 2.0 if fromhn % 2 == hn % 2
        parity += Parity_Weight / 2.0 if tohn % 2 == hn % 2
      end
      parts[:parity] = parity

      score += subscore + parity
      denominator += Number_Weight + Parity_Weight
    end

    parts[:total] = score.to_f
    parts[:denominator] = denominator
    candidate[:score] = score.to_f / denominator
  end
end
465
+
466
+ # Find the candidates in a list of candidates that are tied for the
467
+ # top score and prune the remainder from the list.
468
def best_candidates!(candidates)
  # highest score first; everything scoring below the leader is dropped
  candidates.sort! { |a, b| b[:score] <=> a[:score] }
  return candidates if candidates.empty?
  top_score = candidates[0][:score]
  candidates.delete_if { |record| record[:score] < top_score }
end
473
+
474
+ # Compute the fractional interpolation distance for a query number along an
475
+ # edge, given all of the ranges for the same side of that edge.
476
def interpolation_distance(candidate)
  fromhn, tohn, number = candidate.values_at(:fromhn, :tohn, :number).map { |x| x.to_i }
  $stderr.print "NUM : #{fromhn} < #{number} < #{tohn} (flipped? #{candidate[:flipped]})\n" if @debug
  # normalise so that fromhn <= tohn before measuring
  fromhn, tohn = tohn, fromhn if fromhn > tohn
  return 0.0 if fromhn > number
  return 1.0 if tohn < number
  # A range covering a single house number has zero width; dividing would
  # give NaN, so such a number is placed at the midpoint instead.
  return 0.5 if tohn == fromhn
  (number - fromhn) / (tohn - fromhn).to_f
end
489
+
490
+ # Unpack an array of little-endian 4-byte ints, and convert them into
491
+ # signed floats by dividing by 10^6, inverting the process used by the
492
+ # compress_wkb_line() function in the SQLite helper extension.
493
def unpack_geometry(geom)
  return [] if geom.nil?
  # NOTE: the comment above this method describes an older packed-integer
  # format. What is read here is the layout worked out by a previous
  # maintainer ("Pete") from the actual database contents:
  #   | 1 byte type | 4 byte SRID | 4 byte element count | 8 byte doubles * |
  # 'E' reads little-endian doubles, matching the little-endian 'V' header
  # fields; the native-endian 'D' only agrees with that on little-endian
  # hosts.
  fields = geom.unpack('CVVE*')
  coords = fields[3..-1] || []
  # consecutive doubles are paired up as [first, second]; a dangling odd
  # value is paired with nil
  coords.each_slice(2).map { |first, second| [first, second] }
end
513
+
514
+ # Calculate the longitude scaling for the average of two latitudes.
515
def scale_lon(lat1, lat2)
  # Cosine of the mean of the two latitudes, converted from degrees to
  # radians. This approximation stands in for lookup.rst (10e) and (10g).
  mean_lat = (lat1 + lat2) / 2
  Math.cos(mean_lat * Math::PI / 180)
end
522
+
523
+ # Simple Euclidean distances between two 2-D coordinate pairs, scaled
524
+ # along the longitudinal axis by scale_lon.
525
def distance(a, b)
  # index 0 is the axis that gets scaled by scale_lon, index 1 feeds scale_lon
  x_a, y_a = a[0], a[1]
  x_b, y_b = b[0], b[1]
  dx = (x_b - x_a) * scale_lon(y_a, y_b)
  dy = (y_b - y_a)
  Math.sqrt(dx ** 2 + dy ** 2)
end
530
+
531
+ # Find an interpolated point along a list of linestring vertices
532
+ # proportional to the given fractional distance along the line.
533
def interpolate(points, fraction)
  $stderr.print "POINTS: #{points.inspect}\n" if @debug
  return points[0] if fraction == 0.0
  return points[-1] if fraction == 1.0

  # [start, finish, length] for every consecutive pair of vertices
  segments = points.each_cons(2).map { |from, to| [from, to, distance(from, to)] }
  target = segments.inject(0.0) { |sum, seg| sum + seg[2] } * fraction

  segments.each do |from, to, step|
    if step < target
      target -= step
      next
    end
    # A zero-length segment cannot be subdivided (0/0 would be NaN); use
    # its start vertex.
    ratio = step.zero? ? 0.0 : target / step
    # The longitude scale only matters when MEASURING lengths (in
    # distance). The point itself is a plain linear blend of the two
    # vertices, so both axes use the same ratio.
    found = [from[0] + (to[0] - from[0]) * ratio,
             from[1] + (to[1] - from[1]) * ratio]
    return found.map { |x| format("%.6f", x).to_f }
  end

  # Reached only when rounding leaves a sliver of target after the last
  # segment, or when there are fewer than two points: use the last vertex.
  points[-1]
end
554
+
555
+ # Find and replace the city, state, and county information
556
+ # in a list of candidates with the primary place information
557
+ # for the ZIP codes in the candidate list.
558
def canonicalize_places!(candidates)
  zips_used = unique_values(candidates, :zip)
  pri_places = rows_to_h primary_places(zips_used), :zip
  candidates.map! {|record|
    current_places = pri_places[[record[:zip]]]
    if current_places.nil? or current_places.empty?
      # No place row for this ZIP. Returning from the method here would
      # abort map! part-way and leave the list half converted, so the
      # record is kept unchanged instead.
      [record]
    else
      # the place rows with the lowest :priority value supply the names
      top_priority = current_places.map {|place| place[:priority]}.min
      current_places.select {|place| place[:priority] == top_priority}.map {|place|
        record.merge({
          :city => place[:city],
          :state => place[:state],
          :fips_county => place[:fips_county]
        })
      }
    end
  }
  # every element is now a list of records; flatten back to a list of hashes
  candidates.flatten!
end
576
+
577
+ # Clean up a candidate record by formatting the score, replacing nil
578
+ # values with empty strings, and deleting artifacts from database
579
+ # queries.
580
def clean_record!(record)
  # score is rounded to three decimals
  record[:score] = format("%.3f", record[:score]).to_f unless record[:score].nil?
  record.keys.each {|k| record[k] = "" if record[k].nil? } # clean up nils
  # the scoring breakdown is only kept in debug mode
  record.delete :components unless @debug
  # Drop integer keys and the internal query columns. Integer is used
  # because Fixnum was removed in Ruby 3.2.
  record.delete_if {|k,v| k.is_a?(Integer) or
    [:geometry, :side, :tlid, :fid, :fid1, :fid2, :street_phone,
     :city_phone, :fromhn, :tohn, :paflag, :flipped, :street_score,
     :city_score, :priority, :fips_class, :fips_place, :status].include?(k)}
end
590
+
591
# Scores the given place rows against the address, keeps only the
# top-scoring ones, optionally canonicalizes them, and returns one cleaned
# record per distinct city/state pair with :precision set to :zip when the
# record's ZIP equals the address ZIP and to :city otherwise.
def best_places(address, places, canonicalize=false)
  return [] if places.empty?
  score_candidates! address, places
  best_candidates! places
  canonicalize_places! places if canonicalize

  # uniqify places
  grouped = rows_to_h(places, :city, :state)
  unless grouped.nil?
    begin
      grouped.values.each do |group|
        group.sort! { |a, b| a[:zip] <=> b[:zip] }
      end
    rescue
      # best effort: a failed sort is ignored and the groups are used as they are
    end
    places = grouped.map { |_name, group| group[0] }

    places.each { |record| clean_record! record }
    places.each do |record|
      record[:precision] = (record[:zip] == address.zip ? :zip : :city)
    end
  end
  places
end
617
+
618
+ # Given an Address object, return a list of possible geocodes by place
619
+ # name. If canonicalize is true, attempt to return the "primary" postal
620
+ # place name for the given city, state, or ZIP.
621
def geocode_place(address, canonicalize=false)
  places = []
  zip = address.zip
  # Only query by ZIP when one is actually present. nil is checked first
  # so a missing ZIP cannot raise, and an empty ZIP no longer triggers a
  # pointless lookup.
  places = places_by_zip(address.text, zip) unless zip.nil? or zip.empty?
  places = places_by_city address.text, address.city_parts, address.state if places.empty?
  best_places address, places, canonicalize
end
627
+
628
# Geocodes an intersection query. Candidate features are paired up where
# their edges share an end point; each pair becomes a record with
# :street1, :street2, :lon, :lat and :precision => :intersection. The
# records are scored, reduced to the best, de-duplicated by coordinate
# and cleaned. Falls back to best_places when the candidates are place
# rows (no :street), and returns [] when there are no candidates.
def geocode_intersection(address, canonical_place=false)
  candidates = find_candidates address
  return [] if candidates.empty?
  return best_places(address, candidates, canonical_place) if candidates[0][:street].nil?

  by_fid = rows_to_h candidates, :fid
  crossings = intersections_by_fid by_fid.keys.flatten
  crossings.map! do |record|
    first, second = record.values_at(:fid1, :fid2).map { |fid| by_fid[[fid]][0] }
    record.merge! first
    record[:street1] = record.delete(:street)
    record[:street2] = second[:street]
    record[:lon], record[:lat] = unpack_geometry(record.delete(:point))[0]
    record[:precision] = :intersection
    # the pair's street score is the mean of the two streets' scores
    record[:street_score] = (first[:street_score].to_f + second[:street_score].to_f)/2
    record
  end

  score_candidates! address, crossings
  best_candidates! crossings

  # keep one record per distinct coordinate
  unique = rows_to_h(crossings, :lon, :lat).values.map { |records| records[0] }

  canonicalize_places! unique if canonical_place
  unique.each { |record| clean_record! record }
  unique
end
657
+
658
+ # Given an Address object, return a list of possible geocodes by address
659
+ # range interpolation. If canonicalize is true, attempt to return the
660
+ # "primary" street and place names, if they are different from the ones
661
+ # given.
662
def geocode_address(address, canonical_place=false)
  candidates = find_candidates address
  return [] if candidates.empty?
  # place rows (no :street) mean no street matched: answer at place level
  return best_places(address, candidates, canonical_place) if candidates[0][:street].nil?

  # first pass: rank on street/place alone, before ranges are attached
  score_candidates! address, candidates
  best_candidates! candidates

  # second pass: attach ranges and rank again with number and parity
  add_ranges! address, candidates
  score_candidates! address, candidates
  best_candidates! candidates

  # sometimes multiple fids match the same tlid
  candidates = rows_to_h(candidates, :tlid).values.map { |records| records[0] }

  # if no number is assigned in the query, only return one
  # result for each street/zip combo
  if address.number.empty?
    candidates = rows_to_h(candidates, :street, :zip).values.map { |records| records[0] }
    merge_edges! candidates
  else
    extend_ranges! candidates
  end

  candidates.each do |record|
    dist = interpolation_distance record
    $stderr.print "DIST: #{dist}\n" if @debug
    points = unpack_geometry record[:geometry]
    points.reverse! if record[:flipped]
    record[:lon], record[:lat] = interpolate points, dist
  end

  canonicalize_places! candidates if canonical_place

  candidates.each { |record| clean_record! record }
  candidates
end
702
+
703
+ public
704
+
705
+ # Geocode a given address or place name string. The max_penalty and cutoff
706
+ # arguments are passed to the Address parse functions. If canonicalize is
707
+ # true, attempt to return the "primary" street and place names, if they are
708
+ # different from the ones given.
709
+ #
710
+ # Returns possible candidate matches as a list of hashes.
711
+ #
712
+ # * The :lat and :lon values of each hash store the range-interpolated
713
+ # address coordinates as latitude and longitude in the WGS84 spheroid.
714
+ # * The :precision value may be one of :city, :zip, :street, or :range, in
715
+ # order of increasing precision.
716
+ # * The :score value will be a float between 0.0 and 1.0 representing
717
+ # the approximate "goodness" of the candidate match.
718
+ # * The other values in the hash will represent various structured
719
+ # components of the address and place name.
720
def geocode(info_to_geocode, canonical_place=false)
  address = Address.new info_to_geocode
  $stderr.print "ADDR: #{address.inspect}\n" if @debug
  return [] if address.city.empty? and address.zip.empty?

  started_at = Time.now if @debug
  matches = []
  # Strategies run in this order: PO box place lookup, intersection
  # lookup (which replaces any PO box result), then street address and
  # finally place lookup, each only when nothing has been found so far.
  matches = geocode_place(address, canonical_place) if address.po_box? and !address.zip.empty?
  if address.intersection? and !address.street.empty? and address.number.empty?
    matches = geocode_intersection address, canonical_place
  end
  matches = geocode_address(address, canonical_place) if matches.empty? and !address.street.empty?
  matches = geocode_place(address, canonical_place) if matches.empty?

  if @debug
    elapsed = format("%.3f", Time.now - started_at)
    $stderr.print "DONE: #{elapsed}s\n"
  end
  matches
end
744
+ end
745
+ end