RubyGems - geo_coder - Versions diffs - 0.1.0 - Mend

geo_coder 0.1.0

Files changed (119) hide show

data/Gemfile +12 -0
data/Gemfile.lock +32 -0
data/History.txt +6 -0
data/Makefile +13 -0
data/Manifest.txt +18 -0
data/README.rdoc +197 -0
data/Rakefile +53 -0
data/TODO.txt +8 -0
data/VERSION +1 -0
data/bin/build_indexes +8 -0
data/bin/rebuild_cluster +22 -0
data/bin/rebuild_metaphones +23 -0
data/bin/tiger_import +59 -0
data/demos/demo/app/ext/geocodewrap.rb +84 -0
data/demos/demo/app/views/index.builder +13 -0
data/demos/demo/app/views/index.erb +71 -0
data/demos/demo/config.ru +12 -0
data/demos/demo/config/bootstraps.rb +130 -0
data/demos/demo/config/geoenvironment.rb +25 -0
data/demos/demo/geocoder_helper.rb +12 -0
data/demos/demo/geocom_geocode.rb +10 -0
data/demos/demo/main.rb +3 -0
data/demos/demo/rakefile.rb +17 -0
data/demos/demo/tmp/restart.txt +0 -0
data/demos/simpledemo/views/index.builder +13 -0
data/demos/simpledemo/views/index.erb +69 -0
data/demos/simpledemo/ws.rb +83 -0
data/doc/Makefile +7 -0
data/doc/html4css1.css +279 -0
data/doc/lookup.rst +193 -0
data/doc/parsing.rst +125 -0
data/doc/voidspace.css +147 -0
data/geo_coder.gemspec +172 -0
data/lib/geocoder/us.rb +21 -0
data/lib/geocoder/us/address.rb +290 -0
data/lib/geocoder/us/constants.rb +670 -0
data/lib/geocoder/us/database.rb +745 -0
data/lib/geocoder/us/import.rb +181 -0
data/lib/geocoder/us/import/tiger.rb +13 -0
data/lib/geocoder/us/numbers.rb +58 -0
data/navteq/README +4 -0
data/navteq/convert.sql +37 -0
data/navteq/navteq_import +39 -0
data/navteq/prepare.sql +92 -0
data/sql/cluster.sql +16 -0
data/sql/convert.sql +80 -0
data/sql/create.sql +37 -0
data/sql/index.sql +12 -0
data/sql/place.csv +104944 -0
data/sql/place.sql +104948 -0
data/sql/setup.sql +78 -0
data/src/Makefile +13 -0
data/src/README +14 -0
data/src/liblwgeom/Makefile +75 -0
data/src/liblwgeom/box2d.c +54 -0
data/src/liblwgeom/lex.yy.c +4799 -0
data/src/liblwgeom/liblwgeom.h +1405 -0
data/src/liblwgeom/lwalgorithm.c +946 -0
data/src/liblwgeom/lwalgorithm.h +52 -0
data/src/liblwgeom/lwcircstring.c +759 -0
data/src/liblwgeom/lwcollection.c +541 -0
data/src/liblwgeom/lwcompound.c +118 -0
data/src/liblwgeom/lwcurvepoly.c +86 -0
data/src/liblwgeom/lwgeom.c +886 -0
data/src/liblwgeom/lwgeom_api.c +2201 -0
data/src/liblwgeom/lwgparse.c +1219 -0
data/src/liblwgeom/lwgunparse.c +1054 -0
data/src/liblwgeom/lwline.c +525 -0
data/src/liblwgeom/lwmcurve.c +125 -0
data/src/liblwgeom/lwmline.c +137 -0
data/src/liblwgeom/lwmpoint.c +138 -0
data/src/liblwgeom/lwmpoly.c +141 -0
data/src/liblwgeom/lwmsurface.c +129 -0
data/src/liblwgeom/lwpoint.c +439 -0
data/src/liblwgeom/lwpoly.c +579 -0
data/src/liblwgeom/lwsegmentize.c +1047 -0
data/src/liblwgeom/lwutil.c +369 -0
data/src/liblwgeom/measures.c +861 -0
data/src/liblwgeom/postgis_config.h +93 -0
data/src/liblwgeom/ptarray.c +847 -0
data/src/liblwgeom/vsprintf.c +179 -0
data/src/liblwgeom/wktparse.h +126 -0
data/src/liblwgeom/wktparse.lex +74 -0
data/src/liblwgeom/wktparse.tab.c +2353 -0
data/src/liblwgeom/wktparse.tab.h +145 -0
data/src/liblwgeom/wktparse.y +385 -0
data/src/libsqlite3_geocoder/Makefile +22 -0
data/src/libsqlite3_geocoder/Makefile.nix +15 -0
data/src/libsqlite3_geocoder/Makefile.redhat +15 -0
data/src/libsqlite3_geocoder/extension.c +121 -0
data/src/libsqlite3_geocoder/extension.h +13 -0
data/src/libsqlite3_geocoder/levenshtein.c +42 -0
data/src/libsqlite3_geocoder/metaphon.c +278 -0
data/src/libsqlite3_geocoder/util.c +37 -0
data/src/libsqlite3_geocoder/wkb_compress.c +54 -0
data/src/metaphone/Makefile +7 -0
data/src/metaphone/README +49 -0
data/src/metaphone/extension.c +37 -0
data/src/metaphone/metaphon.c +251 -0
data/src/shp2sqlite/Makefile +37 -0
data/src/shp2sqlite/Makefile.nix +36 -0
data/src/shp2sqlite/Makefile.redhat +35 -0
data/src/shp2sqlite/dbfopen.c +1595 -0
data/src/shp2sqlite/getopt.c +695 -0
data/src/shp2sqlite/getopt.h +127 -0
data/src/shp2sqlite/shapefil.h +500 -0
data/src/shp2sqlite/shp2sqlite.c +1974 -0
data/src/shp2sqlite/shpopen.c +1894 -0
data/tests/address.rb +236 -0
data/tests/benchmark.rb +20 -0
data/tests/constants.rb +57 -0
data/tests/data/address-sample.csv +52 -0
data/tests/data/db-test.csv +57 -0
data/tests/data/locations.csv +4 -0
data/tests/database.rb +137 -0
data/tests/generate.rb +34 -0
data/tests/numbers.rb +46 -0
data/tests/run.rb +11 -0
metadata +237 -0

data/demos/simpledemo/ws.rb ADDED Viewed

@@ -0,0 +1,83 @@
+require 'rubygems'
+require 'sinatra'
+require 'geocoder/us/database'
+require 'fastercsv'
+require 'json'
+# Serve the demo on port 8080.
+set :port, 8080
+# Geocoder database handle used by every route below; the path is hard-coded.
+@@db = Geocoder::US::Database.new("/fortiusone/geocoder/geocoder.db")
+get '/' do
+  unless params[:address].nil?
+    @records = @@db.geocode params[:address]
+  end
+  case params[:format]
+  when /xml/
+    builder :index
+  when /atom/
+    builder :atom
+  else
+    erb :index
+  end
+end
+require 'open-uri'
+get '/link.:format' do
+  if(params.include?(:url))
+	csv_file = params[:url]
+  else
+  csv_file = "uploads/#{params[:filename]}.csv"
+end
+  csv = FasterCSV.parse(open(csv_file))
+  headers = csv[0]
+  @records = csv.collect do |record|
+    next if record == headers
+    begin
+      (@@db.geocode record[1]).first
+    rescue Exception => e
+      puts e.message
+      next
+    end
+  end.compact
+  case params[:format]
+  when /atom/
+    builder :atom
+  when /xml/
+    builder :index
+  else
+    erb :index
+  end
+end
+post '/batch' do
+  csv_file = request.env["rack.input"].read
+  csv = FasterCSV.parse(csv_file, :row_sep => "*", :col_sep => "|")
+  headers = csv[0]
+  @records = csv.collect do |record|
+  next if record == headers
+    begin
+      (@@db.geocode record[1]).first.merge(headers[0] => record[0])
+    rescue Exception => e
+      puts e.message
+    next
+    end
+     end.compact
+  case params[:format]
+  when /xml/
+    builder :index
+  when /atom/
+    builder :atom
+  when /json/
+    @records.to_json
+  else
+    erb :index
+  end
+end

data/doc/Makefile ADDED Viewed

@@ -0,0 +1,7 @@
+# Build the HTML documentation from the reStructuredText sources.
+# `all` and `clean` name commands, not files, so declare them phony; a stray
+# file called "all" or "clean" would otherwise stop them from running.
+.PHONY: all clean
+# The recipe redirects rst2html's output straight into the target, so a
+# failed run would leave a truncated .html that looks up to date. Have make
+# delete the target when its recipe fails.
+.DELETE_ON_ERROR:
+all: lookup.html parsing.html
+# Render each .rst to .html; the stylesheet is a prerequisite so that editing
+# it triggers a rebuild.
+%.html: %.rst voidspace.css
+	rst2html --stylesheet-path=voidspace.css --no-compact-lists $< > $@
+clean:
+	rm -f *.html

data/doc/html4css1.css ADDED Viewed

@@ -0,0 +1,279 @@
+/*
+:Author: David Goodger
+:Contact: goodger@users.sourceforge.net
+:Date: $Date: 2005-12-18 01:56:14 +0100 (Sun, 18 Dec 2005) $
+:Revision: $Revision: 4224 $
+:Copyright: This stylesheet has been placed in the public domain.
+Default cascading style sheet for the HTML output of Docutils.
+See http://docutils.sf.net/docs/howto/html-stylesheets.html for how to
+customize this style sheet.
+*/
+/* used to remove borders from tables and images */
+.borderless, table.borderless td, table.borderless th {
+  border: 0 }
+table.borderless td, table.borderless th {
+  /* Override padding for "table.docutils td" with "! important".
+     The right padding separates the table cells. */
+  padding: 0 0.5em 0 0 ! important }
+.first {
+  /* Override more specific margin styles with "! important". */
+  margin-top: 0 ! important }
+.last, .with-subtitle {
+  margin-bottom: 0 ! important }
+.hidden {
+  display: none }
+a.toc-backref {
+  text-decoration: none ;
+  color: black }
+blockquote.epigraph {
+  margin: 2em 5em ; }
+dl.docutils dd {
+  margin-bottom: 0.5em }
+/* Uncomment (and remove this text!) to get bold-faced definition list terms
+dl.docutils dt {
+  font-weight: bold }
+*/
+div.abstract {
+  margin: 2em 5em }
+div.abstract p.topic-title {
+  font-weight: bold ;
+  text-align: center }
+div.admonition, div.attention, div.caution, div.danger, div.error,
+div.hint, div.important, div.note, div.tip, div.warning {
+  margin: 2em ;
+  border: medium outset ;
+  padding: 1em }
+div.admonition p.admonition-title, div.hint p.admonition-title,
+div.important p.admonition-title, div.note p.admonition-title,
+div.tip p.admonition-title {
+  font-weight: bold ;
+  font-family: sans-serif }
+div.attention p.admonition-title, div.caution p.admonition-title,
+div.danger p.admonition-title, div.error p.admonition-title,
+div.warning p.admonition-title {
+  color: red ;
+  font-weight: bold ;
+  font-family: sans-serif }
+/* Uncomment (and remove this text!) to get reduced vertical space in
+   compound paragraphs.
+div.compound .compound-first, div.compound .compound-middle {
+  margin-bottom: 0.5em }
+div.compound .compound-last, div.compound .compound-middle {
+  margin-top: 0.5em }
+*/
+div.dedication {
+  margin: 2em 5em ;
+  text-align: center ;
+  font-style: italic }
+div.dedication p.topic-title {
+  font-weight: bold ;
+  font-style: normal }
+div.figure {
+  margin-left: 2em ;
+  margin-right: 2em }
+div.footer, div.header {
+  clear: both;
+  font-size: smaller }
+div.line-block {
+  display: block ;
+  margin-top: 1em ;
+  margin-bottom: 1em }
+div.line-block div.line-block {
+  margin-top: 0 ;
+  margin-bottom: 0 ;
+  margin-left: 1.5em }
+div.sidebar {
+  margin-left: 1em ;
+  border: medium outset ;
+  padding: 1em ;
+  background-color: #ffffee ;
+  width: 40% ;
+  float: right ;
+  clear: right }
+div.sidebar p.rubric {
+  font-family: sans-serif ;
+  font-size: medium }
+div.system-messages {
+  margin: 5em }
+div.system-messages h1 {
+  color: red }
+div.system-message {
+  border: medium outset ;
+  padding: 1em }
+div.system-message p.system-message-title {
+  color: red ;
+  font-weight: bold }
+div.topic {
+  margin: 2em }
+h1.section-subtitle, h2.section-subtitle, h3.section-subtitle,
+h4.section-subtitle, h5.section-subtitle, h6.section-subtitle {
+  margin-top: 0.4em }
+h1.title {
+  text-align: center }
+h2.subtitle {
+  text-align: center }
+hr.docutils {
+  width: 75% }
+img.align-left {
+  clear: left }
+img.align-right {
+  clear: right }
+ol.simple, ul.simple {
+  margin-bottom: 1em }
+ol.arabic {
+  list-style: decimal }
+ol.loweralpha {
+  list-style: lower-alpha }
+ol.upperalpha {
+  list-style: upper-alpha }
+ol.lowerroman {
+  list-style: lower-roman }
+ol.upperroman {
+  list-style: upper-roman }
+p.attribution {
+  text-align: right ;
+  margin-left: 50% }
+p.caption {
+  font-style: italic }
+p.credits {
+  font-style: italic ;
+  font-size: smaller }
+p.label {
+  white-space: nowrap }
+p.rubric {
+  font-weight: bold ;
+  font-size: larger ;
+  color: maroon ;
+  text-align: center }
+p.sidebar-title {
+  font-family: sans-serif ;
+  font-weight: bold ;
+  font-size: larger }
+p.sidebar-subtitle {
+  font-family: sans-serif ;
+  font-weight: bold }
+p.topic-title {
+  font-weight: bold }
+pre.address {
+  margin-bottom: 0 ;
+  margin-top: 0 ;
+  font-family: serif ;
+  font-size: 100% }
+pre.literal-block, pre.doctest-block {
+  margin-left: 2em ;
+  margin-right: 2em ;
+  background-color: #eeeeee }
+span.classifier {
+  font-family: sans-serif ;
+  font-style: oblique }
+span.classifier-delimiter {
+  font-family: sans-serif ;
+  font-weight: bold }
+span.interpreted {
+  font-family: sans-serif }
+span.option {
+  white-space: nowrap }
+span.pre {
+  white-space: pre }
+span.problematic {
+  color: red }
+span.section-subtitle {
+  /* font-size relative to parent (h1..h6 element) */
+  font-size: 80% }
+table.citation {
+  border-left: solid 1px gray;
+  margin-left: 1px }
+table.docinfo {
+  margin: 2em 4em }
+table.docutils {
+  margin-top: 0.5em ;
+  margin-bottom: 0.5em }
+table.footnote {
+  border-left: solid 1px black;
+  margin-left: 1px }
+table.docutils td, table.docutils th,
+table.docinfo td, table.docinfo th {
+  padding-left: 0.5em ;
+  padding-right: 0.5em ;
+  vertical-align: top }
+table.docutils th.field-name, table.docinfo th.docinfo-name {
+  font-weight: bold ;
+  text-align: left ;
+  white-space: nowrap ;
+  padding-left: 0 }
+h1 tt.docutils, h2 tt.docutils, h3 tt.docutils,
+h4 tt.docutils, h5 tt.docutils, h6 tt.docutils {
+  font-size: 100% }
+tt.docutils {
+  background-color: #eeeeee }
+ul.auto-toc {
+  list-style-type: none }

data/doc/lookup.rst ADDED Viewed

@@ -0,0 +1,193 @@
+.. _lookup:
+===================================
+Geocoder.us Address Lookup Strategy
+===================================
+:Author: Schuyler Erle
+:Contact: schuyler at geocoder dot us
+:Created: 2009/03/13
+:Edited: 2009/03/14
+Definitions
+-----------
+Edge
+  Database representation of a street segment, consisting of a linestring
+  geometry and an edge ID. An edge relates to many ranges and many features
+  through its ID.
+Feature
+  Database representation of a named street, consisting of street name
+  and modifier elements, a reference ZIP code, and a primary/alternate flag.
+Range
+  Database representation of a range of address numbers on a given
+  street, consisting of range start and end numbers, an optional prefix
+  ending with a non-numeric character, and a delivery ZIP code for that
+  range.
+Place
+  Database representation of a ZIP code, consisting of a city name,
+  state abbreviation, a ZIP code, and a primary/alternate flag.
+Address record
+  A set consisting of exactly one edge, one feature, and one range, related
+  through the edge ID.
+Address query
+  An ordered set of {Number Prefix, Number, Directional Prefix, Type Prefix,
+  Qualifier Prefix, Street Name, Qualifier Suffix, Type Suffix, Directional
+  Suffix, City, State, ZIP}. All of the elements are optional except Number and
+  Street Name. Either ZIP or City must also be present. The State element
+  and all of the prefix and suffix elements are assumed to be normalized to
+  standard postal abbreviations.
+Address string
+  A string including some or all of the elements of an address.
+Address Lookup Strategy
+-----------------------
+1. Given an address query, initialize an empty set of candidate places,
+   and an empty set of candidate address records.
+#. If a ZIP was given, look up `the place from the ZIP`_, and add the
+   place, if any, to the candidate place set.
+#. If a city was given, look up all `the places matching the metaphone hash
+   of the city name`_, and add them, if any, to the candidate place set.
+#. Generate a unique set of ZIPs from the set of candidate places, since a ZIP
+   may have one or more names associated with it.
+#. Generate `a list of candidate address records`_ by fetching all the street
+   features matching the metaphone hash of the street name and one of the ZIPs
+   in the query set, along with the ranges matching the edge ID of each
+   feature, where the given number is in the range. The edge does not
+   need to be fetched yet.
+#. If the lookup generates no results, optionally generate `more candidate
+   records`_ by looking up all the street features matching the metaphone hash
+   of the street name, along with the ranges matching the edge ID of each
+   feature, where the given number is in the range. This may be a very
+   time-consuming database query, because some street names are quite common.
+#. Score each of the candidate records as follows:
+   a. Score one point for every provided element of the address query that it
+      matches exactly.
+   #. Optionally, compute the scaled Damerau-Levenshtein distance (or
+      alternately the simple Levenshtein distance) between each provided
+      element of the address query and the corresponding element in the
+      candidate. Score one minus the scaled distance, which yields a fraction
+      of a point.
+   #. Score one point if the parity of the starting range number matches the
+      parity of the queried address number.
+   #. Note that the maximum possible score is equal to the number of provided
+      elements in the address query. Divide the score by the maximum possible.
+      This is the confidence value of the candidate.
+#. Sort the candidate address records by confidence. Retain only the records
+   that share the highest confidence as candidates.
+#. Fetch `the edges and primary feature names`_ matching the edge IDs of
+   the remaining candidate address records.
+#. For each remaining candidate record:
+   a. Replace the candidate record feature elements with those of the
+      primary feature name for that edge.
+   #. Fetch `all of the ranges for the edge ID`_ of the candidate, sorted by
+      starting number.
+   #. Compute the sum of the differences of the starting and ending house
+      number for each range. This is the total number width of the edge.
+   #. Take the difference between the candidate starting number and the lowest
+      starting number, add the difference between the queried number and the
+      candidate starting number, and divide by the total number width. This is
+      the interpolation distance.
+   #. Optionally, find the local UTM zone and project the edge into it.
+   #. Find the point along the line at the interpolation distance.
+   #. If the edge was projected, unproject the point.
+   #. Assign the point as the geocoded location of the query to the candidate
+      record.
+#. Construct a set of result ZIPs from the remaining candidates, and look up
+   `the primary name and state for each ZIP`_ in the set. Assign the matching
+   primary city and state to each candidate.
+#. Return the set of candidate records as the result of the query.
+SQL Statements
+--------------
+the place from the ZIP
+~~~~~~~~~~~~~~~~~~~~~~~
+::
+    SELECT * FROM place WHERE zip = '...';
+the places matching the metaphone hash of the city name
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+::
+    SELECT * FROM place WHERE city_phone = metaphone('...');
+a list of candidate address records
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+::
+    SELECT feature.*, range.* FROM feature, range
+        WHERE name_phone = metaphone('...') AND feature.zip IN (...)
+        AND range.tlid = feature.tlid
+        AND fromhn <= ... AND tohn >= ...;
+more candidate records
+~~~~~~~~~~~~~~~~~~~~~~
+::
+    SELECT feature.*, range.* FROM feature, range
+        WHERE name_phone = metaphone('...')
+        AND range.tlid = feature.tlid
+        AND fromhn <= ... AND tohn >= ...;
+the edges and primary feature names
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+::
+    SELECT feature.*, edge.* FROM feature, edge
+        WHERE feature.tlid = ... AND paflag = 'P'
+        AND edge.tlid = feature.tlid;
+    -- or
+    SELECT feature.*, edge.* FROM feature, edge
+        WHERE feature.tlid IN (...)
+        AND paflag = 'P'
+        AND edge.tlid = feature.tlid;
+all of the ranges for the edge ID
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+::
+    SELECT * FROM range WHERE range.tlid = ...;
+    -- or
+    SELECT * FROM range WHERE range.tlid IN (...);
+the primary name and state for each ZIP
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+::
+    SELECT * FROM place WHERE zip IN (...) AND paflag = 'P';
+= 30 =