Indirizzo 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ coverage/
2
+ coverage.data
3
+ *.gem
4
+ .bundle
5
+ Gemfile.lock
6
+ pkg/*
7
+ .rvmrc
@@ -0,0 +1,5 @@
1
+ rvm:
2
+ - 1.9.2
3
+ branches:
4
+ only:
5
+ - master
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
# Resolve gems from the canonical RubyGems index over HTTPS. The legacy
# `source :rubygems` shorthand is deprecated by Bundler because it maps to a
# plain-HTTP endpoint.
source 'https://rubygems.org'

# Take the dependency list from the gemspec so it is declared in one place.
gemspec
@@ -0,0 +1,28 @@
1
# Gem specification for Indirizzo.
#
# Notes on what is deliberately NOT set here:
# * rubygems_version -- RubyGems fills this in when the gem is packaged; the
#   previous hard-coded "1.9.2" was a Ruby version, not a RubyGems version.
# * require_paths is assigned exactly once (it used to be assigned twice).
Gem::Specification.new do |s|
  s.name = %q{Indirizzo}
  s.version = "0.1.0"

  s.authors = [%q{Dave Worth}]
  s.date = %q{2011-12-14}
  s.description = %q{Indirizzo is simply an extraction of the US Street Address parsing code from Geocoder::US}
  s.email = %q{dave@highgroove.com}

  s.homepage = %q{http://github.com/daveworth/indirizzo}
  s.licenses = [%q{LGPL}]
  s.summary = %q{Indirizzo is simply an extraction of the US Street Address parsing code from Geocoder::US}

  # File lists come from git, so only tracked files are packaged.
  s.files = `git ls-files`.split("\n")
  s.test_files = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]
  s.extra_rdoc_files = [
    "LICENSE.txt",
    "README.md"
  ]

  s.add_development_dependency('rake')
  s.add_development_dependency('cover_me')
  s.add_development_dependency('awesome_print')
end
@@ -0,0 +1,165 @@
1
+ GNU LESSER GENERAL PUBLIC LICENSE
2
+ Version 3, 29 June 2007
3
+
4
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
5
+ Everyone is permitted to copy and distribute verbatim copies
6
+ of this license document, but changing it is not allowed.
7
+
8
+
9
+ This version of the GNU Lesser General Public License incorporates
10
+ the terms and conditions of version 3 of the GNU General Public
11
+ License, supplemented by the additional permissions listed below.
12
+
13
+ 0. Additional Definitions.
14
+
15
+ As used herein, "this License" refers to version 3 of the GNU Lesser
16
+ General Public License, and the "GNU GPL" refers to version 3 of the GNU
17
+ General Public License.
18
+
19
+ "The Library" refers to a covered work governed by this License,
20
+ other than an Application or a Combined Work as defined below.
21
+
22
+ An "Application" is any work that makes use of an interface provided
23
+ by the Library, but which is not otherwise based on the Library.
24
+ Defining a subclass of a class defined by the Library is deemed a mode
25
+ of using an interface provided by the Library.
26
+
27
+ A "Combined Work" is a work produced by combining or linking an
28
+ Application with the Library. The particular version of the Library
29
+ with which the Combined Work was made is also called the "Linked
30
+ Version".
31
+
32
+ The "Minimal Corresponding Source" for a Combined Work means the
33
+ Corresponding Source for the Combined Work, excluding any source code
34
+ for portions of the Combined Work that, considered in isolation, are
35
+ based on the Application, and not on the Linked Version.
36
+
37
+ The "Corresponding Application Code" for a Combined Work means the
38
+ object code and/or source code for the Application, including any data
39
+ and utility programs needed for reproducing the Combined Work from the
40
+ Application, but excluding the System Libraries of the Combined Work.
41
+
42
+ 1. Exception to Section 3 of the GNU GPL.
43
+
44
+ You may convey a covered work under sections 3 and 4 of this License
45
+ without being bound by section 3 of the GNU GPL.
46
+
47
+ 2. Conveying Modified Versions.
48
+
49
+ If you modify a copy of the Library, and, in your modifications, a
50
+ facility refers to a function or data to be supplied by an Application
51
+ that uses the facility (other than as an argument passed when the
52
+ facility is invoked), then you may convey a copy of the modified
53
+ version:
54
+
55
+ a) under this License, provided that you make a good faith effort to
56
+ ensure that, in the event an Application does not supply the
57
+ function or data, the facility still operates, and performs
58
+ whatever part of its purpose remains meaningful, or
59
+
60
+ b) under the GNU GPL, with none of the additional permissions of
61
+ this License applicable to that copy.
62
+
63
+ 3. Object Code Incorporating Material from Library Header Files.
64
+
65
+ The object code form of an Application may incorporate material from
66
+ a header file that is part of the Library. You may convey such object
67
+ code under terms of your choice, provided that, if the incorporated
68
+ material is not limited to numerical parameters, data structure
69
+ layouts and accessors, or small macros, inline functions and templates
70
+ (ten or fewer lines in length), you do both of the following:
71
+
72
+ a) Give prominent notice with each copy of the object code that the
73
+ Library is used in it and that the Library and its use are
74
+ covered by this License.
75
+
76
+ b) Accompany the object code with a copy of the GNU GPL and this license
77
+ document.
78
+
79
+ 4. Combined Works.
80
+
81
+ You may convey a Combined Work under terms of your choice that,
82
+ taken together, effectively do not restrict modification of the
83
+ portions of the Library contained in the Combined Work and reverse
84
+ engineering for debugging such modifications, if you also do each of
85
+ the following:
86
+
87
+ a) Give prominent notice with each copy of the Combined Work that
88
+ the Library is used in it and that the Library and its use are
89
+ covered by this License.
90
+
91
+ b) Accompany the Combined Work with a copy of the GNU GPL and this license
92
+ document.
93
+
94
+ c) For a Combined Work that displays copyright notices during
95
+ execution, include the copyright notice for the Library among
96
+ these notices, as well as a reference directing the user to the
97
+ copies of the GNU GPL and this license document.
98
+
99
+ d) Do one of the following:
100
+
101
+ 0) Convey the Minimal Corresponding Source under the terms of this
102
+ License, and the Corresponding Application Code in a form
103
+ suitable for, and under terms that permit, the user to
104
+ recombine or relink the Application with a modified version of
105
+ the Linked Version to produce a modified Combined Work, in the
106
+ manner specified by section 6 of the GNU GPL for conveying
107
+ Corresponding Source.
108
+
109
+ 1) Use a suitable shared library mechanism for linking with the
110
+ Library. A suitable mechanism is one that (a) uses at run time
111
+ a copy of the Library already present on the user's computer
112
+ system, and (b) will operate properly with a modified version
113
+ of the Library that is interface-compatible with the Linked
114
+ Version.
115
+
116
+ e) Provide Installation Information, but only if you would otherwise
117
+ be required to provide such information under section 6 of the
118
+ GNU GPL, and only to the extent that such information is
119
+ necessary to install and execute a modified version of the
120
+ Combined Work produced by recombining or relinking the
121
+ Application with a modified version of the Linked Version. (If
122
+ you use option 4d0, the Installation Information must accompany
123
+ the Minimal Corresponding Source and Corresponding Application
124
+ Code. If you use option 4d1, you must provide the Installation
125
+ Information in the manner specified by section 6 of the GNU GPL
126
+ for conveying Corresponding Source.)
127
+
128
+ 5. Combined Libraries.
129
+
130
+ You may place library facilities that are a work based on the
131
+ Library side by side in a single library together with other library
132
+ facilities that are not Applications and are not covered by this
133
+ License, and convey such a combined library under terms of your
134
+ choice, if you do both of the following:
135
+
136
+ a) Accompany the combined library with a copy of the same work based
137
+ on the Library, uncombined with any other library facilities,
138
+ conveyed under the terms of this License.
139
+
140
+ b) Give prominent notice with the combined library that part of it
141
+ is a work based on the Library, and explaining where to find the
142
+ accompanying uncombined form of the same work.
143
+
144
+ 6. Revised Versions of the GNU Lesser General Public License.
145
+
146
+ The Free Software Foundation may publish revised and/or new versions
147
+ of the GNU Lesser General Public License from time to time. Such new
148
+ versions will be similar in spirit to the present version, but may
149
+ differ in detail to address new problems or concerns.
150
+
151
+ Each version is given a distinguishing version number. If the
152
+ Library as you received it specifies that a certain numbered version
153
+ of the GNU Lesser General Public License "or any later version"
154
+ applies to it, you have the option of following the terms and
155
+ conditions either of that published version or of any later version
156
+ published by the Free Software Foundation. If the Library as you
157
+ received it does not specify a version number of the GNU Lesser
158
+ General Public License, you may choose any version of the GNU Lesser
159
+ General Public License ever published by the Free Software Foundation.
160
+
161
+ If the Library as you received it specifies that a proxy can decide
162
+ whether future versions of the GNU Lesser General Public License shall
163
+ apply, that proxy's public statement of acceptance of any version is
164
+ permanent authorization for you to choose that version for the
165
+ Library.
@@ -0,0 +1,56 @@
1
+ # Introduction
2
+
3
+ Indirizzo is a simple extraction of the Address class (along with the numbers
4
+ and constants handling code) from [Geocommons](http://geocommons.com/)'
5
+ [Geocoder::US 2.0](https://github.com/geocommons/geocoder) gem.
6
+
7
+ [![Build Status](https://secure.travis-ci.org/daveworth/Indirizzo.png)](http://travis-ci.org/daveworth/Indirizzo)
8
+
9
+ ## Background
10
+
11
+ My motivation for creating this extraction is the dearth of high-quality,
12
+ flexible, street address parsing gems available to the Ruby community. After
13
+ digging into Ruby-Toolbox looking for alternatives I came up with tools based on
14
+ the Perl
15
+ [GEO::StreetAddress::US](http://search.cpan.org/~sderle/Geo-StreetAddress-US-0.99/US.pm)
16
+ such as [street\_address](https://github.com/astevens/street_address). The
17
+ street_address gem ended up being much too restrictive for my needs and my
18
+ continued searching brought me to the Geocoder::US gem. Regrettably the
19
+ constraints of needing a SQLite3 database for proper geocoding added overhead to
20
+ my simple needs. I simply need to parse addresses that may, or may not, be
21
+ "complete" or "well-formed". Thus Indirizzo was born.
22
+
23
+ ## Usage
24
+
25
+ ```ruby
26
+ require 'Indirizzo'
27
+ Indirizzo::Address.new("some address")
28
+ ```
29
+
30
+ ## License
31
+
32
+ Indirizzo is a direct derivative of [Geocoder::US 2.0](https://github.com/geocommons/geocoder)
33
+
34
+ Geocoder::US 2.0 was based on earlier work by Schuyler Erle on
35
+ a Perl module of the same name. You can find it at
36
+ [http://search.cpan.org/~sderle/](http://search.cpan.org/~sderle/).
37
+
38
+ Geocoder::US 2.0 was written by Schuyler Erle, of Entropy Free LLC,
39
+ with the gracious support of FortiusOne, Inc. Please send bug reports,
40
+ patches, kudos, etc. to patches at geocoder.us.
41
+
42
+ Copyright (c) 2009 FortiusOne, Inc.
43
+
44
+ This program is free software: you can redistribute it and/or modify
45
+ it under the terms of the GNU General Public License as published by
46
+ the Free Software Foundation, either version 3 of the License, or
47
+ (at your option) any later version.
48
+
49
+ This program is distributed in the hope that it will be useful,
50
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
51
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
52
+ GNU General Public License for more details.
53
+
54
+ You should have received a copy of the GNU General Public License
55
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
56
+
@@ -0,0 +1,31 @@
1
require 'rake/testtask'
require 'bundler/gem_tasks'
require 'bundler'

# Activate the :default and :development gem groups before anything else is
# loaded. If Bundler cannot satisfy them, print its message plus a hint and
# exit with Bundler's own status code.
begin
  Bundler.setup(:default, :development)
rescue Bundler::BundlerError => setup_error
  $stderr.puts setup_error.message
  $stderr.puts "Run `bundle install` to install missing gems"
  exit setup_error.status_code
end
require 'rake'

# `rake test` runs every test/**/test_*.rb with lib/ and test/ on the load path.
Rake::TestTask.new(:test) do |t|
  t.libs << 'lib' << 'test'
  t.pattern = 'test/**/test_*.rb'
  t.verbose = true
end

task :default => [:test]

namespace :cover_me do
  desc "Generates and opens code coverage report."
  task(:report) do
    # Loaded lazily so cover_me is only required when a report is requested.
    require 'cover_me'
    CoverMe.complete!
  end
end

# Extra action appended to the :test task defined above: once the tests have
# run, produce the coverage report.
task(:test) { Rake::Task['cover_me:report'].invoke }
data/VERSION ADDED
@@ -0,0 +1 @@
1
+ 0.1.0
@@ -0,0 +1 @@
1
+ require 'Indirizzo/Address'
@@ -0,0 +1,286 @@
1
+ require 'indirizzo/constants'
2
+
3
+ module Indirizzo
4
# Defines the matching of parsed address tokens.
#
# Maps a token kind to the Regexp used to pull that token out of a cleaned
# address string:
#   :number - optional prefix, digits and optional one-letter suffix at the
#             start of a line (three capture groups)
#   :street - a run of words and/or numbers
#   :city   - a run of words
#   :state  - State.regexp anchored to the end of a line
#   :zip    - five digits at the end of a line; an optional "-dddd" extension
#             is matched but NOT captured
#   :at     - intersection connectors ("at", "@", "and", "&") between spaces
#   :po_box - "PO Box" and spelling variants
Match = {
  # FIXME: shouldn't have to anchor :number and :zip at start/end
  :number => /^(\d+\W|[a-z]+)?(\d+)([a-z]?)\b/io,
  :street => /(?:\b(?:\d+\w*|[a-z'-]+)\s*)+/io,
  :city => /(?:\b[a-z'-]+\s*)+/io,
  # The suffix is single-quoted on purpose: inside a double-quoted Ruby string
  # "\s" is an escape for a literal space, so "\s*$" would only tolerate
  # trailing spaces rather than any trailing whitespace.
  :state => Regexp.new(State.regexp.source + '\s*$', Regexp::IGNORECASE),
  :zip => /(\d{5})(?:-\d{4})?\s*$/o,
  :at => /\s(at|@|and|&)\s/io,
  :po_box => /\b[P|p]*(OST|ost)*\.*\s*[O|o|0]*(ffice|FFICE)*\.*\s*[B|b][O|o|0][X|x]\b/
}
15
+
16
+ # The Address class takes a US street address or place name and
17
+ # constructs a list of possible structured parses of the address
18
+ # string.
19
+ class Address
20
# Readers/writers for the address text and each parsed component. Note that
# the generated city= writer is replaced by the custom city= defined further
# down in this class.
attr_accessor :text,
              :prenum, :number, :sufnum,
              :street,
              :city,
              :state,
              :zip, :plus4
26
+
27
# Takes an address or place name as its sole argument.
#
# text - either a String holding the raw address, which is cleaned and then
#        parsed, or a Hash of address components, which is handed to
#        assign_text_to_address. Any Hash subclass is treated as a Hash.
#
# Raises ArgumentError when text is nil, false or empty.
def initialize(text)
  raise ArgumentError, "no text provided" unless text and !text.empty?
  # is_a? rather than an exact class comparison, so that Hash subclasses take
  # the component path instead of being passed to String-only cleaning code.
  if text.is_a?(Hash)
    @text = ""
    assign_text_to_address text
  else
    @text = clean text
    parse
  end
end
38
+
39
# Normalizes a raw address string: trims surrounding whitespace, deletes
# every run of characters other than letters, digits, space, comma,
# apostrophe, "&", "@", "/" and "-", then collapses each whitespace run into
# a single space. Returns a new String.
def clean(value)
  trimmed = value.strip
  filtered = trimmed.gsub(/[^a-z0-9 ,'&@\/-]+/io, "")
  filtered.gsub(/\s+/o, " ")
end
45
+
46
# Populates the address from a Hash of components.
#
# text - Hash; the keys read here are :address, :prenum, :sufnum, :street,
#        :number, :city, :region, :country, :state, :postal_code and :plus4.
#
# When :address is present it is cleaned and parsed like a plain string and
# every other key is ignored. Otherwise each field is filled from its key:
# * the street is tokenised with Match[:street]; when no :number is given,
#   the house number is pulled off the front of the street text if one is
#   there (a street without a number leaves @number as "")
# * the state comes from :region, else :country, else :state; a :region
#   longer than two characters is looked up through State
# * a missing :postal_code resets both @zip and @plus4 to ""
def assign_text_to_address(text)
  if !text[:address].nil?
    @text = clean text[:address]
    parse
  else
    @prenum = text[:prenum]
    @sufnum = text[:sufnum]
    @street = text[:street].nil? ? [] : text[:street].scan(Match[:street])
    @number = ""
    if text[:number].nil?
      @street.map! do |single_street|
        single_street.downcase!
        # scan yields no match at all when the street carries no leading
        # number; in that case leave @number and the street text alone.
        number_parts = single_street.scan(Match[:number])[0]
        if number_parts
          @number = number_parts.reject { |n| n.nil? || n.empty? }.first.to_s
          single_street.sub!(@number, "")
        end
        single_street.sub!(/^\s*,?\s*/o, "")
        single_street
      end
    else
      @number = text[:number].to_s
    end
    @street = expand_streets(@street)
    # Return value is unused; the call is kept from the original flow.
    street_parts

    @city = []
    if !text[:city].nil?
      @city.push(text[:city])
      @text = text[:city].to_s
    else
      @city.push("")
    end

    if !text[:region].nil?
      @state = text[:region]
      # Anything longer than a two-letter code is resolved through State.
      @state = State[@state] if @state.length > 2
    elsif !text[:country].nil?
      @state = text[:country]
    elsif !text[:state].nil?
      @state = text[:state]
    end

    @zip = text[:postal_code]
    @plus4 = text[:plus4]
    if !@zip
      @zip = @plus4 = ""
    end
  end
end
99
+
100
# Expands the first number found in a string into its alternative spellings.
#
# The string is searched, in this order, for a numeral with an optional
# ordinal suffix ("2", "2nd"), a word known to Ordinals, or a word known to
# Cardinals. When the number found is below 100 the result holds three copies
# of the string with that token replaced by the numeral, the Ordinals form
# and the Cardinals form respectively. Otherwise the result is a one-element
# array containing the string unchanged.
def expand_numbers(string)
  match = nil
  num = nil
  if (found = /\b\d+(?:st|nd|rd|th)?\b/o.match(string))
    match = found[0]
    num = match.to_i
  elsif (found = Ordinals.regexp.match(string))
    match = found[0]
    num = Ordinals[match]
  elsif (found = Cardinals.regexp.match(string))
    match = found[0]
    num = Cardinals[match]
  end

  if num && num < 100
    [num.to_s, Ordinals[num], Cardinals[num]].map do |replacement|
      string.sub(match, replacement)
    end
  else
    [string]
  end
end
124
+
125
# Removes a matched ZIP code from the end of text and records it.
#
# regex_match - the exact substring that Match[:zip] matched
# text        - the working address string; it is modified in place
#
# Expects @zip to hold the capture array from the ZIP scan. Sets @zip to the
# five-digit code and @plus4 to the four-digit extension when the matched
# text carries one (nil otherwise). Match[:zip] matches the extension without
# capturing it, so it is recovered from regex_match here.
#
# Returns text with the ZIP and any trailing comma/whitespace removed.
def parse_zip(regex_match, text)
  idx = text.rindex(regex_match)
  text[idx...idx + regex_match.length] = ""
  text.sub!(/\s*,?\s*$/o, "")
  zip, plus4 = @zip.map { |s| s and s.strip }
  @zip = zip
  # Prefer a captured extension if the pattern ever supplies one; otherwise
  # read it from the matched substring.
  @plus4 = plus4 || regex_match[/-(\d{4})\s*\z/, 1]
  text
end
132
+
133
# Removes a matched state from the end of text and records it.
#
# regex_match - the exact substring that the state pattern matched
# text        - the working address string; it is modified in place
#
# Expects @state to hold the capture array from the state scan. Stores the
# stripped first capture in @full_state and replaces @state with the result
# of looking that value up in State. Returns text without the state and any
# trailing comma/whitespace.
def parse_state(regex_match, text)
  start = text.rindex(regex_match)
  finish = start + regex_match.length
  text[start...finish] = ""
  text.sub!(/\s*,?\s*$/o, "")
  @full_state = @state[0].strip # special case: New York
  @state = State[@full_state]
  text
end
141
+
142
# Removes a matched house number from the front of text and records it.
#
# regex_match - the exact substring that Match[:number] matched
# text        - the working address string; it is modified in place
#
# Expects @number to hold the three captures from the number scan and
# splits them into @prenum, @number and @sufnum (each stripped; nil captures
# stay nil). Returns text without the number and any leading
# comma/whitespace.
def parse_number(regex_match, text)
  # FIXME: What if this string appears twice?
  start = text.index(regex_match)
  finish = start + regex_match.length
  text[start...finish] = ""
  text.sub!(/^\s*,?\s*/o, "")
  @prenum, @number, @sufnum = @number.map { |part| part && part.strip }
  text
end
150
+
151
# Splits @text into its components, working on a lower-cased copy.
#
# The ZIP is taken off the end first, then the state, then the house number
# off the front. Whatever text is left is scanned for both street and city
# candidates. A field that is not found is set to "" (for ZIP, state and
# number parts) or [] (for street and city).
def parse
  text = @text.clone.downcase

  zip_hits = text.scan(Match[:zip])
  @zip = zip_hits.last
  if @zip.nil?
    @zip = @plus4 = ""
  else
    # $& still holds the text of the last match made by the scan above.
    text = parse_zip($&, text)
  end

  state_hits = text.scan(Match[:state])
  @state = state_hits.last
  if @state.nil?
    @full_state = ""
    @state = ""
  else
    text = parse_state($&, text)
  end

  number_hits = text.scan(Match[:number])
  @number = number_hits.first
  # FIXME: 230 Fish And Game Rd, Hudson NY 12534
  if @number.nil? # and not intersection?
    @prenum = @number = @sufnum = ""
  else
    text = parse_number($&, text)
  end

  # FIXME: special case: Name_Abbr gets a bit aggressive
  # about replacing St with Saint. exceptional case:
  # Sault Ste. Marie

  # FIXME: PO Box should geocode to ZIP
  @street = expand_streets(text.scan(Match[:street]))
  # SPECIAL CASE: 1600 Pennsylvania 20050
  if @street.empty? && @state.downcase != @full_state.downcase
    @street << @full_state
  end

  city_hits = text.scan(Match[:city])
  if city_hits.empty?
    @city = []
  else
    # Only the last candidate is kept, together with its Name_Abbr expansion.
    last_city = city_hits.last.strip
    expanded = last_city.gsub(Name_Abbr.regexp) { |m| Name_Abbr[m] }
    @city = ([last_city] | [expanded]).map { |s| s.downcase }.uniq
  end

  # SPECIAL CASE: no city, but a state with the same name. e.g. "New York"
  if @state.downcase != @full_state.downcase
    @city << @full_state
  end
end
201
+
202
# Expands a list of street strings into all of their spelling variants.
#
# street - Array of Strings. It is not modified, so frozen arrays and arrays
#          still used by the caller are safe to pass in.
#
# Each entry is stripped, then the list is extended with the Name_Abbr and
# Std_Abbr substitutions of every entry, every entry is run through
# expand_numbers, and the result is lower-cased and de-duplicated.
#
# Returns a new Array; it is empty when the input is empty or its first
# element is nil.
def expand_streets(street)
  return [] if street.empty? || street[0].nil?

  expanded = street.map { |s| s.strip }
  expanded |= expanded.map { |item| item.gsub(Name_Abbr.regexp) { |m| Name_Abbr[m] } }
  expanded |= expanded.map { |item| item.gsub(Std_Abbr.regexp) { |m| Std_Abbr[m] } }
  expanded.map { |item| expand_numbers(item) }.flatten.map { |s| s.downcase }.uniq
end
218
+
219
# Lists every contiguous run of whitespace-separated words found in the
# @street entries, filtered through remove_noise_words. When every remaining
# string is a key of Std_Abbr or Name_Abbr, @number is appended as one more
# candidate. Returns the de-duplicated list.
def street_parts
  candidates = @street.inject([]) do |found, string|
    tokens = string.split(" ")
    runs = (0...tokens.length).map do |first|
      (first...tokens.length).map { |last| tokens[first..last].join(" ") }
    end
    found | runs.flatten
  end
  candidates = remove_noise_words(candidates)

  # Try a simpler case of adding the @number in case everything is an abbr.
  only_abbreviations = candidates.all? { |s| Std_Abbr.key?(s) || Name_Abbr.key?(s) }
  candidates += [@number] if only_abbreviations
  candidates.uniq
end
233
+
234
# Strips leading/trailing directionals and street-type words from each string.
#
# strings - Array of Strings; neither the array nor its elements are modified
#           (frozen strings are accepted).
#
# For every string a leading Directional, a trailing Directional, a leading
# Prefix_Type and a trailing Suffix_Type are removed, in that order, together
# with the whitespace next to them. Strings that end up empty are dropped.
#
# Returns the stripped list, or the original list when stripping would leave
# nothing at all.
#
# NOTE: no further filtering on Std_Abbr/Name_Abbr is applied. The previous
# code passed such a filter as a block to `empty?`, which ignores blocks, so
# it never ran; restricting results to non-abbreviations is known to break
# inputs like "Prairie St" or "Front St".
def remove_noise_words(strings)
  # '\s*' is single-quoted so the regexp engine sees the whitespace class; in
  # a double-quoted string "\s" is just a literal space.
  prefix = Regexp.new("^" + Prefix_Type.regexp.source + '\s*', Regexp::IGNORECASE)
  suffix = Regexp.new('\s*' + Suffix_Type.regexp.source + "$", Regexp::IGNORECASE)
  predxn = Regexp.new("^" + Directional.regexp.source + '\s*', Regexp::IGNORECASE)
  sufdxn = Regexp.new('\s*' + Directional.regexp.source + "$", Regexp::IGNORECASE)

  good_strings = strings.map do |s|
    s.gsub(predxn, "").gsub(sufdxn, "").gsub(prefix, "").gsub(suffix, "")
  end
  good_strings.reject! { |s| s.empty? }

  good_strings.empty? ? strings : good_strings
end
254
+
255
# Lists every contiguous run of whitespace-separated words found in the
# @city entries, starting with the runs that begin at the last word and
# working back towards the first. Strings that are keys of Std_Abbr are
# dropped unless that would leave nothing. Returns the de-duplicated list.
def city_parts
  candidates = []
  @city.each do |string|
    tokens = string.split(" ")
    runs = (tokens.length - 1).downto(0).map do |first|
      (first...tokens.length).map { |last| tokens[first..last].join(" ") }
    end
    candidates |= runs.flatten
  end

  # Don't return strings that consist solely of abbreviations.
  # NOTE: Is this a micro-optimization that has edge cases that will break?
  # Answer: Yes, it breaks on "Prairie"
  without_abbreviations = candidates.reject { |s| Std_Abbr.key?(s) }
  candidates = without_abbreviations unless without_abbreviations.empty?
  candidates.uniq
end
269
+
270
# Removes a trailing city name from every @street candidate.
#
# strings - Array of city name Strings
#
# The names are first run through expand_streets so abbreviated spellings are
# covered too (fix for "Mountain View" -> "Mountain Vw"). Each name is
# matched literally: it is regexp-escaped before being put into the pattern,
# so punctuation in a city name can neither raise a RegexpError nor match
# unrelated text. Street candidates that become empty are dropped.
#
# This writer only rewrites @street; it does not assign @city.
def city=(strings)
  # NOTE: This will still fail on: 100 Broome St, 33333 (if 33333 is
  # Broome, MT or what)
  strings = expand_streets(strings)
  alternatives = strings.map { |s| Regexp.escape(s) }.join("|")
  match = Regexp.new('\s*\b(?:' + alternatives + ')\b\s*$', Regexp::IGNORECASE)
  @street = @street.map { |string| string.gsub(match, '') }.select { |s| !s.empty? }
end
277
+
278
# True when @text matches Match[:po_box], false otherwise.
def po_box?
  Match[:po_box].match(@text) ? true : false
end
281
+
282
# True when @text contains one of the connectors in Match[:at] ("at", "@",
# "and", "&") surrounded by whitespace, false otherwise.
def intersection?
  Match[:at].match(@text) ? true : false
end
285
+ end
286
+ end